minor improvements to PG&E scraper (#95)

jktomer · web-flow · commit a87f8f71bea5 · 2024-04-16T20:25:04.000-07:00
* actually skip downloads when skipping them
* pge: use mobile site for login; it seems to work more reliably
* more PG&E fixes:
* don't wait for page reload after entering credentials; website is now an SPA
and does not reload
* use CSS selectors rather than link text to find billing links for
significant speedup
diff --git a/finance_dl/pge.py b/finance_dl/pge.py
@@ -104,15 +104,15 @@ def login(self):
         if self.logged_in:
             return
         logger.info('Initiating log in')
-        self.driver.get('https://www.pge.com/en/myhome/myaccount/index.page')
+        self.driver.get('https://m.pge.com/')
 
         (username, password), = self.wait_and_return(
             self.find_username_and_password_in_any_frame)
         logger.info('Entering username and password')
         username.send_keys(self.credentials['username'])
         password.send_keys(self.credentials['password'])
-        with self.wait_for_page_load():
-            password.send_keys(Keys.ENTER)
+        password.send_keys(Keys.ENTER)
+        self.wait_and_return(lambda: self.find_visible_elements(By.ID, 'arrowBillPaymentHistory'))
         logger.info('Logged in')
         self.logged_in = True
 
@@ -136,7 +136,7 @@ def process_download(self, download_result, output_dir):
             new_path = self.get_output_path(output_dir, date)
             if os.path.exists(new_path):
                 logger.info('Skipping duplicate download: %s', date)
-                return True
+                return False
             tmp_path = new_path.replace('.pdf', '.tmp.pdf')
             with open(tmp_path, 'wb') as f:
                 download_data = download_result[1]
@@ -157,15 +157,11 @@ def get_bills(self, output_dir):
         actions.send_keys(Keys.ESCAPE)
         actions.perform()
         logger.info('Looking for download link')
-        (bills_link, ), = self.wait_and_return(
-            lambda: self.find_visible_elements_by_descendant_partial_text('BILL & PAYMENT HISTORY', 'h2'))
+        (bills_link, ), = self.wait_and_return(lambda: self.find_visible_elements(By.ID, 'arrowBillPaymentHistory'))
         scrape_lib.retry(lambda: self.click(bills_link), retry_delay=2)
-        (more_link, ), = self.wait_and_return(
-            lambda: self.find_visible_elements_by_descendant_partial_text('View up to 24 months of activity', 'a'))
+        (more_link, ), = self.wait_and_return(lambda: self.find_visible_elements(By.ID, 'href-view-24month-history'))
         scrape_lib.retry(lambda: self.click(more_link), retry_delay=2)
-        links, = self.wait_and_return(
-            lambda: self.find_visible_elements(By.PARTIAL_LINK_TEXT, "View Bill PDF")
-        )
+        links, = self.wait_and_return(lambda: self.find_visible_elements(By.CSS_SELECTOR, ".utag-bill-history-view-bill-pdf"))
 
         for link in links:
             if not self.do_download_from_link(link, output_dir) and self.stop_early: