Skip to content

Commit a87f8f7

Browse files
authored
minor improvements to PG&E scraper (#95)
* actually skip downloads when skipping them
* pge: use mobile site for login; it seems to work more reliably
* more PG&E fixes:
  * don't wait for page reload after entering credentials; the website is now an SPA and does not reload
  * use CSS selectors rather than link text to find billing links, for a significant speedup
1 parent dca4505 commit a87f8f7

File tree

1 file changed

+7
-11
lines changed

1 file changed

+7
-11
lines changed

finance_dl/pge.py

+7-11
Original file line numberDiff line numberDiff line change
@@ -104,15 +104,15 @@ def login(self):
104104
if self.logged_in:
105105
return
106106
logger.info('Initiating log in')
107-
self.driver.get('https://www.pge.com/en/myhome/myaccount/index.page')
107+
self.driver.get('https://m.pge.com/')
108108

109109
(username, password), = self.wait_and_return(
110110
self.find_username_and_password_in_any_frame)
111111
logger.info('Entering username and password')
112112
username.send_keys(self.credentials['username'])
113113
password.send_keys(self.credentials['password'])
114-
with self.wait_for_page_load():
115-
password.send_keys(Keys.ENTER)
114+
password.send_keys(Keys.ENTER)
115+
self.wait_and_return(lambda: self.find_visible_elements(By.ID, 'arrowBillPaymentHistory'))
116116
logger.info('Logged in')
117117
self.logged_in = True
118118

@@ -136,7 +136,7 @@ def process_download(self, download_result, output_dir):
136136
new_path = self.get_output_path(output_dir, date)
137137
if os.path.exists(new_path):
138138
logger.info('Skipping duplicate download: %s', date)
139-
return True
139+
return False
140140
tmp_path = new_path.replace('.pdf', '.tmp.pdf')
141141
with open(tmp_path, 'wb') as f:
142142
download_data = download_result[1]
@@ -157,15 +157,11 @@ def get_bills(self, output_dir):
157157
actions.send_keys(Keys.ESCAPE)
158158
actions.perform()
159159
logger.info('Looking for download link')
160-
(bills_link, ), = self.wait_and_return(
161-
lambda: self.find_visible_elements_by_descendant_partial_text('BILL & PAYMENT HISTORY', 'h2'))
160+
(bills_link, ), = self.wait_and_return(lambda: self.find_visible_elements(By.ID, 'arrowBillPaymentHistory'))
162161
scrape_lib.retry(lambda: self.click(bills_link), retry_delay=2)
163-
(more_link, ), = self.wait_and_return(
164-
lambda: self.find_visible_elements_by_descendant_partial_text('View up to 24 months of activity', 'a'))
162+
(more_link, ), = self.wait_and_return(lambda: self.find_visible_elements(By.ID, 'href-view-24month-history'))
165163
scrape_lib.retry(lambda: self.click(more_link), retry_delay=2)
166-
links, = self.wait_and_return(
167-
lambda: self.find_visible_elements(By.PARTIAL_LINK_TEXT, "View Bill PDF")
168-
)
164+
links, = self.wait_and_return(lambda: self.find_visible_elements(By.CSS_SELECTOR, ".utag-bill-history-view-bill-pdf"))
169165

170166
for link in links:
171167
if not self.do_download_from_link(link, output_dir) and self.stop_early:

0 commit comments

Comments
 (0)