From 4bacc5c74c244b25d9cba79a1fcee207553bb55e Mon Sep 17 00:00:00 2001 From: arition Date: Thu, 2 Nov 2023 12:43:59 -0400 Subject: [PATCH 1/5] fix csrf token fetching (#92) --- finance_dl/paypal.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/finance_dl/paypal.py b/finance_dl/paypal.py index 31b120d..3148c38 100644 --- a/finance_dl/paypal.py +++ b/finance_dl/paypal.py @@ -175,8 +175,8 @@ def get_csrf_token(self): logging.info('Getting CSRF token') self.driver.get('https://www.paypal.com/myaccount/transactions/') # Get CSRF token - body_element, = self.wait_and_locate((By.ID, "__react_data__")) - attribute_object = json.loads(body_element.get_attribute("data")) + body_element, = self.wait_and_locate((By.ID, "__APP_DATA__")) + attribute_object = json.loads(body_element.get_attribute("innerHTML")) self.csrf_token = attribute_object["_csrf"] return self.csrf_token From 196faef778cebb3ceb598fb9f22b0a494652e266 Mon Sep 17 00:00:00 2001 From: Eugeniu Plamadeala Date: Sun, 12 Nov 2023 22:20:36 -0800 Subject: [PATCH 2/5] venmo: Login fix, month at a time transaction downloads (#84) * Catch up with Sign In page UI changes where the password field is interactable only after the username is submitted. Additionally, use a different mechanism to wait for page loading because it was failing. * venmo: retrieve transactions one month at a time (code by chandler150). --- finance_dl/venmo.py | 83 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 70 insertions(+), 13 deletions(-) diff --git a/finance_dl/venmo.py b/finance_dl/venmo.py index 6f576e5..bff76ea 100644 --- a/finance_dl/venmo.py +++ b/finance_dl/venmo.py @@ -82,7 +82,7 @@ def CONFIG_venmo(): import os import time from selenium.webdriver.common.by import By -from selenium.common.exceptions import NoSuchElementException +from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException, StaleElementReferenceException from selenium.webdriver.support.ui import Select from selenium.webdriver.common.keys import Keys @@ -146,19 +146,67 @@ def __init__(self, credentials, output_directory, def check_after_wait(self): check_url(self.driver.current_url) + def find_venmo_username(self): + for frame in self.for_each_frame(): + try: + return self.driver.find_elements(By.XPATH, '//input[@type="text" or @type="email"]') + except NoSuchElementException: + pass + raise NoSuchElementException() + + def find_venmo_password(self): + for frame in self.for_each_frame(): + try: + return self.driver.find_elements(By.XPATH, '//input[@type="password"]') + except NoSuchElementException: + pass + raise NoSuchElementException() + + def wait_for(self, condition_function): + start_time = time.time() + while time.time() < start_time + 3: + if condition_function(): + return True + else: + time.sleep(0.1) + raise Exception( + 'Timeout waiting for {}'.format(condition_function.__name__) + ) + + def click_through_to_new_page(self, button_text): + link = self.driver.find_element(By.XPATH, f'//button[@name="{button_text}"]') + link.click() + + def link_has_gone_stale(): + try: + # poll the link with an arbitrary call + link.find_elements(By.XPATH, 'doesnt-matter') + return False + except StaleElementReferenceException: + return True + + self.wait_for(link_has_gone_stale) + def login(self): if self.logged_in: return logger.info('Initiating log in') self.driver.get('https://venmo.com/account/sign-in') - (username, password), = self.wait_and_return( - self.find_username_and_password_in_any_frame) - logger.info('Entering username and password') - username.send_keys(self.credentials['username']) + #(username, password), = self.wait_and_return( + # self.find_username_and_password_in_any_frame) + username = self.wait_and_return(self.find_venmo_username)[0][0] + try: + logger.info('Entering username') + username.send_keys(self.credentials['username']) + username.send_keys(Keys.ENTER) + except ElementNotInteractableException: + # indicates that username already filled in + logger.info("Skipped") + password = self.wait_and_return(self.find_venmo_password)[0][0] + logger.info('Entering password') password.send_keys(self.credentials['password']) - with self.wait_for_page_load(): - password.send_keys(Keys.ENTER) + self.click_through_to_new_page("Sign in") logger.info('Logged in') self.logged_in = True @@ -173,7 +221,7 @@ def goto_statement(self, start_date, end_date): def download_csv(self): logger.info('Looking for CSV link') download_button, = self.wait_and_locate( - (By.XPATH, '//a[text() = "Download CSV"]')) + (By.XPATH, '//*[text() = "Download CSV"]')) self.click(download_button) logger.info('Waiting for CSV download') download_result, = self.wait_and_return(self.get_downloaded_file) @@ -182,8 +230,8 @@ def download_csv(self): def get_balance(self, balance_type): try: - balance_node = self.driver.find_element( - By.XPATH, '//*[@class="%s"]/child::*[@class="balance-amt"]' % + balance_node = self.driver.find_element( + By.XPATH, '//*[text() = "%s"]/following-sibling::*' % balance_type) return balance_node.text except NoSuchElementException: @@ -191,9 +239,11 @@ def get_balance(self, balance_type): def get_balances(self): def maybe_get_balance(): - start_balance = self.get_balance('start-balance') - end_balance = self.get_balance('end-balance') + start_balance = self.get_balance('Beginning amount') + end_balance = self.get_balance('Ending amount') if start_balance is not None and end_balance is not None: + start_balance = start_balance.replace("\n", "") + end_balance = end_balance.replace("\n", "") return (start_balance, end_balance) try: error_node = self.driver.find_element( @@ -303,13 +353,20 @@ def fetch_history(self): while start_date <= self.latest_history_date: end_date = min(self.latest_history_date, - start_date + datetime.timedelta(days=89)) + self.last_day_of_month(start_date)) self.fetch_statement(start_date, end_date) start_date = end_date + datetime.timedelta(days=1) logger.debug('Venmo hack: waiting 5 seconds between requests') time.sleep(5) + + def last_day_of_month(self, any_day): + # The day 28 exists in every month. 4 days later, it's always next month + next_month = any_day.replace(day=28) + datetime.timedelta(days=4) + # subtracting the number of the current day brings us back one month + return next_month - datetime.timedelta(days=next_month.day) + def run(self): self.login() self.fetch_history() From 6720650d5ee74bd806903e6e585eab2427a9e626 Mon Sep 17 00:00:00 2001 From: Jonathan Klabunde Tomer Date: Tue, 16 Apr 2024 20:22:19 -0700 Subject: [PATCH 3/5] amazon: update css selector for transaction history window dropdown (#97) --- finance_dl/amazon.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/finance_dl/amazon.py b/finance_dl/amazon.py index d6340dc..a68d76f 100644 --- a/finance_dl/amazon.py +++ b/finance_dl/amazon.py @@ -445,9 +445,7 @@ def retrieve_all_order_groups(): order_select_index = 0 while True: - (order_filter,), = self.wait_and_return( - lambda: self.find_visible_elements(By.XPATH, '//select[@name="orderFilter"]') - ) + order_filter, = self.wait_and_locate((By.CSS_SELECTOR, '#time-filter, #orderFilter')) order_select = Select(order_filter) num_options = len(order_select.options) if order_select_index >= num_options: From dca4505b7354f689d33ce2cf0d31e9e9ad4c364c Mon Sep 17 00:00:00 2001 From: Jonathan Klabunde Tomer Date: Tue, 16 Apr 2024 20:23:06 -0700 Subject: [PATCH 4/5] fix: ignore `.com.google.Chrome.*` files in download dir (#96) seems these now get written during in-progress downloads, confusing scrape_lib's detection of the download being finished and resulting in truncated downloads. --- finance_dl/scrape_lib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finance_dl/scrape_lib.py b/finance_dl/scrape_lib.py index fa4ca58..cfb4e1d 100644 --- a/finance_dl/scrape_lib.py +++ b/finance_dl/scrape_lib.py @@ -195,7 +195,7 @@ def get_downloaded_file(self): partial_names = [] other_names = [] for name in names: - if name.endswith('.part') or name.endswith('.crdownload'): + if name.endswith('.part') or name.endswith('.crdownload') or name.startswith('.com.google.Chrome'): partial_names.append(name) else: other_names.append(name) From a87f8f71bea5498fd20e72c78a308a79723f3f03 Mon Sep 17 00:00:00 2001 From: Jonathan Klabunde Tomer Date: Tue, 16 Apr 2024 20:25:04 -0700 Subject: [PATCH 5/5] minor improvements to PG&E scraper (#95) * actually skip downloads when skipping them * pge: use mobile site for login it seems to work more reliably * more PG&E fixes: * don't wait for page reload after entering credentials; website is now an SPA and does not reload * use CSS selectors rather than link text to find billing links for significant speedup --- finance_dl/pge.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/finance_dl/pge.py b/finance_dl/pge.py index dcf2a8f..689390d 100644 --- a/finance_dl/pge.py +++ b/finance_dl/pge.py @@ -104,15 +104,15 @@ def login(self): if self.logged_in: return logger.info('Initiating log in') - self.driver.get('https://www.pge.com/en/myhome/myaccount/index.page') + self.driver.get('https://m.pge.com/') (username, password), = self.wait_and_return( self.find_username_and_password_in_any_frame) logger.info('Entering username and password') username.send_keys(self.credentials['username']) password.send_keys(self.credentials['password']) - with self.wait_for_page_load(): - password.send_keys(Keys.ENTER) + password.send_keys(Keys.ENTER) + self.wait_and_return(lambda: self.find_visible_elements(By.ID, 'arrowBillPaymentHistory')) logger.info('Logged in') self.logged_in = True @@ -136,7 +136,7 @@ def process_download(self, download_result, output_dir): new_path = self.get_output_path(output_dir, date) if os.path.exists(new_path): logger.info('Skipping duplicate download: %s', date) - return True + return False tmp_path = new_path.replace('.pdf', '.tmp.pdf') with open(tmp_path, 'wb') as f: download_data = download_result[1] @@ -157,15 +157,11 @@ def get_bills(self, output_dir): actions.send_keys(Keys.ESCAPE) actions.perform() logger.info('Looking for download link') - (bills_link, ), = self.wait_and_return( - lambda: self.find_visible_elements_by_descendant_partial_text('BILL & PAYMENT HISTORY', 'h2')) + (bills_link, ), = self.wait_and_return(lambda: self.find_visible_elements(By.ID, 'arrowBillPaymentHistory')) scrape_lib.retry(lambda: self.click(bills_link), retry_delay=2) - (more_link, ), = self.wait_and_return( - lambda: self.find_visible_elements_by_descendant_partial_text('View up to 24 months of activity', 'a')) + (more_link, ), = self.wait_and_return(lambda: self.find_visible_elements(By.ID, 'href-view-24month-history')) scrape_lib.retry(lambda: self.click(more_link), retry_delay=2) - links, = self.wait_and_return( - lambda: self.find_visible_elements(By.PARTIAL_LINK_TEXT, "View Bill PDF") - ) + links, = self.wait_and_return(lambda: self.find_visible_elements(By.CSS_SELECTOR, ".utag-bill-history-view-bill-pdf")) for link in links: if not self.do_download_from_link(link, output_dir) and self.stop_early: