Scrape_Clean_Store.md

import sqlite3
from requests_html import HTMLSession
from fnmatch import fnmatch
from glom import glom
from openpyxl import load_workbook
from io import BytesIO
from itertools import islice

#open an HTML session and fetch the Fuel Check dataset page
session = HTMLSession()
r = session.get('https://data.nsw.gov.au/data/dataset/fuel-check')

The links (xlsx files) are listed in a JSON file.
The first step is to locate that JSON file on the page, download it, and extract the links to all the xlsx files.

#copy the XPath of the link from the webpage
#I used Google Chrome dev tools:
#right click on the relevant link, click Inspect, then Copy > Copy full XPath
#(with /@href appended to pull out the attribute itself)
sel = '/html/body/div[3]/div/div[2]/div/article/div/ul/li[1]/a/@href'

#access the json url from the page
json_link = r.html.xpath(sel)[0]

#append the json link to 'https://data.nsw.gov.au' and pass it to session.get again
url = f'https://data.nsw.gov.au{json_link}'
json_data = session.get(url).json()
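
As an aside, data.nsw.gov.au runs on CKAN, so the same metadata is usually also reachable directly through CKAN's package_show endpoint, which would skip the XPath step entirely. A hedged sketch, assuming the standard CKAN API route on this portal and the dataset id 'fuel-check':

#assumption: standard CKAN API route on this portal
api_url = 'https://data.nsw.gov.au/data/api/3/action/package_show?id=fuel-check'
json_data = session.get(api_url).json()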

The links are in the url key.
The path to each link is: json_data -> result -> resources -> url.
We'll filter the urls for only those that end with 'xlsx';
the filter could also be done on the format key, whose value contains 'xlsx'.
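
For orientation, the relevant slice of the response has roughly this shape (keys as used below; the values are placeholders, not real data):

#illustrative shape only
{'result': {'resources': [{'name': '<file name>',
                           'format': 'xlsx',
                           'url': 'https://<host>/<file>.xlsx'},
                          ...]}}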

#create a specification to glom the relevant data
#glom's documentation is pretty detailed
spec = {'name': ('result.resources', ['name']),
        'url': ('result.resources', ['url'])
       }

#extract data
outcome = glom(json_data,spec)

#get the names to be the keys, and urls to be the values
outcome = dict(zip(*outcome.values()))
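
glom keeps the extraction declarative; for comparison, the same name-to-url mapping could be built with a plain dict comprehension (an equivalent sketch, not what the script uses):

#equivalent to the glom spec plus the zip above
outcome = {res['name']: res['url']
           for res in json_data['result']['resources']}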

#keep only entries where the file ends with 'xlsx'
#we'll use the fnmatch function from the fnmatch module
pattern = '*.xlsx'

urls = {key:value 
        for key,value in outcome.items()
        if fnmatch(value,pattern)
       }
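
The rest of the walkthrough assumes the database and its two tables already exist; the original DDL isn't shown here. A minimal sketch consistent with the inserts used below (column types are assumptions):

#assumed schema, inferred from the INSERT statements below
con = sqlite3.connect('fuel_nsw_price_data.db')
con.executescript('''
CREATE TABLE IF NOT EXISTS fuel_month_year (
    fuel_month_year TEXT PRIMARY KEY
);
CREATE TABLE IF NOT EXISTS fuel_data (
    service_station_name TEXT,
    address TEXT,
    suburb TEXT,
    postcode TEXT,
    brand TEXT,
    fuelcode TEXT,
    transaction_date TEXT,
    price REAL
);
''')
con.close()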
#connect to database
con = sqlite3.connect('fuel_nsw_price_data.db')
cur = con.cursor()

query = '''SELECT fuel_month_year
           FROM fuel_month_year;'''

cur.execute(query)
#get the list of keys already in the database
db_table_list = cur.fetchall()
#keep only urls that have not been downloaded yet
download_keys = set(urls.keys()) - set(db[0] for db in db_table_list)

def key_generator():
    #executemany expects an iterable of tuples, hence the one-tuples
    for key in download_keys:
        yield (key,)

#add urls not yet downloaded to the database
insert_query = 'INSERT INTO fuel_month_year (fuel_month_year) VALUES (?);'
cur.executemany(insert_query,key_generator())
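#note: executemany accepts any iterable of parameter tuples,
#so key_generator() could equally be an inline generator expression:
#cur.executemany(insert_query, ((key,) for key in download_keys))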
#never forget to commit
con.commit()
#and close the connection
con.close()
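
As an aside, a sqlite3 connection can also be used as a context manager: the with-block commits on success and rolls back on an exception (it does not close the connection). A sketch of that shape:

#the with-block commits automatically on success
with sqlite3.connect('fuel_nsw_price_data.db') as con:
    con.executemany(insert_query, key_generator())
#closing is still explicit
con.close()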
#download list for our data
url_download_list = [urls[key] for key in download_keys]
print(len(url_download_list))
0
On this particular run every file was already recorded in the database, so the download list is empty and the loop below has nothing new to fetch; on a first run it would contain every monthly workbook.
#no transformations needed here
#simply download and store in database
#pandas not needed
#plus openpyxl is faster than pandas for this task
collection = []

for xlsx_link in url_download_list:
    #download the file
    r = session.get(xlsx_link)
    #read it in
    #note that the content is bytes, hence the BytesIO usage
    wb = load_workbook(BytesIO(r.content), read_only=True)
    #grab the first (active) sheet
    ws = wb.active

    #get the index of the header row
    index, header = next((index, value)
                         for index, value in enumerate(ws.values)
                         if 'ServiceStationName' in value)

    #get the rest of the data
    #ws.values returns a fresh generator on each access,
    #so islice skips everything up to and including the header
    body = list(islice(ws.values, index + 1, None))

    #read-only workbooks should be closed explicitly
    wb.close()

    #store away the data
    collection.extend(body)
#connect to database again
con = sqlite3.connect('fuel_nsw_price_data.db')
cur = con.cursor()
#we have eight columns in our database table
#and in the incoming data,
#so eight question-mark placeholders for our values
insert_params = ','.join(['?'] * 8)

#list of columns
columns_list = ','.join(['service_station_name','address','suburb','postcode','brand','fuelcode','transaction_date','price'])

#query string
insert_query = f"INSERT INTO fuel_data({columns_list}) VALUES ({insert_params});"

#speedy multiple row insertions
cur.executemany(insert_query, collection)

#without this, the information will not be saved
con.commit()

con.close()
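
As a quick sanity check (this step is not in the original script), the connection can be reopened to count what was stored:

#hypothetical verification, not part of the original pipeline
con = sqlite3.connect('fuel_nsw_price_data.db')
row_count = con.execute('SELECT COUNT(*) FROM fuel_data;').fetchone()[0]
print(f'{row_count} rows in fuel_data')
con.close()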