Skip to content

Commit

Permalink
Export 07-05-21
Browse files Browse the repository at this point in the history
  • Loading branch information
shaansubbaiah committed May 7, 2021
1 parent d70c26d commit 101f62c
Show file tree
Hide file tree
Showing 7 changed files with 35,634 additions and 27,850 deletions.
8,896 changes: 0 additions & 8,896 deletions export/07-05-21/recipes.csv

This file was deleted.

8,895 changes: 0 additions & 8,895 deletions export/07-05-21/recipes.jsonl

This file was deleted.

10,058 changes: 0 additions & 10,058 deletions export/07-05-21/scrapelog.txt

This file was deleted.

35,517 changes: 35,517 additions & 0 deletions export/scraped-07-05-21.csv

Large diffs are not rendered by default.

94 changes: 94 additions & 0 deletions extras/combine_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import pandas as pd

# Combine the partial scrape exports into one de-duplicated CSV.
#
# Each input file is loaded in order, with a preview and row count echoed so
# the run can be sanity-checked by eye (sample output is recorded in the
# comments at the bottom of this file).
source_files = ('1.csv', '2.csv', '3.csv')

frames = []
for csv_name in source_files:
    frame = pd.read_csv(csv_name)
    print(frame.head())
    print(len(frame))
    frames.append(frame)

# Sum of the individual row counts, before any merging.
print(f'total: {sum(len(frame) for frame in frames)}')

# Stack the exports on top of each other; duplicates are still present here.
merged = pd.concat(frames)
print(merged.head())
combined_len = len(merged)
print(combined_len)

# Drop rows that are identical across every column.
deduped = merged.drop_duplicates()
print(deduped.head())
unique_len = len(deduped)
print(unique_len)
print(f'unique% = {unique_len/combined_len*100}')

# Persist without the index column so the output keeps the input schema.
deduped.to_csv('combined.csv', index=False)

# Read the file back to confirm what was actually written to disk.
reloaded = pd.read_csv('combined.csv')
print(reloaded.head())
print(len(reloaded))

# # Scraped from /recipes
# name ... omega_6_fatty_acid_g
# 0 Simple Macaroni and Cheese ... NaN
# 1 Gourmet Mushroom Risotto ... NaN
# 2 Dessert Crepes ... NaN
# 3 Pork Steaks ... NaN
# 4 Quick and Easy Pizza Crust ... NaN

# [5 rows x 47 columns]
# 8895

# # Scraped from every other main category
# name ... omega_6_fatty_acid_g
# 0 Dessert Crepes ... NaN
# 1 Thin-Crust Pizza Dough ... NaN
# 2 Chocolate-Covered Raspberry Brownies ... NaN
# 3 Käsesahnetorte(German Yogurt Mousse Cake) ... NaN
# 4 Brazilian Cheese Bread(Pao de Queijo) ... NaN

# [5 rows x 47 columns]
# 29188

# # Scraped from appetizers-and-snacks category
# name ... omega_6_fatty_acid_g
# 0 Perfect Pot Stickers ... NaN
# 1 Dessert Crepes ... NaN
# 2 Curried Chicken Lettuce Wraps ... NaN
# 3 Best Ever Crab Cakes ... NaN
# 4 Scrumptious Salmon Cakes ... NaN

# [5 rows x 47 columns]
# 8062

# # Total rows: /recipes listing + other main categories + appetizers-and-snacks category
# total: 46145

# # Combined CSV
# name ... omega_6_fatty_acid_g
# 0 Simple Macaroni and Cheese ... NaN
# 1 Gourmet Mushroom Risotto ... NaN
# 2 Dessert Crepes ... NaN
# 3 Pork Steaks ... NaN
# 4 Quick and Easy Pizza Crust ... NaN

# [5 rows x 47 columns]
# 46145

# # Combined CSV without duplicates
# name ... omega_6_fatty_acid_g
# 0 Simple Macaroni and Cheese ... NaN
# 1 Gourmet Mushroom Risotto ... NaN
# 2 Dessert Crepes ... NaN
# 3 Pork Steaks ... NaN
# 4 Quick and Easy Pizza Crust ... NaN

# [5 rows x 47 columns]
# 35516
# unique% = 76.96608516632355
File renamed without changes.
24 changes: 23 additions & 1 deletion recipescrape/spiders/recipes.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,29 @@ class RecipeSpider(CrawlSpider):
name = 'recipes'
allowed_domains = ['allrecipes.com']

start_urls = ['https://www.allrecipes.com/recipes/?page=2']
start_urls = [
'https://www.allrecipes.com/recipes/?page=2',
'https://www.allrecipes.com/recipes/76/appetizers-and-snacks/?page=2',
'https://www.allrecipes.com/recipes/88/bbq-grilling/?page=2',
'https://www.allrecipes.com/recipes/156/bread/?page=2',
'https://www.allrecipes.com/recipes/78/breakfast-and-brunch/?page=2',
'https://www.allrecipes.com/recipes/79/desserts/?page=2',
'https://www.allrecipes.com/recipes/17562/dinner/?page=2',
'https://www.allrecipes.com/recipes/1642/everyday-cooking/?page=2',
'https://www.allrecipes.com/recipes/84/healthy-recipes/?page=2',
'https://www.allrecipes.com/recipes/85/holidays-and-events/?page=2',
'https://www.allrecipes.com/recipes/17567/ingredients/?page=2',
'https://www.allrecipes.com/recipes/17561/lunch/?page=2',
'https://www.allrecipes.com/recipes/80/main-dish/?page=2',
'https://www.allrecipes.com/recipes/92/meat-and-poultry/?page=2',
'https://www.allrecipes.com/recipes/95/pasta-and-noodles/?page=2',
'https://www.allrecipes.com/recipes/96/salad/?page=2',
'https://www.allrecipes.com/recipes/93/seafood/?page=2',
'https://www.allrecipes.com/recipes/81/side-dish/?page=2',
'https://www.allrecipes.com/recipes/94/soups-stews-and-chili/?page=2',
'https://www.allrecipes.com/recipes/236/us-recipes/?page=2',
'https://www.allrecipes.com/recipes/86/world-cuisine/?page=2',
]

custom_settings = {
'FEED_EXPORT_FIELDS': FIELDS,
Expand Down

0 comments on commit 101f62c

Please sign in to comment.