forked from ArrowheadAnalytics/next-gen-scrapy-2.0
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.py
103 lines (86 loc) · 2.94 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
"""
Author: Sarah Mallepalle (updated by Ethan Douglas to work in Python 3)
Use BeautifulSoup to scrape https://nextgenstats.nfl.com/ for all available extra-large (1200 x 1200)
pass chart images and corresponding pass chart data from the 2017 & 2018 NFL seasons,
and save the image and its corresponding data to the folder 'Pass_Charts'.
Folder format: ./Pass_Charts/[team]/[season]/[week]/{[images], [data]}/[last_name]_[first_name]_[positon].{jpeg, txt}
Example:
Image path = ./Pass_Charts/philadelphia-eagles/2017/super-bowl/images/Foles_Nick_QB.jpeg
Data path = ./Pass_Charts/philadelphia-eagles/2017/super-bowl/data/Foles_Nick_QB.txt
"""
import requests
from bs4 import BeautifulSoup
import re
import json
import urllib
import os
import urllib.request
import cv2
import json
import scipy.misc
import pandas as pd
teams = [#"arizona-cardinals",
#"atlanta-falcons",
#"baltimore-ravens",
#"buffalo-bills",
#"carolina-panthers",
#"chicago-bears",
#"cincinnati-bengals",
#"cleveland-browns",
#"dallas-cowboys",
#"denver-broncos",
#"detroit-lions",
#"green-bay-packers",
#"houston-texans",
#"indianapolis-colts",
#"jacksonville-jaguars",
#"kansas-city-chiefs",
#"los-angeles-chargers",
#"los-angeles-rams",
#"miami-dolphins",
#"minnesota-vikings",
#"new-england-patriots",
#"new-orleans-saints",
"new-york-giants",
"new-york-jets",
"oakland-raiders",
"philadelphia-eagles",
"pittsburgh-steelers",
"san-francisco-49ers",
"seattle-seahawks",
"tampabay-buccaneers",
"tennessee-titans",
"washington-redskins"
]
seasons = ["2019"]
weeks = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "wild-card", "divisional", "conference", "super-bowl"]
pattern = re.compile("charts")
print("Scraping images and html data...")
for team in teams:
for season in seasons:
for week in weeks:
print(team, "\t", season, "\t", week)
URL = "https://nextgenstats.nfl.com/charts/list/pass/" + team + "/" + season + "/" + week
r = requests.get(URL)
soup = BeautifulSoup(r.content, "html.parser")
script = soup.find_all("script", text=pattern)
contains_charts = json.loads(str(script[0])[33:-131])
if (len(contains_charts["charts"]["charts"]) != 0):
for chart in contains_charts["charts"]["charts"]['charts']:
name = chart["lastName"] + "_" + chart["firstName"] + "_" + chart["position"]
chart["team"] = team
folder = str("Pass_Charts" + os.sep + team + os.sep + season + os.sep + week + os.sep)
img_folder = folder + "images" + os.sep
data_folder = folder + "data" + os.sep
if not os.path.exists(img_folder):
os.makedirs(img_folder)
if not os.path.exists(data_folder):
os.makedirs(data_folder)
img_file = img_folder + name + ".jpeg"
url = "https:" + chart["extraLargeImg"]
urllib.request.urlretrieve(url, img_file)
data_file = data_folder + name + ".txt"
with open(data_file, 'w') as datafile:
json.dump(chart, datafile)
print("Done.")