-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
83 lines (77 loc) · 1.76 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from google_search import google_search
from scraper import scrape
from constants import LIMIT_WEBSITES
def app(company_name):
    """Search the web for *company_name* and scrape the resulting URLs.

    Args:
        company_name: Company name used as the search query.

    Returns:
        A tuple ``(textual_data, structured_ca_data)``. Each element is a
        list of ``{'url': <str>, 'data': <list>}`` dicts: ``textual_data``
        holds unstructured text snippets per URL, ``structured_ca_data``
        holds structured records per URL (see the shape notes at the bottom
        of this module).

    Notes:
        Scraping stops after ``LIMIT_WEBSITES`` pages have yielded data.
        Failures on individual URLs are logged and skipped (best-effort);
        this function never raises for a single bad page.
    """
    search_data = google_search(company_name)
    urls = search_data.get('urls', [])
    textual_data = []
    structured_ca_data = []
    currently_scraped = 0
    for url in urls:
        # `>=` (not `==`) so the loop cannot run away if the counter ever
        # skips past the limit.
        if currently_scraped >= LIMIT_WEBSITES:
            break
        try:
            unstructured_data, structured_data = scrape(url)
            # Pages that yielded nothing don't count toward the limit.
            if not unstructured_data and not structured_data:
                continue
            if unstructured_data:
                textual_data.append({
                    'url': url,
                    'data': unstructured_data
                })
            if structured_data:
                structured_ca_data.append({
                    'url': url,
                    'data': structured_data
                })
            currently_scraped += 1
        except Exception as err:
            # Best-effort scraping: log the failure and move on to the
            # next URL rather than aborting the whole run.
            print(err)
            print(company_name)
            print(f"Error occurred while scraping {url}")
    return textual_data, structured_ca_data
"""
textual_data :
[
{
'url':......,
'data': [
'text1',
'text2',
]
},
{
'url':......,
'data': [
'text1',
'text2',
]
},
]
structured_ca_data :
[
{
'url':......,
'data': [
{
param1: val,
param2: val,
},
{
param1: val,
param2: val,
},
]
},
{
'url':......,
'data': [
{
param1: val,
param2: val,
},
]
},
]
"""