-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathShahidScraper.py
223 lines (193 loc) · 7.78 KB
/
ShahidScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
import json
import re
import requests
# Configuration
def _build_headers(lang):
    """Return browser-mimicking request headers for the given language code.

    Only the 'accept-language' (lowercase) and 'language' (uppercase)
    fields vary between the English and Arabic variants.
    """
    return {
        'accept': 'application/json, text/plain, */*',
        'accept-language': lang.lower(),
        'language': lang.upper(),
        'origin': 'https://shahid.mbc.net',
        'referer': 'https://shahid.mbc.net/',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Not A(Brand";v="8", "Chromium";v="132", "Google Chrome";v="132"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
    }

HEADERS_EN = _build_headers('en')
HEADERS_AR = _build_headers('ar')
BASE_URL = "https://api3.shahid.net/proxy/v2.1/"
def clean_url(url, source=None):
    """Strip the query string and fragment from an image URL.

    Logo images get a ``?type=png`` hint re-appended, triggered either by
    the JSON key the URL came from or by 'logo' appearing in the URL text.

    Args:
        url: Raw image URL; falsy values yield None.
        source: Optional JSON key the URL was found under; 'logoTitleImage'
            forces the PNG hint regardless of the URL text.

    Returns:
        The cleaned URL string, or None for falsy input.
    """
    if not url:
        return None
    base_url = url.split('?')[0].split('#')[0]
    # The original keyword list ['logotitle', 'logo'] was redundant: any URL
    # containing 'logotitle' necessarily contains 'logo'.
    if source == 'logoTitleImage' or 'logo' in url.lower():
        return f"{base_url}?type=png"
    return base_url
def extract_ids(url):
    """Extract (media_type, content_id, season_id) from a Shahid URL.

    Patterns are tried in order of specificity; all matching is
    case-insensitive and anchored to the end of the URL.

    Args:
        url: A Shahid page URL for a series, show, movie, or season.

    Returns:
        Tuple of (media_type, content_id, season_id) where media_type is one
        of 'season', 'series', 'show', 'movie' and season_id is None except
        for the combined series+season URL form.

    Raises:
        ValueError: If the URL matches no supported format.
    """
    # Combined form inside a series/shows path: season-{series_id}-{season_id}
    match = re.search(r'(?:series|shows)/.*season-(\d+)-(\d+)$', url, re.IGNORECASE)
    if match:
        return 'season', match.group(1), match.group(2)
    # Series or show without a season: series-{id} / show-{id}.
    # Normalize with .lower() — the old lambda compared the raw match against
    # 'series', so an uppercase 'Series-123' was misclassified as 'show'.
    match = re.search(r'(series|show)-(\d+)$', url, re.IGNORECASE)
    if match:
        return match.group(1).lower(), match.group(2), None
    # Movie: movie-{id}
    match = re.search(r'movie-(\d+)$', url, re.IGNORECASE)
    if match:
        return 'movie', match.group(1), None
    # Bare season in the path: /season-{season_id}
    match = re.search(r'/season-(\d+)$', url, re.IGNORECASE)
    if match:
        return 'season', match.group(1), None
    raise ValueError(f"Unsupported URL format: {url}")
def fetch_media_data(content_id, media_type, season_id=None, headers=HEADERS_EN, timeout=30):
    """Fetch product metadata for one media item from the Shahid API.

    Args:
        content_id: Numeric ID string extracted from the page URL.
        media_type: One of 'movie', 'series', 'show', 'season'.
        season_id: Season ID when the URL carried both a series and a season
            ID; for bare season URLs the content_id is used instead.
        headers: Request headers; the 'language' fields select the response
            language (HEADERS_EN or HEADERS_AR).
        timeout: Seconds before the HTTP request is aborted. Added because
            requests has no default timeout, so a stalled server would hang
            the script indefinitely.

    Returns:
        The decoded JSON response as a dict.

    Raises:
        ValueError: For an unsupported media_type.
        requests.exceptions.RequestException: On HTTP errors or timeout.
    """
    endpoint = 'playableAsset'
    if media_type == 'movie':
        # Movies use a different endpoint from show/series/season lookups.
        endpoint = 'product/id'
        request_data = {'id': content_id, 'productType': 'MOVIE'}
    elif media_type == 'series':
        request_data = {
            'showId': content_id,
            'productType': 'SERIES',
            'productSubType': 'SERIES'
        }
    elif media_type == 'show':
        request_data = {
            'showId': content_id,
            'productType': 'PROGRAM',
            'productSubType': 'PROGRAM'
        }
    elif media_type == 'season':
        # A bare /season-{id} URL yields no separate season_id, so the
        # content_id itself identifies the season.
        request_data = {
            'seasonId': season_id or content_id,
            'productType': 'SEASON_SERIES',
            'productSubType': 'SEASON_SERIES'
        }
    else:
        raise ValueError(f"Unsupported media type: {media_type}")
    # The API expects the request payload JSON-encoded into a query parameter.
    request_params = {'country': 'EG', 'request': json.dumps(request_data)}
    response = requests.get(
        f"{BASE_URL}{endpoint}",
        params=request_params,
        headers=headers,
        timeout=timeout
    )
    response.raise_for_status()
    return response.json()
def extract_bilingual_info(data_en, data_ar):
    """Extract title/description pairs from English and Arabic API responses.

    Args:
        data_en: Decoded JSON from the English-language request.
        data_ar: Decoded JSON from the Arabic-language request.

    Returns:
        {'en': {'title': ..., 'description': ...},
         'ar': {'title': ..., 'description': ...}} with '' fallbacks when
        no field is present.
    """
    # The original inner helper took an unused `lang` parameter; removed.
    def _info(data):
        # Prefer show-level metadata, then season-level, then the product
        # itself (falling back from 'description' to 'longDescription').
        product = data.get('productModel', {})
        show = product.get('show', {})
        season = product.get('season', {})
        return {
            'title': show.get('title') or season.get('title') or product.get('title', ''),
            'description': show.get('description') or season.get('description')
            or product.get('description') or product.get('longDescription', '')
        }
    return {'en': _info(data_en), 'ar': _info(data_ar)}
def extract_artwork(data_ar):
    """Collect deduplicated (label, url) artwork pairs from the Arabic response.

    Scans the product model, its nested 'show', 'season', and 'image' dicts
    for known artwork keys, plus any 'image' sub-dict entries; unknown keys
    are labelled 'Other'. URLs are cleaned via clean_url and deduplicated.
    """
    labels = {
        'thumbnailImage': 'Thumbnail',
        'posterImage': 'Poster',
        'heroSliderImage': 'Hero Banner',
        'landscapeClean': 'Landscape Clean',
        'posterClean': 'Clean Poster',
        'logoTitleImage': 'Logo',
        'posterHero': 'Hero Poster',
        'verticalPoster': 'Vertical Poster',
        'posterBundle': 'Bundle Poster',
        'onboardingCategoryImage': 'Category Image'
    }
    collected = []
    seen_urls = set()

    def record(key, raw_url):
        # Skip empty results and duplicates after cleaning.
        cleaned = clean_url(raw_url, key)
        if not cleaned or cleaned in seen_urls:
            return
        seen_urls.add(cleaned)
        collected.append((labels.get(key, 'Other'), cleaned))

    product = data_ar.get('productModel', {})
    candidates = [
        product,
        product.get('show', {}),
        product.get('season', {}),
        product.get('image', {})
    ]
    for candidate in candidates:
        if not isinstance(candidate, dict):
            continue
        for key in labels:
            record(key, candidate.get(key))
        nested_images = candidate.get('image')
        if nested_images:
            for nested_key, nested_url in nested_images.items():
                record(nested_key, nested_url)
    return collected
def main():
    """Prompt for a Shahid URL, fetch bilingual metadata, and print it."""
    url = input("Enter Shahid URL: ").strip()
    if not url:
        print("Error: URL input required")
        return
    try:
        media_type, content_id, season_id = extract_ids(url)
        # One request per language so titles/descriptions come back localized.
        data_en = fetch_media_data(content_id, media_type, season_id, HEADERS_EN)
        data_ar = fetch_media_data(content_id, media_type, season_id, HEADERS_AR)
        if not (data_en.get('success') and data_ar.get('success')):
            print("API request failed")
            return
        info = extract_bilingual_info(data_en, data_ar)
        # Leading newline only before the first section, matching the
        # original output layout exactly.
        for prefix, label, lang in (("\n", "English", 'en'), ("", "Arabic", 'ar')):
            section = info[lang]
            print(f"{prefix}{label} Version:")
            print(f"Title: {section['title']}")
            print(f"Description: {section['description']}\n")
        artworks = extract_artwork(data_ar)
        if not artworks:
            print("No artwork found")
        else:
            print("Artwork URLs:")
            for art_type, art_url in artworks:
                print(f"{art_type}: {art_url}")
    except Exception as e:
        print(f"\nError: {str(e)}")
        print("Troubleshooting steps:")
        print("- Verify URL validity (supported formats: series, show, movie with/without season)")
        print("- Check network connection")
        print("- Try again later if service is unavailable")
if __name__ == "__main__":
    main()