-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathShahidScraper.py
223 lines (193 loc) · 7.78 KB
/
ShahidScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
import json
import re
import requests
# Configuration
def _build_headers(lang):
    """Return browser-mimicking request headers for the given language code.

    Only the 'accept-language' (lowercase) and 'language' (uppercase)
    fields vary between the English and Arabic variants.
    """
    return {
        'accept': 'application/json, text/plain, */*',
        'accept-language': lang.lower(),
        'language': lang.upper(),
        'origin': 'https://shahid.mbc.net',
        'referer': 'https://shahid.mbc.net/',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Not A(Brand";v="8", "Chromium";v="132", "Google Chrome";v="132"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
    }

HEADERS_EN = _build_headers('en')
HEADERS_AR = _build_headers('ar')
BASE_URL = "https://api3.shahid.net/proxy/v2.1/"
def clean_url(url, source=None):
    """Strip the query string and fragment from an image URL.

    Logo images get a ``?type=png`` hint re-appended, triggered either by
    the JSON key the URL came from or by 'logo' appearing in the URL text.

    Args:
        url: Raw image URL; falsy values yield None.
        source: Optional JSON key the URL was found under; 'logoTitleImage'
            forces the PNG hint regardless of the URL text.

    Returns:
        The cleaned URL string, or None for falsy input.
    """
    if not url:
        return None
    base_url = url.split('?')[0].split('#')[0]
    # The original keyword list ['logotitle', 'logo'] was redundant: any URL
    # containing 'logotitle' necessarily contains 'logo'.
    if source == 'logoTitleImage' or 'logo' in url.lower():
        return f"{base_url}?type=png"
    return base_url
def extract_ids(url):
    """Extract (media_type, content_id, season_id) from a Shahid URL.

    Patterns are tried in order of specificity; all matching is
    case-insensitive and anchored to the end of the URL.

    Args:
        url: A Shahid page URL for a series, show, movie, or season.

    Returns:
        Tuple of (media_type, content_id, season_id) where media_type is one
        of 'season', 'series', 'show', 'movie' and season_id is None except
        for the combined series+season URL form.

    Raises:
        ValueError: If the URL matches no supported format.
    """
    # Combined form inside a series/shows path: season-{series_id}-{season_id}
    match = re.search(r'(?:series|shows)/.*season-(\d+)-(\d+)$', url, re.IGNORECASE)
    if match:
        return 'season', match.group(1), match.group(2)
    # Series or show without a season: series-{id} / show-{id}.
    # Normalize with .lower() — the old lambda compared the raw match against
    # 'series', so an uppercase 'Series-123' was misclassified as 'show'.
    match = re.search(r'(series|show)-(\d+)$', url, re.IGNORECASE)
    if match:
        return match.group(1).lower(), match.group(2), None
    # Movie: movie-{id}
    match = re.search(r'movie-(\d+)$', url, re.IGNORECASE)
    if match:
        return 'movie', match.group(1), None
    # Bare season in the path: /season-{season_id}
    match = re.search(r'/season-(\d+)$', url, re.IGNORECASE)
    if match:
        return 'season', match.group(1), None
    raise ValueError(f"Unsupported URL format: {url}")
def fetch_media_data(content_id, media_type, season_id=None, headers=HEADERS_EN, timeout=30):
    """Fetch product metadata for one media item from the Shahid API.

    Args:
        content_id: Numeric ID string extracted from the page URL.
        media_type: One of 'movie', 'series', 'show', 'season'.
        season_id: Season ID when the URL carried both a series and a season
            ID; for bare season URLs the content_id is used instead.
        headers: Request headers; the 'language' fields select the response
            language (HEADERS_EN or HEADERS_AR).
        timeout: Seconds before the HTTP request is aborted. Added because
            requests has no default timeout, so a stalled server would hang
            the script indefinitely.

    Returns:
        The decoded JSON response as a dict.

    Raises:
        ValueError: For an unsupported media_type.
        requests.exceptions.RequestException: On HTTP errors or timeout.
    """
    endpoint = 'playableAsset'
    if media_type == 'movie':
        # Movies use a different endpoint from show/series/season lookups.
        endpoint = 'product/id'
        request_data = {'id': content_id, 'productType': 'MOVIE'}
    elif media_type == 'series':
        request_data = {
            'showId': content_id,
            'productType': 'SERIES',
            'productSubType': 'SERIES'
        }
    elif media_type == 'show':
        request_data = {
            'showId': content_id,
            'productType': 'PROGRAM',
            'productSubType': 'PROGRAM'
        }
    elif media_type == 'season':
        # A bare /season-{id} URL yields no separate season_id, so the
        # content_id itself identifies the season.
        request_data = {
            'seasonId': season_id or content_id,
            'productType': 'SEASON_SERIES',
            'productSubType': 'SEASON_SERIES'
        }
    else:
        raise ValueError(f"Unsupported media type: {media_type}")
    # The API expects the request payload JSON-encoded into a query parameter.
    request_params = {'country': 'EG', 'request': json.dumps(request_data)}
    response = requests.get(
        f"{BASE_URL}{endpoint}",
        params=request_params,
        headers=headers,
        timeout=timeout
    )
    response.raise_for_status()
    return response.json()
def extract_bilingual_info(data_en, data_ar):
    """Extract title/description pairs from English and Arabic API responses.

    Args:
        data_en: Decoded JSON from the English-language request.
        data_ar: Decoded JSON from the Arabic-language request.

    Returns:
        {'en': {'title': ..., 'description': ...},
         'ar': {'title': ..., 'description': ...}} with '' fallbacks when
        no field is present.
    """
    # The original inner helper took an unused `lang` parameter; removed.
    def _info(data):
        # Prefer show-level metadata, then season-level, then the product
        # itself (falling back from 'description' to 'longDescription').
        product = data.get('productModel', {})
        show = product.get('show', {})
        season = product.get('season', {})
        return {
            'title': show.get('title') or season.get('title') or product.get('title', ''),
            'description': show.get('description') or season.get('description')
            or product.get('description') or product.get('longDescription', '')
        }
    return {'en': _info(data_en), 'ar': _info(data_ar)}
def extract_artwork(data_ar):
    """Collect deduplicated (label, url) artwork pairs from the Arabic response.

    Scans the product model, its nested 'show', 'season', and 'image' dicts
    for known artwork keys, plus any 'image' sub-dict entries; unknown keys
    are labelled 'Other'. URLs are cleaned via clean_url and deduplicated.
    """
    labels = {
        'thumbnailImage': 'Thumbnail',
        'posterImage': 'Poster',
        'heroSliderImage': 'Hero Banner',
        'landscapeClean': 'Landscape Clean',
        'posterClean': 'Clean Poster',
        'logoTitleImage': 'Logo',
        'posterHero': 'Hero Poster',
        'verticalPoster': 'Vertical Poster',
        'posterBundle': 'Bundle Poster',
        'onboardingCategoryImage': 'Category Image'
    }
    collected = []
    seen_urls = set()

    def record(key, raw_url):
        # Skip empty results and duplicates after cleaning.
        cleaned = clean_url(raw_url, key)
        if not cleaned or cleaned in seen_urls:
            return
        seen_urls.add(cleaned)
        collected.append((labels.get(key, 'Other'), cleaned))

    product = data_ar.get('productModel', {})
    candidates = [
        product,
        product.get('show', {}),
        product.get('season', {}),
        product.get('image', {})
    ]
    for candidate in candidates:
        if not isinstance(candidate, dict):
            continue
        for key in labels:
            record(key, candidate.get(key))
        nested_images = candidate.get('image')
        if nested_images:
            for nested_key, nested_url in nested_images.items():
                record(nested_key, nested_url)
    return collected
def main():
    """Prompt for a Shahid URL, fetch bilingual metadata, and print it."""
    url = input("Enter Shahid URL: ").strip()
    if not url:
        print("Error: URL input required")
        return
    try:
        media_type, content_id, season_id = extract_ids(url)
        # One request per language so titles/descriptions come back localized.
        data_en = fetch_media_data(content_id, media_type, season_id, HEADERS_EN)
        data_ar = fetch_media_data(content_id, media_type, season_id, HEADERS_AR)
        if not (data_en.get('success') and data_ar.get('success')):
            print("API request failed")
            return
        info = extract_bilingual_info(data_en, data_ar)
        # Leading newline only before the first section, matching the
        # original output layout exactly.
        for prefix, label, lang in (("\n", "English", 'en'), ("", "Arabic", 'ar')):
            section = info[lang]
            print(f"{prefix}{label} Version:")
            print(f"Title: {section['title']}")
            print(f"Description: {section['description']}\n")
        artworks = extract_artwork(data_ar)
        if not artworks:
            print("No artwork found")
        else:
            print("Artwork URLs:")
            for art_type, art_url in artworks:
                print(f"{art_type}: {art_url}")
    except Exception as e:
        print(f"\nError: {str(e)}")
        print("Troubleshooting steps:")
        print("- Verify URL validity (supported formats: series, show, movie with/without season)")
        print("- Check network connection")
        print("- Try again later if service is unavailable")
if __name__ == "__main__":
    main()