# AdvertiseOnTwitch Video Extraction

from dataclasses import dataclass
from typing import List
import requests
import re
from urllib.parse import urljoin


# Page that is downloaded and scanned for <script> bundles; it is also the
# base against which relative script and video URLs are resolved.
BASE_URL = "https://advertiseontwitch.com/"


@dataclass
class VideoInfo:
    """A single video record and the metadata that accompanies it.

    Instances are mutable, so a field such as ``source_url`` can be
    reassigned after construction.
    """

    id: int  # numeric identifier of the record
    streamer_username: str
    source_url: str  # location of the video
    brand_id: str  # shown as the "Advertiser" in the string form
    tags: str
    notes: str

    def __str__(self):
        """Return a one-line summary: advertiser, streamer and video URL."""
        summary = f"Advertiser: {self.brand_id} | Streamer: {self.streamer_username} | Video URL: {self.source_url}"
        return summary


def fetch_url(url, timeout: float = 30.0) -> str:
    """Download *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled server
            would hang the whole run.

    Returns:
        The decoded response body.

    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status.
        requests.exceptions.Timeout: The server did not respond within
            *timeout* seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Turn error statuses into exceptions instead of returning an error page.
    response.raise_for_status()
    return response.text


# Opening <script ...> tag with a quoted ``src`` attribute.
#   * IGNORECASE: HTML tag and attribute names are case-insensitive.
#   * ``\ssrc``: require whitespace before the name so ``data-src`` and
#     similar attributes are not mistaken for ``src``.
#   * ``\1``: the value ends at the same kind of quote that opened it, so a
#     URL may contain the other quote character.
_SCRIPT_SRC_PATTERN = re.compile(
    r'<script\b[^>]*?\ssrc\s*=\s*(["\'])(.*?)\1',
    re.IGNORECASE,
)


def extract_js_urls(html, base_url) -> List[str]:
    """Return the absolute URL of every external script referenced in *html*.

    Args:
        html: HTML markup to scan.
        base_url: URL the markup was loaded from; relative ``src`` values are
            resolved against it.

    Returns:
        Script URLs in document order (duplicates are kept). Inline scripts
        without a ``src`` attribute contribute nothing.
    """
    return [
        urljoin(base_url, match.group(2))
        for match in _SCRIPT_SRC_PATTERN.finditer(html)
    ]


# Body of a double-quoted JavaScript string literal: any run of characters
# that are neither a quote nor a backslash, or a backslash escape. Unlike a
# lazy ``.*?`` this can never run past the literal's own closing quote, so a
# malformed object cannot swallow the objects that follow it.
_JS_STRING_VALUE = r'"((?:[^"\\]|\\.)*)"'

# Object literal of the form
# {id:1,streamer_username:"..",source_url:"..",brand_id:"..",tags:"..",notes:".."}
_VIDEO_INFO_PATTERN = re.compile(
    r'\{id:(\d+)'
    + "".join(
        "," + field + ":" + _JS_STRING_VALUE
        for field in ("streamer_username", "source_url", "brand_id", "tags", "notes")
    )
    + r'\}'
)


def extract_source_paths(js_content) -> List[VideoInfo]:
    """Extract every video record embedded in a piece of JavaScript source.

    A record is an object literal with exactly the keys ``id``,
    ``streamer_username``, ``source_url``, ``brand_id``, ``tags`` and
    ``notes``, in that order, with no whitespace between tokens.

    Args:
        js_content: JavaScript source text to scan.

    Returns:
        One ``VideoInfo`` per matching object, in order of appearance.
        String values are returned exactly as written in the source;
        backslash escapes are not decoded.
    """
    return [
        VideoInfo(
            id=int(record_id),
            streamer_username=streamer,
            source_url=source,
            brand_id=brand,
            tags=tags,
            notes=notes,
        )
        for record_id, streamer, source, brand, tags, notes
        in _VIDEO_INFO_PATTERN.findall(js_content)
    ]


def extract_video_urls(url: str) -> List[VideoInfo]:
    """Collect the video records exposed by the scripts of the page at *url*.

    The page is downloaded, every script it references is downloaded in turn,
    and each script is scanned for video records. Each record's
    ``source_url`` is rewritten in place to an absolute URL resolved against
    *url*.

    Args:
        url: Address of the page to inspect.

    Returns:
        All records found, grouped by script in the order the scripts appear
        on the page.
    """
    page_html = fetch_url(url)
    collected = []
    for script_url in extract_js_urls(page_html, url):
        records = extract_source_paths(fetch_url(script_url))
        for record in records:
            # Paths inside the script may be relative to the site.
            record.source_url = urljoin(url, record.source_url)
        collected += records
    return collected


# Run the extraction against BASE_URL and print one summary line per video.
# NOTE(review): these statements are at module top level, so they execute --
# including the network requests -- whenever this file is run or imported.
video_info_list = extract_video_urls(BASE_URL)
for video_info in video_info_list:
    print(video_info)