Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from dataclasses import dataclass
- from typing import List
- import requests
- import re
- from urllib.parse import urljoin
# Landing page that is scraped; also the base against which relative
# script and video URLs are resolved.
BASE_URL = "https://advertiseontwitch.com/"
@dataclass
class VideoInfo:
    """One video record extracted from the site's JavaScript bundles.

    Instances are intentionally mutable: ``source_url`` is rewritten to an
    absolute URL after extraction.
    """

    # Numeric identifier of the record.
    id: int
    # Username of the streamer associated with the video.
    streamer_username: str
    # Location of the video file (may be relative when first extracted).
    source_url: str
    # Identifier of the advertising brand.
    brand_id: str
    # Raw ``tags`` value as it appears in the record.
    tags: str
    # Raw ``notes`` value as it appears in the record.
    notes: str

    def __str__(self) -> str:
        """Return a one-line summary: advertiser, streamer and video URL."""
        return f"Advertiser: {self.brand_id} | Streamer: {self.streamer_username} | Video URL: {self.source_url}"
def fetch_url(url, timeout: float = 10.0) -> str:
    """Download *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Requests
            applies no timeout by default, so without this an unresponsive
            host would block the script forever.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error statuses instead of parsing an error page.
    response.raise_for_status()
    return response.text
def extract_js_urls(html, base_url) -> List[str]:
    """Collect the ``src`` of every ``<script ... src=...>`` tag in *html*.

    Args:
        html: HTML markup to scan.
        base_url: URL used to resolve relative ``src`` values.

    Returns:
        Absolute script URLs, in the order the tags appear in *html*.
        Inline scripts (no ``src`` attribute) contribute nothing.
    """
    script_src_re = re.compile(r'<script[^>]+src=["\'](.*?)["\']')
    resolved = []
    for tag in script_src_re.finditer(html):
        # group(1) is the raw attribute value between the quotes.
        resolved.append(urljoin(base_url, tag.group(1)))
    return resolved
def extract_source_paths(js_content) -> List[VideoInfo]:
    """Parse video records embedded as object literals in JavaScript text.

    A record is matched only when it has exactly the keys ``id``,
    ``streamer_username``, ``source_url``, ``brand_id``, ``tags`` and
    ``notes`` in that order, with no whitespace between tokens.

    Args:
        js_content: JavaScript source to scan.

    Returns:
        One ``VideoInfo`` per matched record, in order of appearance;
        an empty list when nothing matches.
    """
    record_re = re.compile(
        r'\{id:(\d+),streamer_username:"(.*?)",source_url:"(.*?)",brand_id:"(.*?)",tags:"(.*?)",notes:"(.*?)"\}'
    )
    videos = []
    for record in record_re.finditer(js_content):
        raw_id, streamer, source, brand, tags, notes = record.groups()
        videos.append(
            VideoInfo(
                id=int(raw_id),  # the pattern guarantees digits only
                streamer_username=streamer,
                source_url=source,
                brand_id=brand,
                tags=tags,
                notes=notes,
            )
        )
    return videos
def extract_video_urls(url: str) -> List[VideoInfo]:
    """Scrape *url* and every script it references for video records.

    The page is downloaded, each ``<script src=...>`` it references is
    downloaded in turn, and the video records found in those scripts are
    gathered. Each record's ``source_url`` is rewritten in place to an
    absolute URL resolved against *url*.

    Args:
        url: Page to start from.

    Returns:
        All records found, grouped by script in page order.
    """
    page_markup = fetch_url(url)
    collected = []
    for script_url in extract_js_urls(page_markup, url):
        records = extract_source_paths(fetch_url(script_url))
        for record in records:
            # Resolve against the page URL, not the script URL.
            record.source_url = urljoin(url, record.source_url)
        collected += records
    return collected
# Script body: scrape BASE_URL and print one summary line per video found.
video_info_list = extract_video_urls(BASE_URL)
for video_info in video_info_list:
    print(str(video_info))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement