Advertisement
justinooo

AdvertiseOnTwitch Video Extraction

Oct 31st, 2024 (edited)
98
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.92 KB | None | 0 0
  1. from dataclasses import dataclass
  2. from typing import List
  3. import requests
  4. import re
  5. from urllib.parse import urljoin
  6.  
  7.  
  8. BASE_URL = "https://advertiseontwitch.com/"
  9.  
  10.  
  11. @dataclass
  12. class VideoInfo:
  13.     id: int
  14.     streamer_username: str
  15.     source_url: str
  16.     brand_id: str
  17.     tags: str
  18.     notes: str
  19.  
  20.     def __str__(self):
  21.         return f"Advertiser: {self.brand_id} | Streamer: {self.streamer_username} | Video URL: {self.source_url}"
  22.  
  23.  
  24. def fetch_url(url) -> str:
  25.     response = requests.get(url)
  26.     response.raise_for_status()
  27.     return response.text
  28.  
  29.  
  30. def extract_js_urls(html, base_url) -> List[str]:
  31.     js_src_pattern = r'<script[^>]+src=["\'](.*?)["\']'
  32.     js_sources = re.findall(js_src_pattern, html)
  33.     return [urljoin(base_url, js_src) for js_src in js_sources]
  34.  
  35.  
  36. def extract_source_paths(js_content) -> List[VideoInfo]:
  37.     video_info_pattern = (
  38.         r'\{id:(\d+),streamer_username:"(.*?)",source_url:"(.*?)",brand_id:"(.*?)",tags:"(.*?)",notes:"(.*?)"\}'
  39.     )
  40.     matches = re.findall(video_info_pattern, js_content)
  41.     return [
  42.         VideoInfo(
  43.             id=int(match[0]),
  44.             streamer_username=match[1],
  45.             source_url=match[2],
  46.             brand_id=match[3],
  47.             tags=match[4],
  48.             notes=match[5]
  49.         )
  50.         for match in matches
  51.     ]
  52.  
  53.  
  54. def extract_video_urls(url: str) -> List[VideoInfo]:
  55.     html = fetch_url(url)
  56.     js_urls = extract_js_urls(html, url)
  57.     all_video_info = []
  58.     for js_url in js_urls:
  59.         js_content = fetch_url(js_url)
  60.         video_infos = extract_source_paths(js_content)
  61.         for video_info in video_infos:
  62.             video_info.source_url = urljoin(url, video_info.source_url)
  63.         all_video_info.extend(video_infos)
  64.     return all_video_info
  65.  
  66.  
  67. # run the extraction and print the results
  68. video_info_list = extract_video_urls(BASE_URL)
  69. for video_info in video_info_list:
  70.     print(video_info)
  71.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement