# SCRAPE-AND-DOWNLOAD.pyw
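# Note: the .pyw extension runs this script with pythonw.exe on Windows, so
# the GUI appears without an accompanying console window.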
import tkinter as tk
from tkinter import END
from tkinter import ttk, messagebox, filedialog
import webbrowser
import os
import re  # needed by WebScraperGUI.scrape (heading and file-link patterns)
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import yt_dlp
import subprocess
import time  # used to pace the stop-check loop in threaded_download
from PIL import Image, ImageTk
import io
import threading
import shutil
# ------------------------------
from tkinter import scrolledtext
stop_download_flag = False
#================ADD-IMAGE-ICON=================
import sys

def resource_path(relative_path):
    """ Get the absolute path to the resource, works for PyInstaller. """
    if getattr(sys, '_MEIPASS', False):
        return os.path.join(sys._MEIPASS, relative_path)
    return os.path.join(os.path.abspath("."), relative_path)

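# Under PyInstaller's one-file mode, bundled data files are unpacked into a
# temporary folder whose path is exposed as sys._MEIPASS; when running from
# source, paths resolve relative to the current directory instead.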
# Use this function to load files:
#splash_image = resource_path("splash-1.png")
icon_path = resource_path("D.ico")
#================ADD-IMAGE-ICON=================

# Register browsers with full path
chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
firefox_path = r"C:\Program Files\Mozilla Firefox\firefox.exe"
if os.path.exists(chrome_path):
    webbrowser.register("chrome", None, webbrowser.BackgroundBrowser(chrome_path))
if os.path.exists(firefox_path):
    webbrowser.register("firefox", None, webbrowser.BackgroundBrowser(firefox_path))
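# webbrowser.BackgroundBrowser launches the browser as a detached process;
# registering each one by name lets perform_search() request "chrome" or
# "firefox" explicitly instead of relying on the system default.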

# === Main Window ===
window = tk.Tk()
window.title("NAJEEB SHAH KHAN SCRAPE WEB & Image Search Tool & Media Downloader")
window.geometry("965x700")
#window.configure(bg="#2c3e50")
window.iconbitmap(icon_path)

notebook = ttk.Notebook(window)
tab1 = ttk.Frame(notebook)
tab2 = ttk.Frame(notebook)
notebook.add(tab1, text="Image Search Tool")
notebook.add(tab2, text="Media Downloader")
notebook.pack(expand=True, fill="both")

# ====================
# === Tab 1 Content ===
# ====================
dark_mode_var = tk.BooleanVar()
keyword_var = tk.StringVar()
site_var = tk.StringVar()
extra_format_var = tk.StringVar()
query_preview_var = tk.StringVar()
browser_var = tk.StringVar(value="default")
format_vars = {
    "jpg": tk.BooleanVar(value=True),
    "png": tk.BooleanVar(value=True),
    "gif": tk.BooleanVar(),
    "bmp": tk.BooleanVar(),
    "webp": tk.BooleanVar(),
}

def update_query_preview():
    selected_formats = [f for f, var in format_vars.items() if var.get()]
    custom_format = extra_format_var.get().strip()
    keyword = keyword_var.get().strip()
    site = site_var.get().strip()
    all_formats = selected_formats.copy()
    if custom_format:
        all_formats.append(custom_format)
    filetype_str = ' | '.join(all_formats) if all_formats else "jpg | png"
    query = 'intitle:"index of"'
    if keyword:
        query += f' ({keyword})'
    query += f' ({filetype_str})'
    if site:
        query += f' site:{site}'
    query_preview_var.set(query)

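# update_query_preview builds open-directory queries like this one (given the
# keyword "wallpaper", the default jpg/png formats, and a ".edu" site filter):
#   intitle:"index of" (wallpaper) (jpg | png) site:.edu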
def perform_search():
    query = query_preview_var.get()
    if not query:
        result_text.delete("1.0", tk.END)
        result_text.insert(tk.END, "⚠️ Query is empty.")
        return
    url = f"https://www.google.com/search?q={query.replace(' ', '+')}"
    result_text.delete("1.0", tk.END)
    result_text.insert(tk.END, f"🔍 Google Search URL:\n{url}")
    browser = browser_var.get()
    try:
        if browser == "chrome":
            webbrowser.get("chrome").open(url)
        elif browser == "firefox":
            webbrowser.get("firefox").open(url)
        else:
            webbrowser.open(url)
    except webbrowser.Error:
        result_text.insert(tk.END, f"\n⚠️ Failed to open {browser}, using default browser instead.")
        webbrowser.open(url)

def toggle_dark_mode():
    dark = dark_mode_var.get()
    bg = "#1e1e1e" if dark else "#ffffff"
    fg = "#ffffff" if dark else "#000000"
    widgets = [tab1, format_frame, keyword_label, keyword_entry,
               site_label, site_entry, extra_label, extra_entry,
               preview_label, preview_entry, search_button, dark_mode_check,
               browser_label, result_label, result_text]
    for widget in widgets:
        try:
            widget.config(bg=bg, fg=fg)
        except tk.TclError:
            # ttk widgets (e.g. the tab frames) don't accept bg/fg options
            pass
    keyword_entry.config(insertbackground=fg)
    site_entry.config(insertbackground=fg)
    extra_entry.config(insertbackground=fg)
    result_text.config(insertbackground=fg)

# Tab 1 Layout
tk.Label(tab1, text="Select Image Formats:", bg="#ffffff").pack(anchor="w", padx=10, pady=5)
format_frame = tk.Frame(tab1, bg="#ffffff")
format_frame.pack(anchor="w", padx=20)
for fmt, var in format_vars.items():
    cb = tk.Checkbutton(format_frame, text=fmt, variable=var, bg="#ffffff", command=update_query_preview)
    cb.pack(side="left", padx=5)

extra_label = tk.Label(tab1, text="Type any extra format or word (e.g. tif, raw):", bg="#ffffff")
extra_label.pack(anchor="w", padx=10, pady=5)
extra_entry = tk.Entry(tab1, textvariable=extra_format_var, width=60, bg="#ffffff", fg="#000000")
extra_entry.pack(padx=10)
extra_entry.bind("<KeyRelease>", lambda e: update_query_preview())

keyword_label = tk.Label(tab1, text="Enter Keywords (e.g. wallpaper | backgrounds):", bg="#ffffff")
keyword_label.pack(anchor="w", padx=10, pady=5)
keyword_entry = tk.Entry(tab1, textvariable=keyword_var, width=60, bg="#ffffff", fg="#000000")
keyword_entry.pack(padx=10)
keyword_entry.bind("<KeyRelease>", lambda e: update_query_preview())

site_label = tk.Label(tab1, text="Optional Site Filter (e.g. .edu, example.com):", bg="#ffffff")
site_label.pack(anchor="w", padx=10, pady=5)
site_entry = tk.Entry(tab1, textvariable=site_var, width=60, bg="#ffffff", fg="#000000")
site_entry.pack(padx=10)
site_entry.bind("<KeyRelease>", lambda e: update_query_preview())

preview_label = tk.Label(tab1, text="🔎 Search Query Preview:", bg="#ffffff", font=("Arial", 10, "bold"))
preview_label.pack(anchor="w", padx=10, pady=5)
preview_entry = tk.Entry(tab1, textvariable=query_preview_var, width=80, state="readonly", bg="#eeeeee")
preview_entry.pack(padx=10, pady=5)

browser_label = tk.Label(tab1, text="Select Browser:", bg="#ffffff")
browser_label.pack(anchor="w", padx=10, pady=5)
browser_frame = tk.Frame(tab1, bg="#ffffff")
browser_frame.pack(anchor="w", padx=20)
tk.Radiobutton(browser_frame, text="Default", variable=browser_var, value="default", bg="#ffffff", command=update_query_preview).pack(side="left", padx=10)
tk.Radiobutton(browser_frame, text="Chrome", variable=browser_var, value="chrome", bg="#ffffff", command=update_query_preview).pack(side="left", padx=10)
tk.Radiobutton(browser_frame, text="Firefox", variable=browser_var, value="firefox", bg="#ffffff", command=update_query_preview).pack(side="left", padx=10)

search_button = tk.Button(tab1, text="Search on Google", command=perform_search)
search_button.pack(pady=10)

dark_mode_check = tk.Checkbutton(tab1, text="Dark Mode", variable=dark_mode_var, command=toggle_dark_mode, bg="#ffffff")
dark_mode_check.pack()

result_label = tk.Label(tab1, text="Generated Google Search URL:", bg="#ffffff")
result_label.pack(anchor="w", padx=10, pady=5)
result_text = tk.Text(tab1, height=4, width=80, wrap="word", bg="#f8f8f8")
result_text.pack(padx=10, pady=5)

update_query_preview()

# ====================
# === Tab 2 Content ===
# ====================
media_urls = []
special_sites = ['youtube.com', 'youtu.be', 'facebook.com', 'fb.watch', 'tiktok.com', 'instagram.com']
image_exts = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg', '.ico']
video_exts = ['.mp4', '.webm', '.ogg', '.mov', '.avi', '.mkv', '.flv', '.3gp', '.wmv', '.m3u', '.m3u8']
stop_download_flag = False

def is_special_site(url):
    return any(domain in url for domain in special_sites)

def browse_url_file():
    file_path = filedialog.askopenfilename(title="Open URL File", filetypes=[("Text files", "*.txt")])
    if file_path:
        with open(file_path, 'r') as f:
            for line in f:
                url = line.strip()
                if url and url not in media_urls:
                    media_urls.append(url)
                    result_box.insert(tk.END, url + "\n")

def save_urls_to_file():
    file_path = filedialog.asksaveasfilename(defaultextension=".txt", filetypes=[("Text files", "*.txt")])
    if file_path:
        with open(file_path, 'w') as f:
            f.write(result_box.get("1.0", tk.END).strip())
        messagebox.showinfo("Saved", f"URLs saved to {file_path}")

def scrape_normal_site(url):
    found_urls = set()
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return found_urls
        soup = BeautifulSoup(response.text, 'html.parser')
        for tag in soup.find_all(['img', 'video', 'source', 'a']):
            src = tag.get('src') or tag.get('href')
            if src:
                full_url = urljoin(url, src)
                parsed = urlparse(full_url)
                ext = os.path.splitext(parsed.path)[1].lower()
                if ext in image_exts + video_exts:
                    found_urls.add(full_url)
    except Exception:
        pass
    return found_urls

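# Note: scrape_normal_site only sees the static HTML returned by requests;
# media that a page injects with JavaScript at runtime will not be found.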
def process_url():
    url = url_entry.get().strip()
    if not url:
        messagebox.showwarning("Input Error", "Please enter a valid URL.")
        return
    media_urls.clear()
    result_box.delete("1.0", tk.END)
    try:
        if is_special_site(url):
            ydl_opts = {
                'quiet': True,
                'skip_download': True,
                'force_generic_extractor': False
            }
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(url, download=False)
                if 'entries' in info:
                    for entry in info['entries']:
                        media_urls.append(entry['webpage_url'])
                        result_box.insert(tk.END, entry['webpage_url'] + "\n")
                else:
                    media_urls.append(info['webpage_url'])
                    result_box.insert(tk.END, info['webpage_url'] + "\n")
        else:
            scraped = scrape_normal_site(url)
            media_urls.extend(scraped)
            for media_url in scraped:
                result_box.insert(tk.END, media_url + "\n")
        if not media_urls:
            messagebox.showinfo("Info", "No media URLs found.")
        else:
            messagebox.showinfo("Success", f"{len(media_urls)} media URL(s) found!")
    except Exception as e:
        messagebox.showerror("Error", str(e))

def download_media(url, save_path):
    try:
        if is_special_site(url):
            ytdlp_path = shutil.which("yt-dlp") or r"C:\Windows\yt-dlp.exe"
            command = [
                ytdlp_path,
                "-f", "best",
                "--no-playlist",
                "--extractor-args", "youtube:player_client=web",
                "-o", save_path,
                url
            ]
            result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            if result.returncode != 0:
                raise Exception(result.stderr.strip())
        else:
            response = requests.get(url, stream=True)
            if response.status_code != 200:
                # Surface HTTP errors instead of silently writing nothing
                raise Exception(f"HTTP {response.status_code} while fetching {url}")
            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
    except Exception as e:
        messagebox.showerror("Download Error", f"Failed to download:\n{url}\n{str(e)}")

def download_selected_line():
    try:
        line_index = result_box.index(tk.INSERT).split(".")[0]
        selected_url = result_box.get(f"{line_index}.0", f"{line_index}.end").strip()
        if not selected_url:
            raise Exception("No line selected.")
        folder = filedialog.askdirectory(title="Select Folder to Save File")
        if not folder:
            return
        parsed = urlparse(selected_url)
        filename = os.path.basename(parsed.path)
        if not filename:
            filename = "downloaded_file"
        save_path = os.path.join(folder, filename)
        threading.Thread(target=threaded_download, args=(selected_url, save_path), daemon=True).start()
    except Exception as e:
        messagebox.showerror("Error", str(e))

def download_selected():
    selected_urls = result_box.get("1.0", tk.END).strip().splitlines()
    if not selected_urls:
        messagebox.showwarning("Selection Error", "No URLs to download.")
        return
    selected = filedialog.askdirectory(title="Select Folder to Save Files")
    if not selected:
        return
    for url in selected_urls:
        parsed = urlparse(url)
        filename = os.path.basename(parsed.path)
        if not filename:
            filename = "downloaded_file.mp4"
        save_path = os.path.join(selected, filename)
        download_media(url, save_path)
    messagebox.showinfo("Download Complete", f"Downloaded {len(selected_urls)} media files.")

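# Note: download_selected runs each download synchronously on the Tk main
# loop, so the window stays unresponsive until every file has finished;
# download_selected_line avoids this by handing off to threaded_download.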
def threaded_download(url, save_path):
    global stop_download_flag
    stop_download_flag = False
    try:
        if is_special_site(url):
            ytdlp_path = shutil.which("yt-dlp") or r"C:\Windows\yt-dlp.exe"
            command = [
                ytdlp_path,
                "-f", "mp4",
                "--no-part",  # Saves directly as .mp4
                "--downloader", "ffmpeg",
                "--downloader-args", "ffmpeg_i:-movflags +faststart",
                "-o", save_path,
                url
            ]
            proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            while proc.poll() is None:
                if stop_download_flag:
                    proc.kill()
                    break
                time.sleep(0.2)  # avoid a busy-wait while yt-dlp runs
        else:
            response = requests.get(url, stream=True, timeout=10)
            if response.status_code == 200:
                with open(save_path, 'wb') as f:
                    for chunk in response.iter_content(1024 * 1024):  # 1MB
                        if stop_download_flag:
                            break
                        if chunk:
                            f.write(chunk)
        if stop_download_flag:
            fix_partial_video(save_path)  # Try to repair it
            messagebox.showinfo("Download Stopped", f"Download was stopped by user.\nSaved: {save_path}")
        else:
            messagebox.showinfo("Download Complete", f"Downloaded successfully to:\n{save_path}")
    except Exception as e:
        messagebox.showerror("Download Error", str(e))

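# Caveat: threaded_download shows message boxes from a worker thread, and
# Tkinter is not guaranteed thread-safe; a more robust pattern would marshal
# UI updates back to the main loop, e.g. via window.after(0, callback).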
def stop_download():
    global stop_download_flag
    stop_download_flag = True

def fix_partial_video(input_path):
    try:
        if not os.path.exists(input_path) or not input_path.lower().endswith(".mp4"):
            return
        output_path = input_path.replace(".mp4", "_fixed.mp4")
        ffmpeg_path = shutil.which("ffmpeg") or r"C:\Program Files\ffmpeg\bin\ffmpeg.exe"
        # Try quick remux
        command = [
            ffmpeg_path,
            "-y",
            "-i", input_path,
            "-c", "copy",
            "-movflags", "+faststart",
            output_path
        ]
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        # Fallback to re-encode if remux fails or small file
        if result.returncode != 0 or not os.path.exists(output_path) or os.path.getsize(output_path) < 1024 * 1024:
            print("[INFO] Remux failed or file too small, retrying with re-encode...")
            command = [
                ffmpeg_path,
                "-y",
                "-i", input_path,
                "-preset", "ultrafast",
                output_path
            ]
            subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        # Replace original file if fixed
        if os.path.exists(output_path):
            os.remove(input_path)
            os.rename(output_path, input_path)
    except Exception as e:
        print(f"[FFmpeg Fix Error] {e}")

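# In fix_partial_video, the stream copy ("-c copy") remux only rewrites the
# MP4 container around the existing encoded streams, so it is fast and
# lossless; the re-encode fallback is much slower but can salvage streams
# whose packets were cut off mid-download.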
def scrape_all_links(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        links = []
        for tag in soup.find_all('a', href=True):
            href = tag['href']
            full_url = urljoin(url, href)
            parsed_url = urlparse(full_url)
            if parsed_url.scheme in ['http', 'https']:
                links.append(full_url)
        return links
    except requests.exceptions.RequestException as e:
        messagebox.showerror("Network Error", f"Failed to scrape links: {e}")
        return []
    except Exception as e:
        messagebox.showerror("Error", f"An unexpected error occurred: {e}")
        return []

def scrape_all_button():
    url = url_entry.get().strip()
    if not url:
        messagebox.showwarning("Input Error", "Please enter a valid URL.")
        return
    result_box.delete("1.0", tk.END)
    media_urls.clear()
    all_links = scrape_all_links(url)
    media_urls.extend(all_links)
    for link in all_links:
        result_box.insert(tk.END, link + "\n")
    messagebox.showinfo("Done", f"{len(all_links)} total link(s) scraped.")

def open_in_vlc():
    line_index = result_box.index(tk.INSERT).split(".")[0]
    selected_url = result_box.get(f"{line_index}.0", f"{line_index}.end").strip()
    if not selected_url:
        messagebox.showwarning("No Selection", "Select a valid media URL.")
        return
    # Prefer VLC from PATH, fall back to the default install location
    vlc_path = shutil.which("vlc") or r"C:\Program Files\VideoLAN\VLC\vlc.exe"
    if not os.path.exists(vlc_path):
        messagebox.showerror("VLC Error", "VLC is not installed or not found in PATH.")
        return
    try:
        subprocess.Popen([vlc_path, selected_url])
    except Exception as e:
        messagebox.showerror("VLC Error", f"Could not open VLC: {e}")

def preview_image_popup():
    try:
        line_index = result_box.index(tk.INSERT).split(".")[0]
        selected_url = result_box.get(f"{line_index}.0", f"{line_index}.end").strip()
        if not selected_url.lower().endswith(tuple(image_exts)):
            messagebox.showerror("Preview Error", "Selected link is not an image.")
            return
        response = requests.get(selected_url, timeout=10)
        if response.status_code != 200:
            messagebox.showerror("Preview Error", "Failed to load image.")
            return
        image = Image.open(io.BytesIO(response.content))
        popup = tk.Toplevel(window)
        popup.title("Image Preview")
        popup.geometry("600x600")
        img_resized = image.resize((500, 500), Image.LANCZOS)  # Image.ANTIALIAS was removed in Pillow 10
        img_tk = ImageTk.PhotoImage(img_resized)
        label = tk.Label(popup, image=img_tk)
        label.image = img_tk  # keep a reference so the image isn't garbage-collected
        label.pack()
    except Exception as e:
        messagebox.showerror("Preview Error", str(e))

def load_m3u_file():
    file_path = filedialog.askopenfilename(title="Open M3U File", filetypes=[("M3U/M3U8 Files", "*.m3u *.m3u8")])
    if file_path:
        result_box.delete("1.0", tk.END)
        media_urls.clear()
        with open(file_path, 'r', encoding="utf-8", errors="ignore") as f:
            for line in f:
                url = line.strip()
                if url and url.startswith("http"):
                    media_urls.append(url)
                    result_box.insert(tk.END, url + "\n")
        messagebox.showinfo("Loaded", f"{len(media_urls)} media URLs loaded from playlist.")

def load_online_m3u():
    url = url_entry.get().strip()
    if not url.lower().endswith((".m3u", ".m3u8")):
        messagebox.showwarning("URL Error", "Please enter a valid .m3u or .m3u8 URL.")
        return
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            raise Exception("Unable to fetch playlist.")
        result_box.delete("1.0", tk.END)
        media_urls.clear()
        for line in response.text.splitlines():
            line = line.strip()
            if line and line.startswith("http"):
                media_urls.append(line)
                result_box.insert(tk.END, line + "\n")
        messagebox.showinfo("Online M3U Loaded", f"{len(media_urls)} stream(s) loaded.")
    except Exception as e:
        messagebox.showerror("Error", str(e))

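# An Xtream-Codes playlist URL typically has this shape (hypothetical values):
#   http://<host>:<port>/get.php?username=<user>&password=<pass>&type=m3u_plus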
def scrape_xtream_m3u_url():
    url = url_entry.get().strip()
    if not url or "get.php" not in url:
        messagebox.showwarning("Input Error", "Please enter a valid Xtream M3U URL.")
        return
    try:
        headers = {
            "User-Agent": "VLC/3.0.18 LibVLC/3.0.18"
        }
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code == 404:
            raise Exception("404 Not Found: the playlist URL might be wrong or expired.")
        if response.status_code != 200:
            raise Exception(f"Failed to fetch playlist. Status code: {response.status_code}")
        content = response.text
        if "#EXTM3U" not in content:
            raise Exception("Invalid playlist. No M3U content found.")
        result_box.delete("1.0", tk.END)
        media_urls.clear()
        for line in content.splitlines():
            if line.startswith("http"):
                media_urls.append(line)
                result_box.insert(tk.END, line + "\n")
        if media_urls:
            messagebox.showinfo("Success", f"Scraped {len(media_urls)} stream URLs from Xtream playlist.")
        else:
            messagebox.showwarning("No URLs", "Playlist loaded, but no stream URLs found.")
    except Exception as e:
        messagebox.showerror("Error", str(e))

def search_urls():
    query = search_entry.get().strip().lower()
    if not query:
        return
    result_box.tag_remove("highlight", "1.0", tk.END)
    lines = result_box.get("1.0", tk.END).splitlines()
    for i, line in enumerate(lines, 1):
        if query in line.lower():
            result_box.tag_add("highlight", f"{i}.0", f"{i}.end")
    result_box.tag_config("highlight", background="yellow", foreground="black")

def save_as_m3u():
    """
    Saves the contents of the result box as an M3U/M3U8 playlist file.
    """
    file_path = filedialog.asksaveasfilename(
        defaultextension=".m3u",
        filetypes=[("M3U Playlist", "*.m3u"), ("M3U8 Playlist", "*.m3u8"), ("Text File", "*.txt")]
    )
    if file_path:
        try:
            with open(file_path, 'w', encoding="utf-8") as f:
                # Write content from the result box to the file
                f.write(result_box.get("1.0", tk.END).strip())
            messagebox.showinfo("Saved", f"Playlist saved to:\n{file_path}")
        except Exception as e:
            messagebox.showerror("Save Error", f"Failed to save playlist:\n{str(e)}")

def clear_url_field():
    """
    Clears the URL entry field.
    """
    url_entry.delete(0, tk.END)

def clear_result_box():
    """
    Clears the result box and resets the media URLs list.
    """
    result_box.delete("1.0", tk.END)
    media_urls.clear()

def clear_search():
    """
    Clears the search entry field and removes highlights from the result box.
    """
    search_entry.delete(0, tk.END)
    result_box.tag_remove("highlight", "1.0", tk.END)

def scrape_directory_media(url):
    """
    Scrape media URLs from subdirectories of the given URL.
    :param url: The base URL to start scraping from.
    """
    global media_urls
    result_box.delete("1.0", tk.END)
    media_urls.clear()

    def extract_directories(soup, base_url):
        """
        Extract directory links from the page.
        :param soup: BeautifulSoup object of the page.
        :param base_url: Base URL to resolve relative paths.
        :return: List of directory URLs.
        """
        directories = []
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            if href.endswith("/") and not href.startswith("#"):  # Subdirectory link
                full_href = urljoin(base_url, href)
                if full_href != base_url:  # Avoid infinite loops
                    directories.append(full_href)
        return directories

    def extract_media_urls(soup, base_url):
        """
        Extract media URLs from the page.
        :param soup: BeautifulSoup object of the page.
        :param base_url: Base URL to resolve relative paths.
        :return: Set of media URLs.
        """
        media_links = set()
        for tag in soup.find_all(['img', 'video', 'source', 'a']):
            src = tag.get('src') or tag.get('href')
            if src:
                full_url = urljoin(base_url, src)
                parsed = urlparse(full_url)
                ext = os.path.splitext(parsed.path)[1].lower()
                if ext in image_exts + video_exts:
                    media_links.add(full_url)
        return media_links

    try:
        # Fetch the base URL content
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            messagebox.showerror("Error", f"Failed to fetch {url} (Status Code: {response.status_code})")
            return

        soup = BeautifulSoup(response.text, 'html.parser')

        # Step 1: Extract all subdirectories
        directories = extract_directories(soup, url)

        # Step 2: Scrape media URLs from each subdirectory
        found_media = False
        for directory in directories:
            try:
                dir_response = requests.get(directory, timeout=10)
                if dir_response.status_code == 200:
                    dir_soup = BeautifulSoup(dir_response.text, 'html.parser')
                    media_links = extract_media_urls(dir_soup, directory)
                    if media_links:
                        found_media = True
                        for media_url in media_links:
                            if media_url not in media_urls:
                                media_urls.append(media_url)
                                result_box.insert(tk.END, media_url + "\n")
            except Exception as e:
                print(f"Error scraping directory {directory}: {e}")

        if not found_media:
            messagebox.showinfo("Info", "No media URLs found in subdirectories.")
        else:
            messagebox.showinfo("Success", f"{len(media_urls)} media URL(s) found!")

    except Exception as e:
        messagebox.showerror("Error", str(e))

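# Note: scrape_directory_media only scans the first level of subdirectory
# links; it does not recurse into deeper folders.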
# Tab 2 Layout
tk.Label(tab2, text="Enter URL to Scrape Media:").pack(pady=5)
search_frame = tk.Frame(tab2)
search_frame.pack(pady=5)
search_entry = tk.Entry(search_frame, width=40)
search_entry.pack(side=tk.LEFT, padx=5)
tk.Button(search_frame, text="Search", command=search_urls, bg="lightblue").pack(side=tk.LEFT, padx=5)
url_entry = tk.Entry(search_frame, width=100)
url_entry.pack(pady=5)

frame_buttons = tk.Frame(tab2)
frame_buttons.pack(pady=5)
tk.Button(frame_buttons, text="Scrape Media", command=process_url, bg="lightgreen", width=20).pack(side=tk.LEFT, padx=5)
tk.Button(frame_buttons, text="Browse URL File", command=browse_url_file, bg="lightyellow", width=20).pack(side=tk.LEFT, padx=5)
tk.Button(frame_buttons, text="Download All URLs", command=download_selected, bg="lightblue", width=20).pack(side=tk.LEFT, padx=5)
tk.Button(frame_buttons, text="Download Selected URL", command=download_selected_line, bg="orange", width=20).pack(side=tk.LEFT, padx=5)
tk.Button(frame_buttons, text="Save URLs to File", command=save_urls_to_file, bg="lightgray", width=20).pack(side=tk.LEFT, padx=5)
tk.Button(frame_buttons, text="Stop Download", command=stop_download, bg="red", width=20).pack(side=tk.LEFT, padx=5)

frame_button = tk.Frame(tab2)
frame_button.pack(pady=5)
tk.Button(frame_button, text="Scrape All Links", command=scrape_all_button, bg="#e0c3fc", width=20).pack(side=tk.LEFT, padx=5)
tk.Button(frame_button, text="Open in VLC", command=open_in_vlc, bg="#c1f0c1", width=20).pack(side=tk.LEFT, padx=5)
tk.Button(frame_button, text="Preview Image", command=preview_image_popup, bg="#f0c1c1", width=20).pack(side=tk.LEFT, padx=5)
tk.Button(frame_button, text="Load Online M3U", command=load_online_m3u, bg="#c9f2ff", width=20).pack(side=tk.LEFT, padx=5)
tk.Button(frame_button, text="Scrape Xtream M3U", command=scrape_xtream_m3u_url, bg="#fff0b3", width=20).pack(side=tk.LEFT, padx=5)
tk.Button(frame_button, text="Load M3U File", command=load_m3u_file, bg="#d0f0fd", width=20).pack(side=tk.LEFT, padx=5)

result_frame = tk.Frame(tab2)
result_frame.pack(pady=5)
scrollbar = tk.Scrollbar(result_frame)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
result_box = tk.Text(result_frame, height=30, width=124, yscrollcommand=scrollbar.set)
result_box.pack(side=tk.LEFT, fill=tk.BOTH)
scrollbar.config(command=result_box.yview)

frame_clear = tk.Frame(tab2)
frame_clear.pack(pady=5)

tk.Button(frame_clear, text="Save Result", command=save_as_m3u, bg="#a7ffcc", width=20).pack(side=tk.LEFT, padx=5)
tk.Button(frame_clear, text="Clear Search", command=clear_search, bg="lightgray").pack(side=tk.LEFT, padx=2)
tk.Button(frame_clear, text="Clear URL Field", command=clear_url_field, bg="#ffd580", width=20).pack(side=tk.LEFT, padx=5)
tk.Button(frame_clear, text="Clear Result Field", command=clear_result_box, bg="#ffb3b3", width=20).pack(side=tk.LEFT, padx=5)
# Add a button for scraping subdirectories
tk.Button(frame_clear, text="Scrape Subdirectories", command=lambda: scrape_directory_media(url_entry.get().strip()), bg="#ffcccb", width=20).pack(side=tk.LEFT, padx=5)

# ====================
# === Tab 3 Content ===
# ====================
tab3 = ttk.Frame(notebook)
notebook.add(tab3, text="Web Scraper")
notebook.pack(expand=True, fill="both")

class WebScraperGUI:
    def __init__(self, root):
        self.root = root

        # Configure the style for ttk.Frame
        self.style = ttk.Style()
        self.style.configure("Background.TFrame", background="#336699")  # Define a custom style
        self.root.config(style="Background.TFrame")  # Apply the style to the root frame

        # URL Entry
        self.url_label = ttk.Label(root, text="Enter URL:")
        self.url_label.grid(column=0, row=0, sticky=tk.W, padx=10, pady=5)
        self.url_entry = ttk.Entry(root, width=120)
        self.url_entry.grid(column=1, row=0, columnspan=4, sticky=tk.W, padx=10, pady=5)

        # Options
        self.options_label = ttk.Label(root, text="Select Options:")
        self.options_label.grid(column=0, row=1, sticky=tk.W, padx=10, pady=5)

        # Checkboxes
        self.check_var_html = tk.BooleanVar()
        self.check_var_heading = tk.BooleanVar()
        self.check_var_paragraph = tk.BooleanVar()
        self.check_var_css = tk.BooleanVar()
        self.check_var_table = tk.BooleanVar()
        self.check_var_links = tk.BooleanVar()
        self.check_var_files = tk.BooleanVar()

        self.html_check = ttk.Checkbutton(root, text="Full HTML", variable=self.check_var_html)
        self.html_check.grid(column=1, row=1, sticky=tk.W, padx=10, pady=5)

        self.heading_check = ttk.Checkbutton(root, text="Headings", variable=self.check_var_heading)
        self.heading_check.grid(column=2, row=1, sticky=tk.W, padx=10, pady=5)

        self.paragraph_check = ttk.Checkbutton(root, text="Paragraphs", variable=self.check_var_paragraph)
        self.paragraph_check.grid(column=3, row=1, sticky=tk.W, padx=10, pady=5)

        self.css_check = ttk.Checkbutton(root, text="CSS", variable=self.check_var_css)
        self.css_check.grid(column=4, row=1, sticky=tk.W, padx=10, pady=5)

        self.table_check = ttk.Checkbutton(root, text="Tables", variable=self.check_var_table)
        self.table_check.grid(column=1, row=2, sticky=tk.W, padx=10, pady=5)

        self.links_check = ttk.Checkbutton(root, text="Links", variable=self.check_var_links)
        self.links_check.grid(column=2, row=2, sticky=tk.W, padx=10, pady=5)

        self.files_check = ttk.Checkbutton(root, text="Files", variable=self.check_var_files)
        self.files_check.grid(column=3, row=2, sticky=tk.W, padx=10, pady=5)

        # Result Text Field
        self.result_label = ttk.Label(root, text="Scraped Content of Websites:")
        self.result_label.grid(column=0, row=4, sticky=tk.W, padx=10, pady=5)

        #self.result_text = scrolledtext.ScrolledText(root, width=110, height=33, wrap=tk.WORD)
        self.result_text = scrolledtext.ScrolledText(root, width=116, height=33, wrap=tk.WORD, bg="#f0f0f0")
        self.result_text.grid(column=0, row=5, columnspan=5)

        # Scrape Button
        self.scrape_button = ttk.Button(root, text="SCRAPE", command=self.scrape)
        self.scrape_button.grid(column=4, row=4, columnspan=8, pady=10)

        # Save Result Button
        self.save_result_button = ttk.Button(root, text="Save Result", command=self.save_result, style='Red.TButton')
        self.save_result_button.grid(column=2, row=4, columnspan=8, pady=10)

        # Define style for the "Save Result" button
        self.style.configure('Red.TButton', foreground='red')

    def scrape(self):
        url = self.url_entry.get()
        if not url:
            messagebox.showwarning("Input Error", "Please enter a valid URL.")
            return

        options = {
            'html': self.check_var_html.get(),
            'heading': self.check_var_heading.get(),
            'paragraph': self.check_var_paragraph.get(),
            'css': self.check_var_css.get(),
            'table': self.check_var_table.get(),
            'links': self.check_var_links.get(),
            'files': self.check_var_files.get()
        }

        try:
            response = requests.get(url, timeout=10)  # timeout so a dead host can't hang the GUI
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            result = ""
            if options['html']:
                result += str(soup) + '\n\n'

            if options['heading']:
                headings = soup.find_all(re.compile('^h[1-6]$'))
                result += "Headings:\n"
                for heading in headings:
                    result += heading.text.strip() + '\n'
                result += '\n'

            if options['paragraph']:
                paragraphs = soup.find_all('p')
                result += "Paragraphs:\n"
                for paragraph in paragraphs:
                    result += paragraph.text.strip() + '\n'
                result += '\n'

            if options['css']:
                css_links = [link['href'] for link in soup.find_all('link', rel='stylesheet')]
                result += "CSS Links:\n"
                for css_link in css_links:
                    full_url = urljoin(url, css_link)
                    result += full_url + '\n'
                result += '\n'

            if options['table']:
                tables = soup.find_all('table')
                result += "Tables:\n"
                for table in tables:
                    result += str(table) + '\n'
                result += '\n'

            if options['links']:
                links = soup.find_all('a', href=True)
                result += "Links:\n"
                for link in links:
                    if link['href'].startswith('http'):
                        result += f"Text: {link.text.strip()}, URL: {link['href']}\n"
                    else:
                        full_url = urljoin(url, link['href'])
                        result += f"Text: {link.text.strip()}, URL: {full_url}\n"
                result += '\n'

            if options['files']:
                file_links = [link['href'] for link in soup.find_all('a', href=True) if re.search(r'\.[^.]+$', link['href'])]
                result += "File Links:\n"
                for file_link in file_links:
                    full_url = urljoin(url, file_link)
                    result += full_url + '\n'
                result += '\n'

            self.result_text.delete(1.0, tk.END)
            self.result_text.insert(tk.END, result)

        except requests.exceptions.RequestException as e:
            messagebox.showerror("Network Error", f"Failed to fetch URL: {e}")
        except Exception as e:
            messagebox.showerror("Error", f"An unexpected error occurred: {e}")

    def save_result(self):
        result_text = self.result_text.get(1.0, tk.END)
        if not result_text.strip():
            messagebox.showwarning("Empty Result", "No content to save.")
            return

        file_path = filedialog.asksaveasfilename(defaultextension=".txt", filetypes=[("Text files", "*.txt")])
        if file_path:
            try:
                with open(file_path, "w", encoding="utf-8") as file:
                    file.write(result_text)
                messagebox.showinfo("Success", f"Result saved to {file_path}")
            except Exception as e:
                messagebox.showerror("Save Error", f"Failed to save file: {e}")


# Initialize WebScraperGUI in Tab 3
web_scraper_gui = WebScraperGUI(tab3)
# Run
window.mainloop()