- import tkinter as tk
- from tkinter import END
- from tkinter import ttk, messagebox, filedialog
- import webbrowser
- import os
- import requests
- from bs4 import BeautifulSoup
- from urllib.parse import urljoin, urlparse
- import yt_dlp
- import subprocess
- import re    # regex matching for headings/file links in the Web Scraper tab
- import time  # paces the stop-download polling loop
- from PIL import Image, ImageTk
- import io
- import threading
- import shutil
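- # Third-party packages used below (assumption: installed with pip):
- #   pip install requests beautifulsoup4 yt-dlp Pillow
- # tkinter, webbrowser, subprocess, shutil, threading and io ship with CPython.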
- # ------------------------------
- from tkinter import scrolledtext
- stop_download_flag = False
- #================ADD-IMAGE-ICON=================
- import sys
- def resource_path(relative_path):
- """ Get the absolute path to the resource, works for PyInstaller. """
- if getattr(sys, '_MEIPASS', False):
- return os.path.join(sys._MEIPASS, relative_path)
- return os.path.join(os.path.abspath("."), relative_path)
- # Use this function to load files:
- #splash_image = resource_path("splash-1.png")
- icon_path = resource_path("D.ico")
- #================ADD-IMAGE-ICON=================
- # Register browsers with full path
- chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
- firefox_path = r"C:\Program Files\Mozilla Firefox\firefox.exe"
- if os.path.exists(chrome_path):
- webbrowser.register("chrome", None, webbrowser.BackgroundBrowser(chrome_path))
- if os.path.exists(firefox_path):
- webbrowser.register("firefox", None, webbrowser.BackgroundBrowser(firefox_path))
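- # Only names registered here can be passed to webbrowser.get() later. If Chrome
- # is a per-user install, the path differs from the machine-wide one; a possible
- # adjustment (sketch, not wired in):
- # chrome_path = os.path.expandvars(r"%LOCALAPPDATA%\Google\Chrome\Application\chrome.exe")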
- # === Main Window ===
- window = tk.Tk()
- window.title("NAJEEB SHAH KHAN SCRAPE WEB & Image Search Tool & Media Downloader")
- window.geometry("965x700")
- #window.configure(bg="#2c3e50")
- window.iconbitmap(icon_path)
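- # Note: iconbitmap() raises a TclError if D.ico is missing from the bundle.
- # A guarded variant (sketch) keeps the window usable without the icon:
- # if os.path.exists(icon_path):
- #     window.iconbitmap(icon_path)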
- notebook = ttk.Notebook(window)
- tab1 = ttk.Frame(notebook)
- tab2 = ttk.Frame(notebook)
- notebook.add(tab1, text="Image Search Tool")
- notebook.add(tab2, text="Media Downloader")
- notebook.pack(expand=True, fill="both")
- # ====================
- # === Tab 1 Content ===
- # ====================
- dark_mode_var = tk.BooleanVar()
- keyword_var = tk.StringVar()
- site_var = tk.StringVar()
- extra_format_var = tk.StringVar()
- query_preview_var = tk.StringVar()
- browser_var = tk.StringVar(value="default")
- format_vars = {
- "jpg": tk.BooleanVar(value=True),
- "png": tk.BooleanVar(value=True),
- "gif": tk.BooleanVar(),
- "bmp": tk.BooleanVar(),
- "webp": tk.BooleanVar(),
- }
- def update_query_preview():
- selected_formats = [f for f, var in format_vars.items() if var.get()]
- custom_format = extra_format_var.get().strip()
- keyword = keyword_var.get().strip()
- site = site_var.get().strip()
- all_formats = selected_formats.copy()
- if custom_format:
- all_formats.append(custom_format)
- filetype_str = ' | '.join(all_formats) if all_formats else "jpg | png"
- query = 'intitle:"index of"'
- if keyword:
- query += f' ({keyword})'
- query += f' ({filetype_str})'
- if site:
- query += f' site:{site}'
- query_preview_var.set(query)
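- # Example of what update_query_preview() builds (illustrative values):
- # keyword "wallpaper | backgrounds", formats jpg+png, site ".edu" produce:
- #   intitle:"index of" (wallpaper | backgrounds) (jpg | png) site:.edu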
- def perform_search():
- query = query_preview_var.get()
- if not query:
- result_text.delete("1.0", tk.END)
- result_text.insert(tk.END, "⚠️ Query is empty.")
- return
- url = f"https://www.google.com/search?q={query.replace(' ', '+')}"
- result_text.delete("1.0", tk.END)
- result_text.insert(tk.END, f"🔍 Google Search URL:\n{url}")
- browser = browser_var.get()
- try:
- if browser == "chrome":
- webbrowser.get("chrome").open(url)
- elif browser == "firefox":
- webbrowser.get("firefox").open(url)
- else:
- webbrowser.open(url)
- except webbrowser.Error:
- result_text.insert(tk.END, f"\n⚠️ Failed to open {browser}, using default browser instead.")
- webbrowser.open(url)
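- # The query string above only swaps spaces for '+'; quotes and colons are sent
- # as-is. A stricter variant (sketch, not wired in) would percent-encode it:
- # from urllib.parse import quote_plus
- # url = "https://www.google.com/search?q=" + quote_plus(query)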
- def toggle_dark_mode():
- dark = dark_mode_var.get()
- bg = "#1e1e1e" if dark else "#ffffff"
- fg = "#ffffff" if dark else "#000000"
- widgets = [tab1, format_frame, keyword_label, keyword_entry,
- site_label, site_entry, extra_label, extra_entry,
- preview_label, preview_entry, search_button, dark_mode_check,
- browser_label, result_label, result_text]
- for widget in widgets:
- try:
- widget.config(bg=bg, fg=fg)
- except tk.TclError:
- pass  # ttk widgets (notebook tabs, frames) do not accept bg/fg options
- keyword_entry.config(insertbackground=fg)
- site_entry.config(insertbackground=fg)
- extra_entry.config(insertbackground=fg)
- result_text.config(insertbackground=fg)
- # Tab 1 Layout
- tk.Label(tab1, text="Select Image Formats:", bg="#ffffff").pack(anchor="w", padx=10, pady=5)
- format_frame = tk.Frame(tab1, bg="#ffffff")
- format_frame.pack(anchor="w", padx=20)
- for fmt, var in format_vars.items():
- cb = tk.Checkbutton(format_frame, text=fmt, variable=var, bg="#ffffff", command=update_query_preview)
- cb.pack(side="left", padx=5)
- extra_label = tk.Label(tab1, text="Type any extra format or word (e.g. tif, raw):", bg="#ffffff")
- extra_label.pack(anchor="w", padx=10, pady=5)
- extra_entry = tk.Entry(tab1, textvariable=extra_format_var, width=60, bg="#ffffff", fg="#000000")
- extra_entry.pack(padx=10)
- extra_entry.bind("<KeyRelease>", lambda e: update_query_preview())
- keyword_label = tk.Label(tab1, text="Enter Keywords (e.g. wallpaper | backgrounds):", bg="#ffffff")
- keyword_label.pack(anchor="w", padx=10, pady=5)
- keyword_entry = tk.Entry(tab1, textvariable=keyword_var, width=60, bg="#ffffff", fg="#000000")
- keyword_entry.pack(padx=10)
- keyword_entry.bind("<KeyRelease>", lambda e: update_query_preview())
- site_label = tk.Label(tab1, text="Optional Site Filter (e.g. .edu, example.com):", bg="#ffffff")
- site_label.pack(anchor="w", padx=10, pady=5)
- site_entry = tk.Entry(tab1, textvariable=site_var, width=60, bg="#ffffff", fg="#000000")
- site_entry.pack(padx=10)
- site_entry.bind("<KeyRelease>", lambda e: update_query_preview())
- preview_label = tk.Label(tab1, text="🔎 Search Query Preview:", bg="#ffffff", font=("Arial", 10, "bold"))
- preview_label.pack(anchor="w", padx=10, pady=5)
- preview_entry = tk.Entry(tab1, textvariable=query_preview_var, width=80, state="readonly", bg="#eeeeee")
- preview_entry.pack(padx=10, pady=5)
- browser_label = tk.Label(tab1, text="Select Browser:", bg="#ffffff")
- browser_label.pack(anchor="w", padx=10, pady=5)
- browser_frame = tk.Frame(tab1, bg="#ffffff")
- browser_frame.pack(anchor="w", padx=20)
- tk.Radiobutton(browser_frame, text="Default", variable=browser_var, value="default", bg="#ffffff", command=update_query_preview).pack(side="left", padx=10)
- tk.Radiobutton(browser_frame, text="Chrome", variable=browser_var, value="chrome", bg="#ffffff", command=update_query_preview).pack(side="left", padx=10)
- tk.Radiobutton(browser_frame, text="Firefox", variable=browser_var, value="firefox", bg="#ffffff", command=update_query_preview).pack(side="left", padx=10)
- search_button = tk.Button(tab1, text="Search on Google", command=perform_search)
- search_button.pack(pady=10)
- dark_mode_check = tk.Checkbutton(tab1, text="Dark Mode", variable=dark_mode_var, command=toggle_dark_mode, bg="#ffffff")
- dark_mode_check.pack()
- result_label = tk.Label(tab1, text="Generated Google Search URL:", bg="#ffffff")
- result_label.pack(anchor="w", padx=10, pady=5)
- result_text = tk.Text(tab1, height=4, width=80, wrap="word", bg="#f8f8f8")
- result_text.pack(padx=10, pady=5)
- update_query_preview()
- # ====================
- # === Tab 2 Content ===
- # ====================
- media_urls = []
- special_sites = ['youtube.com', 'youtu.be', 'facebook.com', 'fb.watch', 'tiktok.com', 'instagram.com']
- image_exts = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg', '.ico']
- video_exts = ['.mp4', '.webm', '.ogg', '.mov', '.avi', '.mkv', '.flv', '.3gp', '.wmv', '.m3u', '.m3u8']
- stop_download_flag = False
- def is_special_site(url):
- return any(domain in url for domain in special_sites)
- def browse_url_file():
- file_path = filedialog.askopenfilename(title="Open URL File", filetypes=[("Text files", "*.txt")])
- if file_path:
- with open(file_path, 'r') as f:
- for line in f:
- url = line.strip()
- if url and url not in media_urls:
- media_urls.append(url)
- result_box.insert(tk.END, url + "\n")
- def save_urls_to_file():
- file_path = filedialog.asksaveasfilename(defaultextension=".txt", filetypes=[("Text files", "*.txt")])
- if file_path:
- with open(file_path, 'w') as f:
- f.write(result_box.get("1.0", tk.END).strip())
- messagebox.showinfo("Saved", f"URLs saved to {file_path}")
- def scrape_normal_site(url):
- found_urls = set()
- try:
- response = requests.get(url, timeout=10)
- if response.status_code != 200:
- return found_urls
- soup = BeautifulSoup(response.text, 'html.parser')
- for tag in soup.find_all(['img', 'video', 'source', 'a']):
- src = tag.get('src') or tag.get('href')
- if src:
- full_url = urljoin(url, src)
- parsed = urlparse(full_url)
- ext = os.path.splitext(parsed.path)[1].lower()
- if ext in image_exts + video_exts:
- found_urls.add(full_url)
- except Exception:
- pass
- return found_urls
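- # Usage sketch: scrape_normal_site("https://example.com/gallery/") returns a
- # set of absolute image/video URLs. Some hosts reject the default requests
- # User-Agent; sending a browser-like header is a common workaround (assumption):
- # response = requests.get(url, timeout=10,
- #                         headers={"User-Agent": "Mozilla/5.0"})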
- def process_url():
- url = url_entry.get().strip()
- if not url:
- messagebox.showwarning("Input Error", "Please enter a valid URL.")
- return
- media_urls.clear()
- result_box.delete("1.0", tk.END)
- try:
- if is_special_site(url):
- ydl_opts = {
- 'quiet': True,
- 'skip_download': True,
- 'force_generic_extractor': False
- }
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
- info = ydl.extract_info(url, download=False)
- if 'entries' in info:
- for entry in info['entries']:
- media_urls.append(entry['webpage_url'])
- result_box.insert(tk.END, entry['webpage_url'] + "\n")
- else:
- media_urls.append(info['webpage_url'])
- result_box.insert(tk.END, info['webpage_url'] + "\n")
- else:
- scraped = scrape_normal_site(url)
- media_urls.extend(scraped)
- for media_url in scraped:
- result_box.insert(tk.END, media_url + "\n")
- if not media_urls:
- messagebox.showinfo("Info", "No media URLs found.")
- else:
- messagebox.showinfo("Success", f"{len(media_urls)} media URL(s) found!")
- except Exception as e:
- messagebox.showerror("Error", str(e))
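- # Note: for long playlists yt-dlp can skip probing each entry with
- # "extract_flat"; flat entries expose "url" rather than "webpage_url", so the
- # loop above would need a small change (sketch, not wired in):
- # ydl_opts = {"quiet": True, "skip_download": True, "extract_flat": "in_playlist"}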
- def download_media(url, save_path):
- try:
- if is_special_site(url):
- ytdlp_path = shutil.which("yt-dlp") or r"C:\Windows\yt-dlp.exe"
- command = [
- ytdlp_path,
- "-f", "best",
- "--no-playlist",
- "--extractor-args", "youtube:player_client=web",
- "-o", save_path,
- url
- ]
- result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
- if result.returncode != 0:
- raise Exception(result.stderr.strip())
- else:
- response = requests.get(url, stream=True)
- if response.status_code == 200:
- with open(save_path, 'wb') as f:
- for chunk in response.iter_content(1024):
- f.write(chunk)
- except Exception as e:
- messagebox.showerror("Download Error", f"Failed to download:\n{url}\n{str(e)}")
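- # Usage sketch (hypothetical paths): direct files go through requests,
- # recognised video sites go through the yt-dlp executable.
- # download_media("https://example.com/photo.jpg", r"C:\Downloads\photo.jpg")
- # download_media("https://www.youtube.com/watch?v=XXXX", r"C:\Downloads\clip.mp4")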
- def download_selected_line():
- try:
- line_index = result_box.index(tk.INSERT).split(".")[0]
- selected_url = result_box.get(f"{line_index}.0", f"{line_index}.end").strip()
- if not selected_url:
- raise Exception("No line selected.")
- folder = filedialog.askdirectory(title="Select Folder to Save File")
- if not folder:
- return
- parsed = urlparse(selected_url)
- filename = os.path.basename(parsed.path)
- if not filename:
- filename = "downloaded_file"
- save_path = os.path.join(folder, filename)
- threading.Thread(target=threaded_download, args=(selected_url, save_path), daemon=True).start()
- except Exception as e:
- messagebox.showerror("Error", str(e))
- def download_selected():
- selected_urls = result_box.get("1.0", tk.END).strip().splitlines()
- if not selected_urls:
- messagebox.showwarning("Selection Error", "No URLs to download.")
- return
- selected = filedialog.askdirectory(title="Select Folder to Save Files")
- if not selected:
- return
- for url in selected_urls:
- parsed = urlparse(url)
- filename = os.path.basename(parsed.path)
- if not filename:
- filename = "downloaded_file.mp4"
- save_path = os.path.join(selected, filename)
- download_media(url, save_path)
- messagebox.showinfo("Download Complete", f"Downloaded {len(selected_urls)} media files.")
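- # download_selected() runs every download on the Tk main thread, so the window
- # freezes for large batches. A non-blocking variant (sketch, hypothetical helper
- # name) mirrors the single-file path and pushes the loop onto a worker thread:
- # def batch_worker(urls, folder):
- #     for u in urls:
- #         name = os.path.basename(urlparse(u).path) or "downloaded_file.mp4"
- #         download_media(u, os.path.join(folder, name))
- # threading.Thread(target=batch_worker, args=(selected_urls, selected), daemon=True).start()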
- def threaded_download(url, save_path):
- global stop_download_flag
- stop_download_flag = False
- try:
- if is_special_site(url):
- ytdlp_path = shutil.which("yt-dlp") or r"C:\Windows\yt-dlp.exe"
- command = [
- ytdlp_path,
- "-f", "mp4",
- "--no-part", # Saves directly as .mp4
- "--downloader", "ffmpeg",
- "--downloader-args", "ffmpeg_i:-movflags +faststart",
- "-o", save_path,
- url
- ]
- proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
- while proc.poll() is None:
- if stop_download_flag:
- proc.kill()
- break
- time.sleep(0.2)  # check the stop flag a few times a second instead of busy-waiting
- else:
- response = requests.get(url, stream=True, timeout=10)
- if response.status_code == 200:
- with open(save_path, 'wb') as f:
- for chunk in response.iter_content(1024 * 1024): # 1MB
- if stop_download_flag:
- break
- if chunk:
- f.write(chunk)
- if stop_download_flag:
- fix_partial_video(save_path) # Try to repair it
- messagebox.showinfo("Download Stopped", f"Download was stopped by user.\nSaved: {save_path}")
- else:
- messagebox.showinfo("Download Complete", f"Downloaded successfully to:\n{save_path}")
- except Exception as e:
- messagebox.showerror("Download Error", str(e))
- def stop_download():
- global stop_download_flag
- stop_download_flag = True
- def fix_partial_video(input_path):
- try:
- if not os.path.exists(input_path) or not input_path.lower().endswith(".mp4"):
- return
- output_path = input_path.replace(".mp4", "_fixed.mp4")
- ffmpeg_path = shutil.which("ffmpeg") or r"C:\Program Files\ffmpeg\bin\ffmpeg.exe"
- # Try quick remux
- command = [
- ffmpeg_path,
- "-y",
- "-i", input_path,
- "-c", "copy",
- "-movflags", "+faststart",
- output_path
- ]
- result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
- # Fallback to re-encode if remux fails or small file
- if result.returncode != 0 or not os.path.exists(output_path) or os.path.getsize(output_path) < 1024 * 1024:
- print("[INFO] Remux failed or file too small, retrying with re-encode...")
- command = [
- ffmpeg_path,
- "-y",
- "-i", input_path,
- "-preset", "ultrafast",
- output_path
- ]
- subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
- # Replace original file if fixed
- if os.path.exists(output_path):
- os.remove(input_path)
- os.rename(output_path, input_path)
- except Exception as e:
- print(f"[FFmpeg Fix Error] {e}")
- def scrape_all_links(url):
- try:
- response = requests.get(url, timeout=10)
- response.raise_for_status()
- soup = BeautifulSoup(response.text, 'html.parser')
- links = []
- for tag in soup.find_all('a', href=True):
- href = tag['href']
- full_url = urljoin(url, href)
- parsed_url = urlparse(full_url)
- if parsed_url.scheme in ['http', 'https']:
- links.append(full_url)
- return links
- except requests.exceptions.RequestException as e:
- messagebox.showerror("Network Error", f"Failed to scrape links: {e}")
- return []
- except Exception as e:
- messagebox.showerror("Error", f"An unexpected error occurred: {e}")
- return []
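- # Usage sketch: keep only links that stay on the same host as the scraped page
- # (host filtering is an assumption, not part of the tool above):
- # links = scrape_all_links("https://example.com/")
- # same_host = [l for l in links if urlparse(l).netloc == "example.com"]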
- def scrape_all_button():
- url = url_entry.get().strip()
- if not url:
- messagebox.showwarning("Input Error", "Please enter a valid URL.")
- return
- result_box.delete("1.0", tk.END)
- media_urls.clear()
- all_links = scrape_all_links(url)
- media_urls.extend(all_links)
- for link in all_links:
- result_box.insert(tk.END, link + "\n")
- messagebox.showinfo("Done", f"{len(all_links)} total link(s) scraped.")
- def open_in_vlc():
- line_index = result_box.index(tk.INSERT).split(".")[0]
- selected_url = result_box.get(f"{line_index}.0", f"{line_index}.end").strip()
- if not selected_url:
- messagebox.showwarning("No Selection", "Select a valid media URL.")
- return
- vlc_path = shutil.which("vlc") or r"C:\Program Files\VideoLAN\VLC\vlc.exe"
- if not os.path.exists(vlc_path):
- messagebox.showerror("VLC Error", "VLC is not installed or not found in PATH.")
- return
- try:
- subprocess.Popen([vlc_path, selected_url])
- except Exception as e:
- messagebox.showerror("VLC Error", f"Could not open VLC: {e}")
- def preview_image_popup():
- try:
- line_index = result_box.index(tk.INSERT).split(".")[0]
- selected_url = result_box.get(f"{line_index}.0", f"{line_index}.end").strip()
- if not selected_url.lower().endswith(tuple(image_exts)):
- messagebox.showerror("Preview Error", "Selected link is not an image.")
- return
- response = requests.get(selected_url, timeout=10)
- if response.status_code != 200:
- messagebox.showerror("Preview Error", "Failed to load image.")
- return
- image = Image.open(io.BytesIO(response.content))
- popup = tk.Toplevel(window)
- popup.title("Image Preview")
- popup.geometry("600x600")
- img_resized = image.resize((500, 500), Image.LANCZOS)  # Image.ANTIALIAS was removed in Pillow 10
- img_tk = ImageTk.PhotoImage(img_resized)
- label = tk.Label(popup, image=img_tk)
- label.image = img_tk
- label.pack()
- except Exception as e:
- messagebox.showerror("Preview Error", str(e))
- def load_m3u_file():
- file_path = filedialog.askopenfilename(title="Open M3U File", filetypes=[("M3U/M3U8 Files", "*.m3u *.m3u8")])
- if file_path:
- result_box.delete("1.0", tk.END)
- media_urls.clear()
- with open(file_path, 'r', encoding="utf-8", errors="ignore") as f:
- for line in f:
- url = line.strip()
- if url and url.startswith("http"):
- media_urls.append(url)
- result_box.insert(tk.END, url + "\n")
- messagebox.showinfo("Loaded", f"{len(media_urls)} media URLs loaded from playlist.")
- def load_online_m3u():
- url = url_entry.get().strip()
- if not url.lower().endswith((".m3u", ".m3u8")):
- messagebox.showwarning("URL Error", "Please enter a valid .m3u or .m3u8 URL.")
- return
- try:
- response = requests.get(url, timeout=10)
- if response.status_code != 200:
- raise Exception("Unable to fetch playlist.")
- result_box.delete("1.0", tk.END)
- media_urls.clear()
- for line in response.text.splitlines():
- line = line.strip()
- if line and line.startswith("http"):
- media_urls.append(line)
- result_box.insert(tk.END, line + "\n")
- messagebox.showinfo("Online M3U Loaded", f"{len(media_urls)} stream(s) loaded.")
- except Exception as e:
- messagebox.showerror("Error", str(e))
- def scrape_xtream_m3u_url():
- url = url_entry.get().strip()
- if not url or "get.php" not in url:
- messagebox.showwarning("Input Error", "Please enter a valid Xtream M3U URL.")
- return
- try:
- headers = {
- "User-Agent": "VLC/3.0.18 LibVLC/3.0.18"
- }
- response = requests.get(url, headers=headers, timeout=15)
- if response.status_code == 404:
- raise Exception("404 Not Found — the playlist URL might be wrong or expired.")
- if response.status_code != 200:
- raise Exception(f"Failed to fetch playlist. Status code: {response.status_code}")
- content = response.text
- if "#EXTM3U" not in content:
- raise Exception("Invalid playlist. No M3U content found.")
- result_box.delete("1.0", tk.END)
- media_urls.clear()
- for line in content.splitlines():
- if line.startswith("http"):
- media_urls.append(line)
- result_box.insert(tk.END, line + "\n")
- if media_urls:
- messagebox.showinfo("Success", f"Scraped {len(media_urls)} stream URLs from Xtream playlist.")
- else:
- messagebox.showwarning("No URLs", "Playlist loaded, but no stream URLs found.")
- except Exception as e:
- messagebox.showerror("Error", str(e))
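- # Xtream "get.php" playlist URLs typically look like the line below; the exact
- # parameters depend on the provider (illustrative placeholder values only):
- #   http://host:port/get.php?username=USER&password=PASS&type=m3u_plus&output=ts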
- def search_urls():
- query = search_entry.get().strip().lower()
- if not query:
- return
- result_box.tag_remove("highlight", "1.0", tk.END)
- lines = result_box.get("1.0", tk.END).splitlines()
- for i, line in enumerate(lines, 1):
- if query in line.lower():
- result_box.tag_add("highlight", f"{i}.0", f"{i}.end")
- result_box.tag_config("highlight", background="yellow", foreground="black")
- def save_as_m3u():
- """
- Saves the contents of the result box as an M3U/M3U8 playlist file.
- """
- file_path = filedialog.asksaveasfilename(
- defaultextension=".m3u",
- filetypes=[("M3U Playlist", "*.m3u"), ("M3U8 Playlist", "*.m3u8"), ("Text File", "*.txt")]
- )
- if file_path:
- try:
- with open(file_path, 'w', encoding="utf-8") as f:
- # Write content from the result box to the file
- f.write(result_box.get("1.0", tk.END).strip())
- messagebox.showinfo("Saved", f"Playlist saved to:\n{file_path}")
- except Exception as e:
- messagebox.showerror("Save Error", f"Failed to save playlist:\n{str(e)}")
- def clear_url_field():
- """
- Clears the URL entry field.
- """
- url_entry.delete(0, tk.END)
- def clear_result_box():
- """
- Clears the result box and resets the media URLs list.
- """
- result_box.delete("1.0", tk.END)
- media_urls.clear()
- def clear_search():
- """
- Clears the search entry field and removes highlights from the result box.
- """
- search_entry.delete(0, tk.END)
- result_box.tag_remove("highlight", "1.0", tk.END)
- def scrape_directory_media(url):
- """
- Scrape media URLs from subdirectories of the given URL.
- :param url: The base URL to start scraping from.
- """
- global media_urls
- result_box.delete("1.0", tk.END)
- media_urls.clear()
- def extract_directories(soup, base_url):
- """
- Extract directory links from the page.
- :param soup: BeautifulSoup object of the page.
- :param base_url: Base URL to resolve relative paths.
- :return: List of directory URLs.
- """
- directories = []
- for a_tag in soup.find_all('a', href=True):
- href = a_tag['href']
- if href.endswith("/") and not href.startswith("#"): # Subdirectory link
- full_href = urljoin(base_url, href)
- if full_href != base_url: # Avoid infinite loops
- directories.append(full_href)
- return directories
- def extract_media_urls(soup, base_url):
- """
- Extract media URLs from the page.
- :param soup: BeautifulSoup object of the page.
- :param base_url: Base URL to resolve relative paths.
- :return: Set of media URLs.
- """
- media_links = set()
- for tag in soup.find_all(['img', 'video', 'source', 'a']):
- src = tag.get('src') or tag.get('href')
- if src:
- full_url = urljoin(base_url, src)
- parsed = urlparse(full_url)
- ext = os.path.splitext(parsed.path)[1].lower()
- if ext in image_exts + video_exts:
- media_links.add(full_url)
- return media_links
- try:
- # Fetch the base URL content
- response = requests.get(url, timeout=10)
- if response.status_code != 200:
- messagebox.showerror("Error", f"Failed to fetch {url} (Status Code: {response.status_code})")
- return
- soup = BeautifulSoup(response.text, 'html.parser')
- # Step 1: Extract all subdirectories
- directories = extract_directories(soup, url)
- # Step 2: Scrape media URLs from each subdirectory
- found_media = False
- for directory in directories:
- try:
- dir_response = requests.get(directory, timeout=10)
- if dir_response.status_code == 200:
- dir_soup = BeautifulSoup(dir_response.text, 'html.parser')
- media_links = extract_media_urls(dir_soup, directory)
- if media_links:
- found_media = True
- for media_url in media_links:
- if media_url not in media_urls:
- media_urls.append(media_url)
- result_box.insert(tk.END, media_url + "\n")
- except Exception as e:
- print(f"Error scraping directory {directory}: {e}")
- if not found_media:
- messagebox.showinfo("Info", "No media URLs found in subdirectories.")
- else:
- messagebox.showinfo("Success", f"{len(media_urls)} media URL(s) found!")
- except Exception as e:
- messagebox.showerror("Error", str(e))
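- # Usage sketch: point it at an open-directory index page; it only descends one
- # level (the subdirectories linked from that page) and does not recurse deeper.
- # scrape_directory_media("http://example.com/files/")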
- # Tab 2 Layout
- tk.Label(tab2, text="Enter URL to Scrape Media:").pack(pady=5)
- search_frame = tk.Frame(tab2)
- search_frame.pack(pady=5)
- search_entry = tk.Entry(search_frame, width=40)
- search_entry.pack(side=tk.LEFT, padx=5)
- tk.Button(search_frame, text="Search", command=search_urls, bg="lightblue").pack(side=tk.LEFT, padx=5)
- url_entry = tk.Entry(search_frame, width=100)
- url_entry.pack(pady=5)
- frame_buttons = tk.Frame(tab2)
- frame_buttons.pack(pady=5)
- tk.Button(frame_buttons, text="Scrape Media", command=process_url, bg="lightgreen", width=20).pack(side=tk.LEFT, padx=5)
- tk.Button(frame_buttons, text="Browse URL File", command=browse_url_file, bg="lightyellow", width=20).pack(side=tk.LEFT, padx=5)
- tk.Button(frame_buttons, text="Download All URLs", command=download_selected, bg="lightblue", width=20).pack(side=tk.LEFT, padx=5)
- tk.Button(frame_buttons, text="Download Selected URL", command=download_selected_line, bg="orange", width=20).pack(side=tk.LEFT, padx=5)
- tk.Button(frame_buttons, text="Save URLs to File", command=save_urls_to_file, bg="lightgray", width=20).pack(side=tk.LEFT, padx=5)
- tk.Button(frame_buttons, text="Stop Download", command=stop_download, bg="red", width=20).pack(side=tk.LEFT, padx=5)
- frame_button = tk.Frame(tab2)
- frame_button.pack(pady=5)
- tk.Button(frame_button, text="Scrape All Links", command=scrape_all_button, bg="#e0c3fc", width=20).pack(side=tk.LEFT, padx=5)
- tk.Button(frame_button, text="Open in VLC", command=open_in_vlc, bg="#c1f0c1", width=20).pack(side=tk.LEFT, padx=5)
- tk.Button(frame_button, text="Preview Image", command=preview_image_popup, bg="#f0c1c1", width=20).pack(side=tk.LEFT, padx=5)
- tk.Button(frame_button, text="Load Online M3U", command=load_online_m3u, bg="#c9f2ff", width=20).pack(side=tk.LEFT, padx=5)
- tk.Button(frame_button, text="Scrape Xtream M3U", command=scrape_xtream_m3u_url, bg="#fff0b3", width=20).pack(side=tk.LEFT, padx=5)
- tk.Button(frame_button, text="Load M3U File", command=load_m3u_file, bg="#d0f0fd", width=20).pack(side=tk.LEFT, padx=5)
- result_frame = tk.Frame(tab2)
- result_frame.pack(pady=5)
- scrollbar = tk.Scrollbar(result_frame)
- scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
- result_box = tk.Text(result_frame, height=30, width=124, yscrollcommand=scrollbar.set)
- result_box.pack(side=tk.LEFT, fill=tk.BOTH)
- scrollbar.config(command=result_box.yview)
- frame_clear = tk.Frame(tab2)
- frame_clear.pack(pady=5)
- tk.Button(frame_clear, text="Save Result", command=save_as_m3u, bg="#a7ffcc", width=20).pack(side=tk.LEFT, padx=5)
- tk.Button(frame_clear, text="Clear Search", command=clear_search, bg="lightgray").pack(side=tk.LEFT, padx=2)
- tk.Button(frame_clear, text="Clear URL Field", command=clear_url_field, bg="#ffd580", width=20).pack(side=tk.LEFT, padx=5)
- tk.Button(frame_clear, text="Clear Result Field", command=clear_result_box, bg="#ffb3b3", width=20).pack(side=tk.LEFT, padx=5)
- # Add a button for scraping subdirectories
- tk.Button(frame_clear, text="Scrape Subdirectories", command=lambda: scrape_directory_media(url_entry.get().strip()), bg="#ffcccb", width=20).pack(side=tk.LEFT, padx=5)
- # ====================
- # === Tab 3 Content ===
- # ====================
- tab3 = ttk.Frame(notebook)
- notebook.add(tab3, text="Web Scraper")
- notebook.pack(expand=True, fill="both")
- class WebScraperGUI:
- def __init__(self, root):
- self.root = root
- # Configure the style for ttk.Frame
- self.style = ttk.Style()
- self.style.configure("Background.TFrame", background="#336699") # Define a custom style
- self.root.config(style="Background.TFrame") # Apply the style to the root frame
- # URL Entry
- self.url_label = ttk.Label(root, text="Enter URL:")
- self.url_label.grid(column=0, row=0, sticky=tk.W, padx=10, pady=5)
- self.url_entry = ttk.Entry(root, width=120)
- self.url_entry.grid(column=1, row=0, columnspan=4, sticky=tk.W, padx=10, pady=5)
- # Options
- self.options_label = ttk.Label(root, text="Select Options:")
- self.options_label.grid(column=0, row=1, sticky=tk.W, padx=10, pady=5)
- # Checkboxes
- self.check_var_html = tk.BooleanVar()
- self.check_var_heading = tk.BooleanVar()
- self.check_var_paragraph = tk.BooleanVar()
- self.check_var_css = tk.BooleanVar()
- self.check_var_table = tk.BooleanVar()
- self.check_var_links = tk.BooleanVar()
- self.check_var_files = tk.BooleanVar()
- self.html_check = ttk.Checkbutton(root, text="Full HTML", variable=self.check_var_html)
- self.html_check.grid(column=1, row=1, sticky=tk.W, padx=10, pady=5)
- self.heading_check = ttk.Checkbutton(root, text="Headings", variable=self.check_var_heading)
- self.heading_check.grid(column=2, row=1, sticky=tk.W, padx=10, pady=5)
- self.paragraph_check = ttk.Checkbutton(root, text="Paragraphs", variable=self.check_var_paragraph)
- self.paragraph_check.grid(column=3, row=1, sticky=tk.W, padx=10, pady=5)
- self.css_check = ttk.Checkbutton(root, text="CSS", variable=self.check_var_css)
- self.css_check.grid(column=4, row=1, sticky=tk.W, padx=10, pady=5)
- self.table_check = ttk.Checkbutton(root, text="Tables", variable=self.check_var_table)
- self.table_check.grid(column=1, row=2, sticky=tk.W, padx=10, pady=5)
- self.links_check = ttk.Checkbutton(root, text="Links", variable=self.check_var_links)
- self.links_check.grid(column=2, row=2, sticky=tk.W, padx=10, pady=5)
- self.files_check = ttk.Checkbutton(root, text="Files", variable=self.check_var_files)
- self.files_check.grid(column=3, row=2, sticky=tk.W, padx=10, pady=5)
- # Result Text Field
- self.result_label = ttk.Label(root, text="Scraped Content of Websites:")
- self.result_label.grid(column=0, row=4, sticky=tk.W, padx=10, pady=5)
- #self.result_text = scrolledtext.ScrolledText(root, width=110, height=33, wrap=tk.WORD)
- self.result_text = scrolledtext.ScrolledText(root, width=116, height=33, wrap=tk.WORD, bg="#f0f0f0")
- self.result_text.grid(column=0, row=5, columnspan=5)
- # Scrape Button
- self.scrape_button = ttk.Button(root, text="SCRAPE", command=self.scrape)
- self.scrape_button.grid(column=4, row=4, columnspan=8, pady=10)
- # Save Result Button
- self.save_result_button = ttk.Button(root, text="Save Result", command=self.save_result, style='Red.TButton')
- self.save_result_button.grid(column=2, row=4, columnspan=8, pady=10)
- # Define style for the "Save Result" button
- self.style.configure('Red.TButton', foreground='red')
- def scrape(self):
- url = self.url_entry.get()
- if not url:
- messagebox.showwarning("Input Error", "Please enter a valid URL.")
- return
- options = {
- 'html': self.check_var_html.get(),
- 'heading': self.check_var_heading.get(),
- 'paragraph': self.check_var_paragraph.get(),
- 'css': self.check_var_css.get(),
- 'table': self.check_var_table.get(),
- 'links': self.check_var_links.get(),
- 'files': self.check_var_files.get()
- }
- try:
- response = requests.get(url, timeout=15)
- response.raise_for_status()
- soup = BeautifulSoup(response.content, 'html.parser')
- result = ""
- if options['html']:
- result += str(soup) + '\n\n'
- if options['heading']:
- headings = soup.find_all(re.compile('^h[1-6]$'))
- result += "Headings:\n"
- for heading in headings:
- result += heading.text.strip() + '\n'
- result += '\n'
- if options['paragraph']:
- paragraphs = soup.find_all('p')
- result += "Paragraphs:\n"
- for paragraph in paragraphs:
- result += paragraph.text.strip() + '\n'
- result += '\n'
- if options['css']:
- css_links = [link['href'] for link in soup.find_all('link', rel='stylesheet')]
- result += "CSS Links:\n"
- for css_link in css_links:
- full_url = urljoin(url, css_link)
- result += full_url + '\n'
- result += '\n'
- if options['table']:
- tables = soup.find_all('table')
- result += "Tables:\n"
- for table in tables:
- result += str(table) + '\n'
- result += '\n'
- if options['links']:
- links = soup.find_all('a', href=True)
- result += "Links:\n"
- for link in links:
- if link['href'].startswith('http'):
- result += f"Text: {link.text.strip()}, URL: {link['href']}\n"
- else:
- full_url = urljoin(url, link['href'])
- result += f"Text: {link.text.strip()}, URL: {full_url}\n"
- result += '\n'
- if options['files']:
- file_links = [link['href'] for link in soup.find_all('a', href=True) if re.search(r'\.[^.]+$', link['href'])]
- result += "File Links:\n"
- for file_link in file_links:
- full_url = urljoin(url, file_link)
- result += full_url + '\n'
- result += '\n'
- self.result_text.delete(1.0, tk.END)
- self.result_text.insert(tk.END, result)
- except requests.exceptions.RequestException as e:
- messagebox.showerror("Network Error", f"Failed to fetch URL: {e}")
- except Exception as e:
- messagebox.showerror("Error", f"An unexpected error occurred: {e}")
- def save_result(self):
- result_text = self.result_text.get(1.0, tk.END)
- if not result_text.strip():
- messagebox.showwarning("Empty Result", "No content to save.")
- return
- file_path = filedialog.asksaveasfilename(defaultextension=".txt", filetypes=[("Text files", "*.txt")])
- if file_path:
- try:
- with open(file_path, "w", encoding="utf-8") as file:
- file.write(result_text)
- messagebox.showinfo("Success", f"Result saved to {file_path}")
- except Exception as e:
- messagebox.showerror("Save Error", f"Failed to save file: {e}")
- # Initialize WebScraperGUI in Tab 3
- web_scraper_gui = WebScraperGUI(tab3)
- # Run
- window.mainloop()