import tkinter as tk
from tkinter import ttk, filedialog, messagebox
from bs4 import BeautifulSoup
import requests
from PIL import Image, ImageTk
import io
from urllib.parse import urljoin, urlparse
import os
import subprocess
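
# Third-party dependencies (everything else above is standard library):
#   pip install requests beautifulsoup4 pillow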

class WebScraperApp:
    def __init__(self, root):
        self.root = root
        self.root.title("Najeeb Images & Videos Web Scraper")
        self.root.geometry("1000x700")
        self.root.configure(bg="#f0f0f0")  # Light gray background

        # Top Frame for URL Entry and Buttons
        self.top_frame = ttk.Frame(self.root, style="Top.TFrame")
        self.top_frame.pack(fill=tk.X, padx=10, pady=10)

        # URL Entry Field
        self.url_label = ttk.Label(self.top_frame, text="Enter URL:", font=("Arial", 12), background="#ffffff")
        self.url_label.pack(side=tk.LEFT, padx=(0, 5))
        self.url_entry = ttk.Entry(self.top_frame, width=50, font=("Arial", 12))
        self.url_entry.pack(side=tk.LEFT, padx=(0, 10))

        # Scrape Media Button (Green)
        self.scrape_image_btn = ttk.Button(
            self.top_frame, text="Scrape Media", command=self.process_url, style="Green.TButton"
        )
        self.scrape_image_btn.pack(side=tk.LEFT, padx=(0, 5))

        # Scrape Links Button (Blue)
        self.scrape_links_btn = ttk.Button(
            self.top_frame, text="Scrape Links", command=self.scrape_links, style="Blue.TButton"
        )
        self.scrape_links_btn.pack(side=tk.LEFT, padx=(0, 5))

        # Scrape All Links Button (Orange)
        self.scrape_all_links_btn = ttk.Button(
            self.top_frame, text="Scrape All Links", command=self.scrape_all_button, style="Orange.TButton"
        )
        self.scrape_all_links_btn.pack(side=tk.LEFT, padx=(0, 5))

        # Clear Results and URL Button (Gray)
        self.clear_button = ttk.Button(
            self.top_frame, text="Clear", command=self.clear_fields_and_results, style="Gray.TButton"
        )
        self.clear_button.pack(side=tk.LEFT, padx=(0, 5))

        # Save Results Button (Red)
        self.save_results_btn = ttk.Button(
            self.top_frame, text="Save", command=self.save_results, style="Red.TButton"
        )
        self.save_results_btn.pack(side=tk.LEFT)

        # Left Frame for Results with Scrollbar
        self.left_frame = ttk.Frame(self.root, style="Left.TFrame")
        self.left_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=False, padx=10, pady=10)
        self.result_label = ttk.Label(self.left_frame, text="Results:", font=("Arial", 12), background="#ffffff")
        self.result_label.pack(anchor=tk.W)

        # Scrollable Listbox for Results
        self.result_scrollbar = ttk.Scrollbar(self.left_frame, orient=tk.VERTICAL)
        self.result_scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        self.result_listbox = tk.Listbox(
            self.left_frame,
            width=40,
            height=20,
            font=("Arial", 10),
            yscrollcommand=self.result_scrollbar.set
        )
        self.result_listbox.pack(fill=tk.BOTH, expand=True)
        self.result_scrollbar.config(command=self.result_listbox.yview)

        # Double-click to preview an image, or play a video/M3U in VLC
        self.result_listbox.bind("<Double-Button-1>", self.handle_double_click)

        # Browse Button (Yellow)
        self.browse_button = ttk.Button(
            self.left_frame, text="Browse Text File", command=self.browse_text_file, style="Yellow.TButton"
        )
        self.browse_button.pack(side=tk.LEFT, padx=(0, 5))

        # Download Button in Left Frame (Purple)
        self.download_btn = ttk.Button(
            self.left_frame, text="Download Selected Image", command=self.download_image, style="Purple.TButton"
        )
        self.download_btn.pack(side=tk.LEFT)

        # Right Frame for Image Display
        self.right_frame = ttk.Frame(self.root, style="Right.TFrame")
        self.right_frame.pack(side=tk.RIGHT, fill=tk.BOTH, expand=True, padx=10, pady=10)
        self.image_label = ttk.Label(self.right_frame, text="Image Preview", anchor=tk.CENTER, font=("Arial", 12))
        self.image_label.pack(expand=True)

        # Scrape result storage
        self.images = []    # Scraped image URLs
        self.videos = []    # Scraped video URLs
        self.m3u_urls = []  # Scraped M3U URLs
        self.links = []     # Scraped page links

        # Frame styling
        self.style = ttk.Style()
        self.style.configure("Top.TFrame", background="#ffffff")
        self.style.configure("Left.TFrame", background="#e0e0e0")
        self.style.configure("Right.TFrame", background="#d0d0d0")

        # Button styles
        self.style.configure("Green.TButton", font=("Arial", 10), background="#4CAF50", foreground="black")
        self.style.configure("Blue.TButton", font=("Arial", 10), background="#008CBA", foreground="yellow")
        self.style.configure("Orange.TButton", font=("Arial", 10), background="#FF9800", foreground="green")
        self.style.configure("Purple.TButton", font=("Arial", 10), background="#9C27B0", foreground="red")
        self.style.configure("Red.TButton", font=("Arial", 10), background="#F44336", foreground="blue")
        self.style.configure("Gray.TButton", font=("Arial", 10), background="#808080", foreground="red")
        self.style.configure("Yellow.TButton", font=("Arial", 10), background="#FFEB3B", foreground="black")
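
        # NOTE: the built-in ttk themes on Windows/macOS often ignore the
        # background/foreground options configured above, so the buttons may
        # render with native colors. If that matters, one option (assumption:
        # changing the widget theme is acceptable) is the cross-platform
        # "clam" theme, which honors these settings:
        # self.style.theme_use("clam")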

    def process_url(self):
        """Process the entered URL and scrape media."""
        url = self.url_entry.get().strip()
        if not url:
            messagebox.showwarning("Input Error", "Please enter a valid URL.")
            return
        self.images.clear()
        self.videos.clear()
        self.m3u_urls.clear()
        self.result_listbox.delete(0, tk.END)
        try:
            parsed = urlparse(url)
            ext = os.path.splitext(parsed.path)[1].lower()
            if ext in ['.m3u', '.m3u8']:
                scraped_media = self.scrape_m3u_playlist(url)
            else:
                scraped_media = self.scrape_normal_site(url)
            self.images.extend(scraped_media["images"])
            self.videos.extend(scraped_media["videos"])
            self.m3u_urls.extend(scraped_media["m3u"])
            # Populate the listbox with the scraped media URLs
            for media_url in scraped_media["images"]:
                self.result_listbox.insert(tk.END, media_url)
            for video_url in scraped_media["videos"]:
                self.result_listbox.insert(tk.END, video_url)
            for m3u_url in scraped_media["m3u"]:
                self.result_listbox.insert(tk.END, m3u_url)
            total_media = len(self.images) + len(self.videos) + len(self.m3u_urls)
            if total_media == 0:
                messagebox.showinfo("Info", "No media URLs found.")
            else:
                messagebox.showinfo("Success", f"{total_media} media URL(s) found!")
        except Exception as e:
            messagebox.showerror("Error", str(e))

    def scrape_normal_site(self, url):
        """Scrape images, videos, and M3U URLs from a normal site."""
        found_images = set()
        found_videos = set()
        found_m3u = set()
        try:
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                return {"images": found_images, "videos": found_videos, "m3u": found_m3u}
            soup = BeautifulSoup(response.text, 'html.parser')
            for tag in soup.find_all(['img', 'video', 'source', 'a']):
                src = tag.get('src') or tag.get('href')
                if src:
                    # Resolve relative URLs, then classify by file extension
                    full_url = urljoin(url, src)
                    parsed = urlparse(full_url)
                    ext = os.path.splitext(parsed.path)[1].lower()
                    if ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg', '.ico']:
                        found_images.add(full_url)
                    elif ext in ['.mp4', '.mkv', '.avi', '.mov', '.flv', '.wmv']:
                        found_videos.add(full_url)
                    elif ext in ['.m3u', '.m3u8']:
                        found_m3u.add(full_url)
        except Exception as e:
            print(f"Error scraping site: {e}")
        return {"images": found_images, "videos": found_videos, "m3u": found_m3u}

    def scrape_m3u_playlist(self, url):
        """Scrape and parse an M3U playlist file."""
        found_m3u = set()
        try:
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                return {"images": set(), "videos": set(), "m3u": found_m3u}
            # Parse the M3U content line by line
            lines = response.text.splitlines()
            for line in lines:
                line = line.strip()
                if line.startswith("http"):
                    found_m3u.add(line)
        except Exception as e:
            print(f"Error scraping M3U playlist: {e}")
        return {"images": set(), "videos": set(), "m3u": found_m3u}
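
    # For reference, a typical M3U playlist looks like this (illustrative
    # example, not taken from a real playlist):
    #
    #   #EXTM3U
    #   #EXTINF:-1,Channel Name
    #   http://example.com/stream/channel.m3u8
    #
    # scrape_m3u_playlist() keeps only the bare http(s) URL lines and skips
    # the #EXTM3U/#EXTINF metadata lines.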

    def handle_double_click(self, event):
        """Handle double-click on a result item."""
        selected_index = self.result_listbox.curselection()
        if not selected_index:
            return
        # curselection() returns a tuple of indices; use the first selection
        selected_url = self.result_listbox.get(selected_index[0])
        parsed = urlparse(selected_url)
        ext = os.path.splitext(parsed.path)[1].lower()
        if ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg', '.ico']:
            # Display image preview
            try:
                response = requests.get(selected_url, timeout=10)
                image_data = Image.open(io.BytesIO(response.content))
                image_data.thumbnail((600, 600))  # Resize for display
                photo = ImageTk.PhotoImage(image_data)
                self.image_label.config(image=photo)
                self.image_label.image = photo  # Keep a reference to avoid garbage collection
            except Exception as e:
                print(f"Error loading image: {e}")
        elif ext in ['.mp4', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.m3u', '.m3u8']:
            # Open the video or M3U stream in VLC
            vlc_path = r"C:\Program Files\VideoLAN\VLC\vlc.exe"
            if os.path.exists(vlc_path):
                try:
                    subprocess.Popen([vlc_path, selected_url])
                except Exception as e:
                    messagebox.showerror("Error", f"Failed to open VLC: {e}")
            else:
                messagebox.showerror("Error", "VLC Media Player not found at the specified path.")
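
    # NOTE: the hard-coded VLC path above is Windows-specific. A portable
    # alternative (sketch, assuming VLC is installed and on PATH) would be:
    #
    #   import shutil
    #   vlc_path = shutil.which("vlc") or r"C:\Program Files\VideoLAN\VLC\vlc.exe"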

    def download_image(self):
        """Download the selected image."""
        selected_index = self.result_listbox.curselection()
        if not selected_index:
            return
        selected_url = self.result_listbox.get(selected_index[0])
        save_path = filedialog.asksaveasfilename(
            defaultextension=".jpg", filetypes=[("JPEG", "*.jpg"), ("PNG", "*.png")]
        )
        if not save_path:
            return
        try:
            response = requests.get(selected_url, timeout=10)
            with open(save_path, "wb") as f:
                f.write(response.content)
            print(f"Image saved to {save_path}")
        except Exception as e:
            print(f"Error downloading image: {e}")

    def save_results(self):
        """Save the results (images, videos, M3U URLs) to a file."""
        # Get all items from the result listbox
        results = self.result_listbox.get(0, tk.END)
        if not results:
            messagebox.showwarning("No Results", "No results to save.")
            return
        # Ask the user for the file type (Text or M3U)
        file_type = messagebox.askquestion(
            "Save Format", "Save as Text (Yes) or M3U (No)?", icon="question"
        )
        # Open a file dialog to choose the save location
        file_path = filedialog.asksaveasfilename(
            defaultextension=".txt" if file_type == "yes" else ".m3u",
            filetypes=[
                ("Text File", "*.txt"),
                ("M3U Playlist", "*.m3u"),
            ],
        )
        if not file_path:
            return  # User canceled the save dialog
        try:
            # Write results to the selected file
            with open(file_path, "w", encoding="utf-8") as f:
                if file_type == "yes":  # Save as plain text
                    f.write("\n".join(results))
                else:  # Save in M3U format
                    f.write("#EXTM3U\n")  # M3U header
                    for result in results:
                        f.write(f"#EXTINF:-1,{result}\n")  # Metadata line
                        f.write(f"{result}\n")  # URL line
            messagebox.showinfo("Success", f"Results saved to {file_path}")
        except Exception as e:
            messagebox.showerror("Error", f"Failed to save results: {e}")
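
    # Saving two URLs in M3U mode therefore produces a file shaped like:
    #
    #   #EXTM3U
    #   #EXTINF:-1,http://example.com/a.mp4
    #   http://example.com/a.mp4
    #   #EXTINF:-1,http://example.com/b.m3u8
    #   http://example.com/b.m3u8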

    def clear_fields_and_results(self):
        """Clear the URL entry field, result listbox, and reset the image preview."""
        # Clear the URL entry field
        self.url_entry.delete(0, tk.END)
        # Clear the result listbox
        self.result_listbox.delete(0, tk.END)
        # Reset internal storage
        self.images.clear()
        self.videos.clear()
        self.m3u_urls.clear()
        # Reset the image preview (image="" actually clears the displayed
        # image; image=None is silently ignored by Tkinter's option handling)
        self.image_label.config(text="Image Preview", image="")
        self.image_label.image = None  # Drop the reference to avoid memory leaks
        # Notify the user
        messagebox.showinfo("Cleared", "URL and results have been cleared.")

    def scrape_all_links(self, url):
        """Return the set of all absolute links found on the given webpage."""
        all_links = set()
        try:
            response = requests.get(url, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            for a_tag in soup.find_all('a', href=True):
                full_url = urljoin(url, a_tag['href'])
                all_links.add(full_url)
        except Exception as e:
            print(f"[Scrape All Error] {e}")
        return all_links

    def scrape_links(self):
        """Scrape the links on the entered webpage into the result listbox."""
        url = self.url_entry.get().strip()
        if not url:
            return
        try:
            response = requests.get(url, timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')
            # Resolve relative hrefs against the page URL so every listed
            # link is absolute (and therefore usable on double-click)
            self.links = [urljoin(url, a['href']) for a in soup.find_all('a') if 'href' in a.attrs]
            # Clear previous results
            self.result_listbox.delete(0, tk.END)
            # Populate the listbox with the links
            for link in self.links:
                self.result_listbox.insert(tk.END, link)
        except Exception as e:
            print(f"Error scraping links: {e}")

    def scrape_all_button(self):
        """Button handler: scrape all links from the entered URL and list them."""
        url = self.url_entry.get().strip()
        if not url:
            messagebox.showwarning("Input Error", "Please enter a valid URL.")
            return
        self.result_listbox.delete(0, tk.END)
        try:
            all_links = self.scrape_all_links(url)
            self.links.extend(all_links)
            # Populate the listbox with the links
            for link in all_links:
                self.result_listbox.insert(tk.END, link)
            messagebox.showinfo("Done", f"{len(all_links)} total link(s) scraped.")
        except Exception as e:
            messagebox.showerror("Error", str(e))

    def browse_text_file(self):
        """Open a text file and display its contents in the result listbox."""
        file_path = filedialog.askopenfilename(
            filetypes=[("Text Files", "*.txt"), ("All Files", "*.*")]
        )
        if not file_path:
            return  # User canceled the file dialog
        try:
            # Clear existing results
            self.result_listbox.delete(0, tk.END)
            # Read the file and populate the listbox
            with open(file_path, "r", encoding="utf-8") as file:
                for line in file:
                    line = line.strip()  # Remove leading/trailing whitespace
                    if line:  # Ignore empty lines
                        self.result_listbox.insert(tk.END, line)
            messagebox.showinfo("Success", f"Contents of '{file_path}' loaded successfully.")
        except Exception as e:
            messagebox.showerror("Error", f"Failed to load file: {e}")

if __name__ == "__main__":
    root = tk.Tk()
    app = WebScraperApp(root)
    root.mainloop()