Advertisement
Najeebsk

WEB-SCRAPING-ALL-ELEMENTS.py

Mar 18th, 2024 (edited)
876
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.55 KB | None | 0 0
  1. import tkinter as tk
  2. from tkinter import ttk
  3. from tkinter import scrolledtext
  4. import requests
  5. from bs4 import BeautifulSoup
  6. import re
  7. from urllib.parse import urljoin
  8. from tkinter import filedialog
  9.  
  10. class WebScraperGUI:
  11.     def __init__(self, root):
  12.         self.root = root
  13.         self.root.title("Najeeb All Web Scraper")
  14.         self.root.configure(bg="#336699")
  15.         self.root.geometry("900x660")
  16.        
  17.         # Apply a theme
  18.         self.style = ttk.Style()
  19.         self.style.theme_use('clam')  # You can change 'clam' to other available themes
  20.        
  21.         # URL Entry
  22.         self.url_label = ttk.Label(root, text="Enter URL:")
  23.         self.url_label.grid(column=0, row=0, sticky=tk.W)
  24.         self.url_entry = ttk.Entry(root, width=120)
  25.         self.url_entry.grid(column=1, row=0, columnspan=4, sticky=tk.W)
  26.  
  27.         # Options
  28.         self.options_label = ttk.Label(root, text="Select Options:")
  29.         self.options_label.grid(column=0, row=1, sticky=tk.W)
  30.  
  31.         # Checkboxes
  32.         self.check_var_html = tk.BooleanVar()
  33.         self.check_var_heading = tk.BooleanVar()
  34.         self.check_var_paragraph = tk.BooleanVar()
  35.         self.check_var_css = tk.BooleanVar()
  36.         self.check_var_table = tk.BooleanVar()
  37.         self.check_var_links = tk.BooleanVar()
  38.         self.check_var_files = tk.BooleanVar()
  39.  
  40.         self.html_check = ttk.Checkbutton(root, text="Full HTML", variable=self.check_var_html)
  41.         self.html_check.grid(column=1, row=1, sticky=tk.W)
  42.  
  43.         self.heading_check = ttk.Checkbutton(root, text="Headings", variable=self.check_var_heading)
  44.         self.heading_check.grid(column=2, row=1, sticky=tk.W)
  45.  
  46.         self.paragraph_check = ttk.Checkbutton(root, text="Paragraphs", variable=self.check_var_paragraph)
  47.         self.paragraph_check.grid(column=3, row=1, sticky=tk.W)
  48.  
  49.         self.css_check = ttk.Checkbutton(root, text="CSS", variable=self.check_var_css)
  50.         self.css_check.grid(column=4, row=1, sticky=tk.W)
  51.  
  52.         self.table_check = ttk.Checkbutton(root, text="Tables", variable=self.check_var_table)
  53.         self.table_check.grid(column=1, row=2, sticky=tk.W)
  54.  
  55.         self.links_check = ttk.Checkbutton(root, text="Links", variable=self.check_var_links)
  56.         self.links_check.grid(column=2, row=2, sticky=tk.W)
  57.  
  58.         self.files_check = ttk.Checkbutton(root, text="Files", variable=self.check_var_files)
  59.         self.files_check.grid(column=3, row=2, sticky=tk.W)
  60.  
  61.         # Scrape Button
  62.         self.scrape_button = ttk.Button(root, text="SCRAPE", command=self.scrape)
  63.         self.scrape_button.grid(column=4, row=6, columnspan=8, pady=5)
  64.  
  65.         # Save Result Button
  66.         self.save_result_button = ttk.Button(root, text="Save Result", command=self.save_result, style='Red.TButton')
  67.         self.save_result_button.grid(column=0, row=6, columnspan=8, pady=5)
  68.  
  69.         # Result Text Field
  70.         self.result_label = ttk.Label(root, text="Scraped Content of Websites:")
  71.         self.result_label.grid(column=0, row=4, sticky=tk.W)
  72.  
  73.         self.result_text = scrolledtext.ScrolledText(root, width=110, height=33, wrap=tk.WORD)
  74.         self.result_text.grid(column=0, row=5, columnspan=5)
  75.  
  76.         # Define style for the "Save Result" button
  77.         self.style.configure('Red.TButton', foreground='red')
  78.  
  79.     def scrape(self):
  80.         url = self.url_entry.get()
  81.         if not url:
  82.             return
  83.        
  84.         options = {
  85.             'html': self.check_var_html.get(),
  86.             'heading': self.check_var_heading.get(),
  87.             'paragraph': self.check_var_paragraph.get(),
  88.             'css': self.check_var_css.get(),
  89.             'table': self.check_var_table.get(),
  90.             'links': self.check_var_links.get(),
  91.             'files': self.check_var_files.get()
  92.         }
  93.        
  94.         response = requests.get(url)
  95.         soup = BeautifulSoup(response.content, 'html.parser')
  96.  
  97.         result = ""
  98.         if options['html']:
  99.             result += str(soup) + '\n\n'
  100.  
  101.         if options['heading']:
  102.             headings = soup.find_all(re.compile('^h[1-6]$'))
  103.             for heading in headings:
  104.                 result += heading.text + '\n'
  105.             result += '\n'
  106.  
  107.         if options['paragraph']:
  108.             paragraphs = soup.find_all('p')
  109.             for paragraph in paragraphs:
  110.                 result += paragraph.text + '\n'
  111.             result += '\n'
  112.  
  113.         if options['css']:
  114.             css_links = [link['href'] for link in soup.find_all('link', rel='stylesheet')]
  115.             result += "CSS Links:\n"
  116.             for css_link in css_links:
  117.                 full_url = urljoin(url, css_link)
  118.                 result += full_url + '\n'
  119.             result += '\n'
  120.  
  121.         if options['table']:
  122.             tables = soup.find_all('table')
  123.             result += "Tables:\n"
  124.             for table in tables:
  125.                 result += str(table) + '\n'
  126.             result += '\n'
  127.  
  128.         if options['links']:
  129.             links = soup.find_all('a', href=True)
  130.             result += "Links:\n"
  131.             for link in links:
  132.                 if link['href'].startswith('http'):
  133.                     result += f"Text: {link.text}, URL: {link['href']}\n"
  134.                 else:
  135.                     full_url = urljoin(url, link['href'])
  136.                     result += f"Text: {link.text}, URL: {full_url}\n"
  137.             result += '\n'
  138.  
  139.         if options['files']:
  140.             try:
  141.                 file_links = [link['href'] for link in soup.find_all('a', href=True) if re.search(r'\.(jpg|jpeg|png|gif|bmp|ico|mkv|avi|mp4|mp3|pdf|js|bat|py|ahk|cmd|txt|vbs|a3u|m3u|m3u8|html|js|css|htm)$', link['href'])]
  142.                 result += "File Links:\n"
  143.                 for file_link in file_links:
  144.                     full_url = urljoin(url, file_link)
  145.                     result += full_url + '\n'
  146.                 result += '\n'
  147.             except AttributeError as e:
  148.                 result += f"Error occurred while fetching file links: {e}\n\n"
  149.  
  150.         self.result_text.delete(1.0, tk.END)
  151.         self.result_text.insert(tk.END, result)
  152.  
  153.     def save_result(self):
  154.         result_text = self.result_text.get(1.0, tk.END)
  155.         if not result_text.strip():
  156.             return
  157.         file_path = filedialog.asksaveasfilename(defaultextension=".txt", filetypes=[("Text files", "*.txt")])
  158.         if file_path:
  159.             with open(file_path, "w", encoding="utf-8") as file:
  160.                 file.write(result_text)
  161.  
  162. def main():
  163.     root = tk.Tk()
  164.     app = WebScraperGUI(root)
  165.     root.mainloop()
  166.  
  167. if __name__ == "__main__":
  168.     main()
  169.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement