Advertisement
Najeebsk

WEB-SCRAPING-ALL.py

Mar 18th, 2024
790
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.64 KB | None | 0 0
  1. import tkinter as tk
  2. from tkinter import ttk
  3. from tkinter import scrolledtext
  4. import requests
  5. from bs4 import BeautifulSoup
  6. import re
  7. from urllib.parse import urljoin
  8.  
  9. class WebScraperGUI:
  10.     def __init__(self, root):
  11.         self.root = root
  12.         self.root.title("Najeeb All Web Scraper")
  13.        
  14.         # Apply a theme
  15.         self.style = ttk.Style()
  16.         self.style.theme_use('clam')  # You can change 'clam' to other available themes
  17.        
  18.         # URL Entry
  19.         self.url_label = ttk.Label(root, text="Enter URL:")
  20.         self.url_label.grid(column=0, row=0, sticky=tk.W)
  21.         self.url_entry = ttk.Entry(root, width=50)
  22.         self.url_entry.grid(column=1, row=0, columnspan=4, sticky=tk.W)
  23.  
  24.         # Options
  25.         self.options_label = ttk.Label(root, text="Select Options:")
  26.         self.options_label.grid(column=0, row=1, sticky=tk.W)
  27.  
  28.         # Checkboxes
  29.         self.check_var_html = tk.BooleanVar()
  30.         self.check_var_heading = tk.BooleanVar()
  31.         self.check_var_paragraph = tk.BooleanVar()
  32.         self.check_var_css = tk.BooleanVar()
  33.         self.check_var_table = tk.BooleanVar()
  34.         self.check_var_links = tk.BooleanVar()
  35.         self.check_var_files = tk.BooleanVar()
  36.  
  37.         self.html_check = ttk.Checkbutton(root, text="Full HTML", variable=self.check_var_html)
  38.         self.html_check.grid(column=1, row=1, sticky=tk.W)
  39.  
  40.         self.heading_check = ttk.Checkbutton(root, text="Headings", variable=self.check_var_heading)
  41.         self.heading_check.grid(column=2, row=1, sticky=tk.W)
  42.  
  43.         self.paragraph_check = ttk.Checkbutton(root, text="Paragraphs", variable=self.check_var_paragraph)
  44.         self.paragraph_check.grid(column=3, row=1, sticky=tk.W)
  45.  
  46.         self.css_check = ttk.Checkbutton(root, text="CSS", variable=self.check_var_css)
  47.         self.css_check.grid(column=4, row=1, sticky=tk.W)
  48.  
  49.         self.table_check = ttk.Checkbutton(root, text="Tables", variable=self.check_var_table)
  50.         self.table_check.grid(column=1, row=2, sticky=tk.W)
  51.  
  52.         self.links_check = ttk.Checkbutton(root, text="Links", variable=self.check_var_links)
  53.         self.links_check.grid(column=2, row=2, sticky=tk.W)
  54.  
  55.         self.files_check = ttk.Checkbutton(root, text="Files", variable=self.check_var_files)
  56.         self.files_check.grid(column=3, row=2, sticky=tk.W)
  57.  
  58.         # Scrape Button
  59.         self.scrape_button = ttk.Button(root, text="Scrape", command=self.scrape)
  60.         self.scrape_button.grid(column=1, row=3, columnspan=4, pady=5)
  61.  
  62.         # Result Text Field
  63.         self.result_label = ttk.Label(root, text="Scraped Content:")
  64.         self.result_label.grid(column=0, row=4, sticky=tk.W)
  65.  
  66.         self.result_text = scrolledtext.ScrolledText(root, width=70, height=15, wrap=tk.WORD)
  67.         self.result_text.grid(column=0, row=5, columnspan=5)
  68.  
  69.     def scrape(self):
  70.         url = self.url_entry.get()
  71.         if not url:
  72.             return
  73.        
  74.         options = {
  75.             'html': self.check_var_html.get(),
  76.             'heading': self.check_var_heading.get(),
  77.             'paragraph': self.check_var_paragraph.get(),
  78.             'css': self.check_var_css.get(),
  79.             'table': self.check_var_table.get(),
  80.             'links': self.check_var_links.get(),
  81.             'files': self.check_var_files.get()
  82.         }
  83.        
  84.         response = requests.get(url)
  85.         soup = BeautifulSoup(response.content, 'html.parser')
  86.  
  87.         result = ""
  88.         if options['html']:
  89.             result += str(soup) + '\n\n'
  90.  
  91.         if options['heading']:
  92.             headings = soup.find_all(re.compile('^h[1-6]$'))
  93.             for heading in headings:
  94.                 result += heading.text + '\n'
  95.             result += '\n'
  96.  
  97.         if options['paragraph']:
  98.             paragraphs = soup.find_all('p')
  99.             for paragraph in paragraphs:
  100.                 result += paragraph.text + '\n'
  101.             result += '\n'
  102.  
  103.         if options['css']:
  104.             css_links = [link['href'] for link in soup.find_all('link', rel='stylesheet')]
  105.             result += "CSS Links:\n"
  106.             for css_link in css_links:
  107.                 full_url = urljoin(url, css_link)
  108.                 result += full_url + '\n'
  109.             result += '\n'
  110.  
  111.         if options['table']:
  112.             tables = soup.find_all('table')
  113.             result += "Tables:\n"
  114.             for table in tables:
  115.                 result += str(table) + '\n'
  116.             result += '\n'
  117.  
  118.         if options['links']:
  119.             links = soup.find_all('a', href=True)
  120.             result += "Links:\n"
  121.             for link in links:
  122.                 if link['href'].startswith('http'):
  123.                     result += f"Text: {link.text}, URL: {link['href']}\n"
  124.                 else:
  125.                     full_url = urljoin(url, link['href'])
  126.                     result += f"Text: {link.text}, URL: {full_url}\n"
  127.             result += '\n'
  128.  
  129.         if options['files']:
  130.             try:
  131.                 file_links = [link['href'] for link in soup.find_all('a', href=True) if re.search(r'\.(jpg|jpeg|png|gif|mp4|mp3|pdf|js)$', link['href'])]
  132.                 result += "File Links:\n"
  133.                 for file_link in file_links:
  134.                     full_url = urljoin(url, file_link)
  135.                     result += full_url + '\n'
  136.                 result += '\n'
  137.             except AttributeError as e:
  138.                 result += f"Error occurred while fetching file links: {e}\n\n"
  139.  
  140.         self.result_text.delete(1.0, tk.END)
  141.         self.result_text.insert(tk.END, result)
  142.  
  143. def main():
  144.     root = tk.Tk()
  145.     app = WebScraperGUI(root)
  146.     root.mainloop()
  147.  
  148. if __name__ == "__main__":
  149.     main()
  150.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement