Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import tkinter as tk
- from tkinter import ttk
- from tkinter import scrolledtext
- import requests
- from bs4 import BeautifulSoup
- import re
- from urllib.parse import urljoin
class WebScraperGUI:
    """Tkinter front end that downloads one web page and lists selected parts of it.

    The window holds a URL entry, one checkbox per extraction option, a
    "Scrape" button and a scrollable text area that receives the report.
    """

    # Seconds to wait for the server before giving up.  Without a timeout a
    # stalled connection would block the Tk event loop indefinitely.
    REQUEST_TIMEOUT = 15

    # Tag names treated as headings (h1 .. h6).
    _HEADING_TAG_RE = re.compile('^h[1-6]$')

    # Extensions reported by the "Files" option.  Matching ignores case and
    # tolerates a trailing "?query" or "#fragment", so "a.PDF" and
    # "app.js?v=2" are both recognised.
    _FILE_LINK_RE = re.compile(
        r'\.(jpg|jpeg|png|gif|mp4|mp3|pdf|js)(?:[?#].*)?$', re.IGNORECASE
    )

    def __init__(self, root):
        """Build all widgets inside *root* and wire the button to :meth:`scrape`.

        Args:
            root: The Tk container (normally the ``tk.Tk`` root window) that
                receives the widgets via the grid geometry manager.
        """
        self.root = root
        self.root.title("Najeeb All Web Scraper")

        # Apply a theme; 'clam' can be swapped for any other available theme.
        self.style = ttk.Style()
        self.style.theme_use('clam')

        # URL entry (row 0).
        self.url_label = ttk.Label(root, text="Enter URL:")
        self.url_label.grid(column=0, row=0, sticky=tk.W)
        self.url_entry = ttk.Entry(root, width=50)
        self.url_entry.grid(column=1, row=0, columnspan=4, sticky=tk.W)

        # Option checkboxes (rows 1 and 2).
        self.options_label = ttk.Label(root, text="Select Options:")
        self.options_label.grid(column=0, row=1, sticky=tk.W)

        # One BooleanVar per checkbox; scrape() reads them to pick sections.
        self.check_var_html = tk.BooleanVar()
        self.check_var_heading = tk.BooleanVar()
        self.check_var_paragraph = tk.BooleanVar()
        self.check_var_css = tk.BooleanVar()
        self.check_var_table = tk.BooleanVar()
        self.check_var_links = tk.BooleanVar()
        self.check_var_files = tk.BooleanVar()

        self.html_check = ttk.Checkbutton(
            root, text="Full HTML", variable=self.check_var_html)
        self.html_check.grid(column=1, row=1, sticky=tk.W)
        self.heading_check = ttk.Checkbutton(
            root, text="Headings", variable=self.check_var_heading)
        self.heading_check.grid(column=2, row=1, sticky=tk.W)
        self.paragraph_check = ttk.Checkbutton(
            root, text="Paragraphs", variable=self.check_var_paragraph)
        self.paragraph_check.grid(column=3, row=1, sticky=tk.W)
        self.css_check = ttk.Checkbutton(
            root, text="CSS", variable=self.check_var_css)
        self.css_check.grid(column=4, row=1, sticky=tk.W)
        self.table_check = ttk.Checkbutton(
            root, text="Tables", variable=self.check_var_table)
        self.table_check.grid(column=1, row=2, sticky=tk.W)
        self.links_check = ttk.Checkbutton(
            root, text="Links", variable=self.check_var_links)
        self.links_check.grid(column=2, row=2, sticky=tk.W)
        self.files_check = ttk.Checkbutton(
            root, text="Files", variable=self.check_var_files)
        self.files_check.grid(column=3, row=2, sticky=tk.W)

        # Scrape button (row 3).
        self.scrape_button = ttk.Button(root, text="Scrape", command=self.scrape)
        self.scrape_button.grid(column=1, row=3, columnspan=4, pady=5)

        # Result area (rows 4 and 5).
        self.result_label = ttk.Label(root, text="Scraped Content:")
        self.result_label.grid(column=0, row=4, sticky=tk.W)
        self.result_text = scrolledtext.ScrolledText(
            root, width=70, height=15, wrap=tk.WORD)
        self.result_text.grid(column=0, row=5, columnspan=5)

    def scrape(self):
        """Fetch the entered URL and show the selected page parts in the result box.

        Returns without doing anything when the URL field is empty or holds
        only whitespace.  A failed download (invalid URL, connection error,
        timeout) is reported in the result box instead of propagating into
        Tk's callback handler.  Sections appear in a fixed order: full HTML,
        headings, paragraphs, CSS links, tables, links, file links.
        """
        url = self.url_entry.get().strip()
        if not url:
            return

        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            self._show_result(f"Error occurred while fetching {url}: {exc}\n")
            return

        soup = BeautifulSoup(response.content, 'html.parser')
        sections = []

        if self.check_var_html.get():
            sections.append(str(soup) + '\n\n')

        if self.check_var_heading.get():
            headings = soup.find_all(self._HEADING_TAG_RE)
            sections.append(self._format_section(h.text for h in headings))

        if self.check_var_paragraph.get():
            sections.append(
                self._format_section(p.text for p in soup.find_all('p')))

        if self.check_var_css.get():
            # href=True skips <link rel="stylesheet"> tags that carry no href,
            # which would otherwise raise KeyError.
            sheets = soup.find_all('link', rel='stylesheet', href=True)
            sections.append(self._format_section(
                (urljoin(url, sheet['href']) for sheet in sheets),
                title="CSS Links:"))

        if self.check_var_table.get():
            sections.append(self._format_section(
                (str(table) for table in soup.find_all('table')),
                title="Tables:"))

        # Both remaining options work on the same set of anchors.
        want_links = self.check_var_links.get()
        want_files = self.check_var_files.get()
        anchors = soup.find_all('a', href=True) if want_links or want_files else []

        if want_links:
            # urljoin leaves absolute URLs alone and resolves everything else,
            # so no separate "starts with http" branch is needed.
            sections.append(self._format_section(
                (f"Text: {a.text}, URL: {urljoin(url, a['href'])}"
                 for a in anchors),
                title="Links:"))

        if want_files:
            sections.append(self._format_section(
                (urljoin(url, a['href']) for a in anchors
                 if self._FILE_LINK_RE.search(a['href'])),
                title="File Links:"))

        self._show_result(''.join(sections))

    @staticmethod
    def _format_section(lines, title=None):
        """Return *lines* one per row, under *title* if given, plus a blank line."""
        header = [title] if title is not None else []
        return ''.join(f"{line}\n" for line in [*header, *lines]) + '\n'

    def _show_result(self, text):
        """Replace the whole content of the result box with *text*."""
        self.result_text.delete("1.0", tk.END)
        self.result_text.insert(tk.END, text)
def main():
    """Create the Tk root window, mount the scraper GUI on it and run the event loop."""
    window = tk.Tk()
    # Hold a named reference to the GUI object while the event loop runs.
    _gui = WebScraperGUI(window)
    window.mainloop()


# Start the application only when this file is executed as a script.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement