# WEB-SCRAPING-ALL.py

import tkinter as tk
from tkinter import ttk
from tkinter import scrolledtext
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin

class WebScraperGUI:
    """Tkinter window that downloads a web page and lists selected parts of it.

    The window contains a URL entry, one checkbox per kind of content
    (full HTML, headings, paragraphs, stylesheet links, tables, hyperlinks,
    file links), a "Scrape" button, and a scrolled text area that shows the
    collected output.
    """

    # Seconds to wait on the server before giving up.  The request runs on the
    # Tk event thread, so without a limit a stalled host freezes the window.
    REQUEST_TIMEOUT = 15

    # Matches the tag names h1 through h6.
    _HEADING_RE = re.compile('^h[1-6]$')

    # Extensions reported by the "Files" option.  Matching ignores case and
    # tolerates a trailing query string or fragment, so "Photo.JPG" and
    # "report.pdf?v=2" are both recognised.
    _FILE_LINK_RE = re.compile(
        r'\.(jpg|jpeg|png|gif|mp4|mp3|pdf|js)(?:[?#].*)?$', re.IGNORECASE)

    def __init__(self, root):
        """Build all widgets inside ``root`` and lay them out on a grid.

        Args:
            root: The Tk root window (or other container) that owns the widgets.
        """
        self.root = root
        self.root.title("Najeeb All Web Scraper")

        # Apply a theme
        self.style = ttk.Style()
        self.style.theme_use('clam')  # You can change 'clam' to other available themes

        # URL entry (row 0)
        self.url_label = ttk.Label(root, text="Enter URL:")
        self.url_label.grid(column=0, row=0, sticky=tk.W)
        self.url_entry = ttk.Entry(root, width=50)
        self.url_entry.grid(column=1, row=0, columnspan=4, sticky=tk.W)

        # Option checkboxes (rows 1 and 2)
        self.options_label = ttk.Label(root, text="Select Options:")
        self.options_label.grid(column=0, row=1, sticky=tk.W)

        self.check_var_html, self.html_check = self._add_option(
            "Full HTML", column=1, row=1)
        self.check_var_heading, self.heading_check = self._add_option(
            "Headings", column=2, row=1)
        self.check_var_paragraph, self.paragraph_check = self._add_option(
            "Paragraphs", column=3, row=1)
        self.check_var_css, self.css_check = self._add_option(
            "CSS", column=4, row=1)
        self.check_var_table, self.table_check = self._add_option(
            "Tables", column=1, row=2)
        self.check_var_links, self.links_check = self._add_option(
            "Links", column=2, row=2)
        self.check_var_files, self.files_check = self._add_option(
            "Files", column=3, row=2)

        # Scrape button (row 3)
        self.scrape_button = ttk.Button(root, text="Scrape", command=self.scrape)
        self.scrape_button.grid(column=1, row=3, columnspan=4, pady=5)

        # Result text field (rows 4 and 5)
        self.result_label = ttk.Label(root, text="Scraped Content:")
        self.result_label.grid(column=0, row=4, sticky=tk.W)

        self.result_text = scrolledtext.ScrolledText(root, width=70, height=15, wrap=tk.WORD)
        self.result_text.grid(column=0, row=5, columnspan=5)

    def _add_option(self, text, column, row):
        """Create one option checkbox and return ``(BooleanVar, Checkbutton)``."""
        var = tk.BooleanVar()
        check = ttk.Checkbutton(self.root, text=text, variable=var)
        check.grid(column=column, row=row, sticky=tk.W)
        return var, check

    def _show_result(self, text):
        """Replace the contents of the result area with ``text``."""
        self.result_text.delete("1.0", tk.END)
        self.result_text.insert(tk.END, text)

    @staticmethod
    def _format_section(lines, title=None) -> str:
        """Join ``lines`` one per row, after an optional ``title`` row.

        The returned text always ends with one blank line so that consecutive
        sections are visually separated.
        """
        header = [title] if title is not None else []
        return "\n".join([*header, *lines, "", ""])

    @classmethod
    def _is_file_link(cls, href: str) -> bool:
        """Return True if ``href`` ends in one of the recognised file extensions."""
        return cls._FILE_LINK_RE.search(href) is not None

    def scrape(self):
        """Fetch the entered URL and display the content kinds that are ticked.

        Does nothing when the URL field is empty or only whitespace.  If the
        request fails (malformed URL, connection problem, timeout) the error
        is written to the result area instead of escaping from the button
        callback.  No HTTP status check is made, so the body of an error page
        is parsed like any other page.
        """
        url = self.url_entry.get().strip()
        if not url:
            return

        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            self._show_result(f"Error fetching {url}: {exc}\n")
            return

        soup = BeautifulSoup(response.content, 'html.parser')
        section = self._format_section
        parts = []

        if self.check_var_html.get():
            parts.append(str(soup) + '\n\n')

        if self.check_var_heading.get():
            headings = soup.find_all(self._HEADING_RE)
            parts.append(section(tag.text for tag in headings))

        if self.check_var_paragraph.get():
            parts.append(section(tag.text for tag in soup.find_all('p')))

        if self.check_var_css.get():
            # href=True skips <link rel="stylesheet"> tags that carry no href,
            # which would otherwise raise KeyError on lookup.
            sheets = soup.find_all('link', rel='stylesheet', href=True)
            parts.append(section(
                (urljoin(url, tag['href']) for tag in sheets),
                title="CSS Links:"))

        if self.check_var_table.get():
            parts.append(section(
                (str(table) for table in soup.find_all('table')),
                title="Tables:"))

        if self.check_var_links.get():
            # urljoin leaves absolute URLs alone and resolves everything else
            # against the page URL, so relative names that merely begin with
            # "http" (e.g. "https-guide.html") are resolved correctly too.
            anchors = soup.find_all('a', href=True)
            parts.append(section(
                (f"Text: {a.text}, URL: {urljoin(url, a['href'])}" for a in anchors),
                title="Links:"))

        if self.check_var_files.get():
            anchors = soup.find_all('a', href=True)
            parts.append(section(
                (urljoin(url, a['href']) for a in anchors
                 if self._is_file_link(a['href'])),
                title="File Links:"))

        self._show_result("".join(parts))

def main():
    """Open the scraper window and run the Tk event loop until it is closed."""
    window = tk.Tk()
    # Constructing the GUI object populates `window` with its widgets.
    scraper_gui = WebScraperGUI(window)
    window.mainloop()

# Start the GUI only when this file is executed as a script; importing it
# as a module defines the class and main() without opening a window.
if __name__ == "__main__":
    main()