Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests
- from bs4 import BeautifulSoup
- import pandas as pd
- import re
- # Function to get the webpage content
def get_webpage_content(url, timeout=10):
    """Download a web page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. It is
            forwarded to ``requests.get``; without a timeout a server that
            never answers would block the script indefinitely.

    Returns:
        The response body as a string, or ``None`` when the request fails.
        Failures (connection errors, timeouts, 4xx/5xx statuses) are printed
        rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Convert 4xx/5xx status codes into exceptions so they are reported
        # by the handler below instead of returning an error page as content.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return None
    return response.text
- # Function to read keywords from a file
def read_keywords_from_file(filename):
    """Load keywords from a text file containing one keyword per line.

    Each line is stripped of surrounding whitespace and lowercased; blank
    lines are skipped.

    Args:
        filename: Path of the keyword file.

    Returns:
        A list of lowercase keywords in file order, or an empty list when
        the file does not exist (a message is printed in that case).
    """
    try:
        # 'utf-8-sig' reads plain UTF-8 as well, but also drops a leading
        # byte-order mark. With plain 'utf-8' the BOM stays attached to the
        # first keyword (str.strip() does not remove it), so that keyword
        # would never match anything.
        with open(filename, 'r', encoding='utf-8-sig') as file:
            return [line.strip().lower() for line in file if line.strip()]
    except FileNotFoundError:
        print(f"File {filename} not found.")
        return []
- # Function to count keyword occurrences accurately using regular expressions
def count_keywords(content, keywords):
    """Count whole-word, case-insensitive keyword occurrences in an HTML page.

    Args:
        content: HTML markup of the page.
        keywords: Iterable of keywords or multi-word phrases to look for.

    Returns:
        A dict mapping each keyword (exactly as passed in) to the number of
        times it occurs in the page text. A keyword that is empty or only
        whitespace is reported with a count of 0.
    """
    soup = BeautifulSoup(content, 'html.parser')

    # Script and stylesheet bodies are code, not page text. Some
    # BeautifulSoup versions include them in get_text(), so drop them
    # explicitly to keep the counts independent of the installed version.
    for tag in soup(['script', 'style']):
        tag.decompose()

    # A separator keeps words from adjacent elements apart; without it
    # "<p>fire</p><p>risk</p>" collapses to "firerisk" and neither word
    # would match as a whole word.
    text = soup.get_text(separator=' ').lower()

    keyword_count = {}
    for keyword in keywords:
        # Lowercase the keyword too, since the text was lowercased above.
        words = keyword.lower().split()
        if not words:
            keyword_count[keyword] = 0
            continue
        # Lookarounds are used instead of \b because \b needs a word
        # character on one side, so it fails for keywords that start or end
        # with punctuation (e.g. "c++"). Words of a phrase are joined with
        # \s+ so any run of whitespace between them is accepted.
        pattern = (
            r'(?<!\w)'
            + r'\s+'.join(re.escape(word) for word in words)
            + r'(?!\w)'
        )
        keyword_count[keyword] = len(re.findall(pattern, text))
    return keyword_count
- # Function to export the result to CSV
def export_to_csv(keyword_count, filename):
    """Write keyword counts to a CSV file with 'Keyword' and 'Count' columns.

    Args:
        keyword_count: Mapping of keyword to its number of occurrences.
        filename: Path of the CSV file to create or overwrite.
    """
    header = ['Keyword', 'Count']
    rows = list(keyword_count.items())
    table = pd.DataFrame(rows, columns=header)
    # index=False leaves out pandas' row-number column.
    table.to_csv(filename, index=False, encoding='utf-8')
    print(f"Keyword counts have been exported to {filename}")
- # Main execution
if __name__ == "__main__":
    # Script entry point: load the keyword list, download the page, and
    # write the per-keyword counts to a CSV file.
    keywords = read_keywords_from_file('keywords.txt')

    # Page to analyze.
    url = 'https://alllandlordcertificates.co.uk/fire-risk-assessment'  # Replace with your desired URL
    content = get_webpage_content(url)

    # Nothing is written when the download failed or no keywords were loaded.
    if content and keywords:
        export_to_csv(count_keywords(content, keywords), 'keyword_counts.csv')
Advertisement
Add Comment
Please sign in to add a comment.
Advertisement