# 1. Execute a simple GET HTTP request using the 'requests' library
import requests
from bs4 import BeautifulSoup

url = 'https://www.bbc.com/news/business-66217641'

# Send a GET request to the website
response = requests.get(url)
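# Note (added remark, not part of the original snippet): some sites reject requests
# that carry no browser-like User-Agent, and a slow server can hang the call forever
# without a timeout. Both are easy to add if needed, e.g.:
# response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)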
# 2. Create a .txt file that contains all the words
# Check if the request was successful
if response.status_code == 200:
    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    # Find all the text elements on the website (e.g., paragraphs, headers, etc.)
    text_elements = soup.find_all(text=True)
    # Filter out empty and whitespace-only elements, and exclude script tags
    sentences = [text.strip() for text in text_elements if text.strip() and text.parent.name != 'script']
    # Join the sentences with newline characters
    text_content = '\n'.join(sentences)
    # Save the text content to a .txt file
    with open('info.txt', 'w', encoding='utf-8') as file:
        file.write(text_content)
    print("Text copied from the website and saved as 'info.txt' successfully.")
else:
    print("Failed to retrieve content from the website.")
# 3. Open the recently created file named 'info.txt'
file_path = './info.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    file_contents = file.read()

# 4. Split into lines - delete the first 6 lines
lines = file_contents.splitlines()
DELETED = 6
for _ in range(DELETED):
    lines.pop(0)
print("First 15 rows now:", lines[0:15], sep='\n')
# 5. Delete the word 'BBC' from every line
bad = 'BBC'
for index, line in enumerate(lines):
    modified = line.replace(bad, "")
    lines[index] = modified
print("First 15 rows now:", lines[0:15], sep='\n', end='\n\n')
- print("Until now, we have a list with all the text lines in the website. We are only interested in the letters,")
- print("so we can join all the lines together into a single one!")
- # 6. Join all the lines (strings) into a new string
- sep = ''
- sentences = sep.join(lines)
- print("Last 100 characters --->", sentences[-100:])
# 7. Keep only the letters and lowercase them
alphabet = "abcdefghijklmnopqrstuvwxyz"
sentences = sentences.lower()
sentences = [char for char in sentences if char in alphabet]
print("Last 100 characters --->", sentences[-100:])
# 8. Create a dictionary with all the unique letters and their frequency of appearance
unique = {char: sentences.count(char) for char in set(sentences)}
values = list(unique.values())
SUM = sum(values)
print("Scanned a text with {} letters:".format(SUM), end='\n\n')
print(unique)
sorted_unique = sorted(unique.items())
print(sorted_unique)
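# Alternative (added suggestion): collections.Counter from the standard library
# builds the same mapping in a single pass over the characters:
# from collections import Counter
# unique = dict(Counter(sentences))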
# 9. Plot the frequency of appearance
import numpy as np
import matplotlib.pyplot as plt

freq_app = {}
for letter, app in sorted_unique:
    freq_app[letter] = round(100 * app / SUM, 2)
print(freq_app, end='\n\n')

keys = list(freq_app.keys())
values = list(freq_app.values())
LEN = len(freq_app)

plt.bar(keys, values)
plt.title("Frequency of appearance of letters")
plt.ylabel("Frequency of appearance (%)")
plt.show()
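# Note (added suggestion): plt.xlabel("Letter") and plt.tight_layout() would label
# the x-axis and tidy the spacing; both are optional.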
# 10. Sort by values (descending order)
app_desc = sorted(freq_app.items(), key=lambda item: item[1], reverse=True)
print(app_desc, end='\n\n')

app_desc = dict(app_desc)
new_keys = list(app_desc.keys())
new_values = list(app_desc.values())

xvalues = np.arange(0, LEN, 1)
plt.bar(xvalues, new_values)
plt.xticks(xvalues, new_keys)
plt.title("Frequency of appearance of letters (sorted)")
plt.ylabel("Frequency of appearance (%)")
plt.show()
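# Sanity check (added remark; figures are commonly cited approximations for generic
# English text, not measured from this article): the most frequent letters should
# come out roughly in the order e, t, a, o, i, n, with 'e' near 12-13% and 't' near
# 9%. A BBC news story should land close to that distribution.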