Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- # Filename: ipp3_2_trigrams.py
- # Version: 1.0.0
- # Author: Jeoi Reqi
- """
- Description:
- - This script demonstrates "Chapter 3: Practice Project #2 - Trigrams" from the book "Impractical Python Projects" by Lee Vaughan.
- - The script generates letter triplets (trigrams) from a given name and finds their frequency in a dictionary of English words.
- Requirements:
- - Python 3.x
- - The following modules:
- - re
- - sys
- - os
- - requests
- - collections
- - itertools
- - typing
- Functions:
- - download_dictionary:
- Downloads a dictionary file from a given URL and saves it alphabetically.
- - load_dictionary:
- Loads a dictionary file into a set of lowercase strings.
- - ensure_word_in_dictionary:
- Ensures a specific word is in the dictionary, adding it if necessary.
- - generate_trigrams:
- Generates unique letter triplets (trigrams) from a given name.
- - count_trigram_frequency:
- Counts the frequency of trigrams in a list of words.
- Usage:
- - Ensure you have Python 3.x installed on your system.
- - This script can automatically download & save the 'dictionary.txt' file from a specified URL to the current working directory.
- - Run the script and follow the options menu to either use a demo name or input your own name.
- - The script generates trigrams from the input name and counts their frequency in the dictionary of English words.
- Additional Notes:
- - A dictionary.txt file is generated only if none found & saved to the current working directory when this script is run.
- - This script offers an options menu to either run a demo with the name 'Waldo' or input a custom name.
- - The script then generates trigrams from the input name and counts their frequency in a dictionary of English words.
- """
- import re
- import sys
- import os
- import requests
- from collections import defaultdict
- from itertools import permutations
- from typing import Set, DefaultDict
def download_dictionary(url: str, file_name: str, timeout: float = 30.0) -> None:
    """
    Download a word list from a URL and save it sorted, unless the file already exists.

    Parameters:
        url (str): The URL from which to download the dictionary file.
        file_name (str): The path to save the downloaded file as.
        timeout (float): Seconds to wait for the server before giving up.

    Notes:
        Request failures (including timeouts) are not propagated to the caller:
        the error is printed and the program exits with status 1.
    """
    # Guard clause: nothing to do when a dictionary is already on disk.
    if os.path.exists(file_name):
        print("\nDictionary file '{}' already exists. Skipping download.".format(file_name))
        return

    print("\nDownloading dictionary file from:\n" + "{}".format(url))
    try:
        # Without an explicit timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Treat HTTP 4xx/5xx as failures
    except requests.RequestException as e:
        print("\nError downloading dictionary from {}: {}".format(url, e))
        sys.exit(1)

    # splitlines() copes with LF and CRLF alike; blank lines are dropped so the
    # saved word list never contains an empty "word".
    stripped_lines = (line.strip() for line in response.text.splitlines())
    sorted_content = sorted(line for line in stripped_lines if line)

    with open(file_name, 'w', encoding='utf-8') as f:
        f.write('\n'.join(sorted_content))
    print("\nDictionary file downloaded and saved alphabetically as: '{}'.".format(file_name))
def load_dictionary(file: str) -> Set[str]:
    """
    Read a word-per-line text file into a set of lowercase strings.

    Each line is stripped of surrounding whitespace (including a stray
    carriage return) and blank lines are skipped, so an empty file yields
    an empty set rather than a set containing the empty string.

    Parameters:
        file (str): The name of the file to open.

    Returns:
        Set[str]: The lowercase, non-empty words found in the file.

    Notes:
        If the file cannot be opened or read, the error is printed and the
        program exits with status 1.
    """
    print("\nLoading dictionary file...\n")
    try:
        with open(file, encoding='utf-8') as in_file:
            cleaned = (line.strip().lower() for line in in_file)
            return {word for word in cleaned if word}
    except IOError as e:
        print("{}\nError opening {}. Terminating program.".format(e, file))
        sys.exit(1)
def ensure_word_in_dictionary(word: str, dict_file: str, words: Set[str]) -> Set[str]:
    """
    Ensure a specific word is in the dictionary, adding it if necessary.

    The word is compared in stripped, lowercase form. When it is missing, it is
    added to ``words`` (the set is modified in place) and the dictionary file is
    rewritten with every word in alphabetical order. A blank word is ignored so
    an empty entry is never written to the file.

    Parameters:
        word (str): The word to ensure is in the dictionary.
        dict_file (str): The name of the dictionary file.
        words (Set[str]): The set of words loaded from the dictionary.

    Returns:
        Set[str]: The same set object, including the specified word if it was added.
    """
    normalized = word.strip().lower()
    if normalized and normalized not in words:
        words.add(normalized)
        with open(dict_file, 'w', encoding='utf-8') as out_file:
            out_file.write('\n'.join(sorted(words)))
        print(f"Added '{word}' to the dictionary and sorted it alphabetically.")
    print("\nGathering Trigrams From Input Name...\n")
    return words
def generate_trigrams(name: str) -> Set[str]:
    """
    Generate unique letter triplets (trigrams) from the given name.

    A trigram is any ordered arrangement of three characters taken from three
    different positions of the lowercased name. This is the same set as the
    three-character windows of every full permutation of the name, but it is
    built directly from 3-permutations, so the work grows with n^3 instead of
    n! and long names remain practical.

    Parameters:
        name (str): The name to generate trigrams from.

    Returns:
        Set[str]: A set of unique trigrams; empty when the name has fewer
        than three characters.
    """
    # permutations(..., 3) yields nothing when fewer than three characters exist.
    return {''.join(triple) for triple in permutations(name.lower(), 3)}
def count_trigram_frequency(trigrams: Set[str], words: Set[str]) -> DefaultDict[str, int]:
    """
    Count how many words contain each trigram.

    Each word is lowercased and contributes at most one count per trigram,
    however many times the trigram occurs inside it. Trigrams are matched as
    literal substrings, so characters such as '.', '(' or '*' in a name are
    compared verbatim rather than being interpreted as a regular expression.

    Parameters:
        trigrams (Set[str]): A set of trigrams to count.
        words (Set[str]): A set of words to search for trigrams.

    Returns:
        DefaultDict[str, int]: Trigram -> number of words containing it.
        Trigrams that occur in no word are absent from the mapping.
    """
    mapped: DefaultDict[str, int] = defaultdict(int)
    for word in words:
        lowered = word.lower()
        for trigram in trigrams:
            if trigram in lowered:
                mapped[trigram] += 1
    return mapped
def main():
    """
    Run the script: prepare the dictionary, show the options menu, and report trigrams.

    The user either runs the demo name 'Waldo' or enters a name. The chosen
    name is added to the dictionary if missing, its trigrams are generated,
    and the number of dictionary words containing each trigram is printed.
    The program exits with status 1 on an invalid menu choice or an empty name.
    """
    # Define dictionary URL and file name
    dictionary_url = "https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt"
    dictionary_file = "dictionary.txt"

    # Download dictionary if it doesn't exist, then load it into a set
    download_dictionary(dictionary_url, dictionary_file)
    dictionary_set = load_dictionary(dictionary_file)

    print("\n:: TRIGRAMS OPTION MENU ::\n")
    print("1. Run demo with 'Waldo'")
    print("2. Input your own name")
    # Strip so that an accidental space around the digit is still accepted.
    choice = input("\nEnter your choice (1 or 2): ").strip()

    if choice == '1':
        name = 'Waldo'
    elif choice == '2':
        name = input("\nEnter your name: ").strip()
        if not name:
            # An empty name would add a blank entry to the dictionary and
            # produce no trigrams, so stop early instead.
            print("No name entered. Exiting.")
            sys.exit(1)
    else:
        print("Invalid choice. Exiting.")
        sys.exit(1)

    dictionary_set = ensure_word_in_dictionary(name, dictionary_file, dictionary_set)

    # Generate Trigrams
    trigrams_set = generate_trigrams(name)
    print("\nGenerated trigrams:\n")
    # Sorted so the listing is the same on every run (set order is arbitrary).
    print(*sorted(trigrams_set), sep='\n')
    print("\nNumber of trigrams = {}\n".format(len(trigrams_set)))

    # Get Trigram Frequency
    print("Counting Trigram Frequency...")
    print("\nThis may take some time depending on the length of the input.\n\n")
    trigram_frequencies = count_trigram_frequency(trigrams_set, dictionary_set)
    print("Trigram frequency count:\n")
    for trigram, count in sorted(trigram_frequencies.items()):
        print("{} {}".format(trigram, count))
# Script entry point: run the interactive menu only when this file is executed
# directly, then print a farewell message once main() returns.
if __name__ == "__main__":
    main()
    print("\nExiting Program... GoodBye!\n")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement