Advertisement
Python253

ipp3_1_bigrams

Jun 1st, 2024
486
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.75 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # Filename: ipp3_1_bigrams.py
  4. # Version: 1.0.0
  5. # Author: Jeoi Reqi
  6.  
  7. """
  8. Description:
  9.    - This script demonstrates "Chapter 3: Practice Project #1 - Bigrams" from the book "Impractical Python Projects" by Lee Vaughan.
  10.    - The script generates letter pairs (bigrams) from a given name and finds their frequency in a dictionary of English words.
  11.  
  12. Requirements:
  13.    - Python 3.x
  14.    - The following modules:
  15.        - re
  16.        - sys
  17.        - requests
  18.        - collections
  19.        - itertools
  20.        - typing
  21.  
  22. Functions:
  23.    - download_dictionary:
  24.        Downloads a dictionary file from a given URL and saves it alphabetically.
  25.    - load_dictionary:
  26.        Loads a dictionary file into a set of lowercase strings.
  27.    - ensure_word_in_dictionary:
  28.        Ensures a specific word is in the dictionary, adding it if necessary.
  29.    - generate_bigrams:
  30.        Generates unique letter pairs (bigrams) from a given name.
  31.    - count_bigram_frequency:
  32.        Counts the frequency of bigrams in a list of words.
  33.  
  34. Usage:
  35.    - Ensure you have Python 3.x installed on your system.
  36.    - This script can automatically download & save the 'dictionary.txt' file from a specified URL to the current working directory.
  37.    - Run the script and follow the options menu to either use a demo name or input your own name.
  38.    - The script generates bigrams from the input name and counts their frequency in the dictionary of English words.
  39.  
  40. Additional Notes:
  41.    - The 'dictionary.txt' file is downloaded and saved to the current working directory only if it is not already present when this script is run.
  42.    - This script offers an options menu to either run a demo with the name 'Voldemort' or input a custom name.
  43.    - The script then generates bigrams from the input name and counts their frequency in a dictionary of English words.
  44. """
  45.  
  46. import re
  47. import sys
  48. import os
  49. import requests
  50. from collections import defaultdict
  51. from itertools import permutations
  52. from typing import Set, DefaultDict
  53.  
  54. def download_dictionary(url: str, file_name: str):
  55.     """
  56.    Download a dictionary file from a URL and save it alphabetically if it doesn't exist.
  57.    
  58.    Parameters:
  59.        url (str): The URL from which to download the dictionary file.
  60.        file_name (str): The name to save the downloaded file as.
  61.    
  62.    Raises:
  63.        requests.RequestException: If an error occurs during the HTTP request.
  64.    """
  65.     if not os.path.exists(file_name):
  66.         print("\nDownloading dictionary file from:\n" + "{}".format(url))
  67.         try:
  68.             response = requests.get(url)
  69.             response.raise_for_status()  # Check if the request was successful
  70.            
  71.             # Split the content by lines, strip whitespace, and sort alphabetically
  72.             sorted_content = sorted(line.strip() for line in response.text.strip().split('\n'))
  73.            
  74.             # Save the sorted content to the file
  75.             with open(file_name, 'w', encoding='utf-8') as f:
  76.                 f.write('\n'.join(sorted_content))
  77.            
  78.         except requests.RequestException as e:
  79.             print("\nError downloading dictionary from {}: {}".format(url, e))
  80.             sys.exit(1)
  81.         else:
  82.             print("\nDictionary file downloaded and saved alphabetically as: '{}'.".format(file_name))
  83.     else:
  84.         print("\nDictionary file '{}' already exists. Skipping download.".format(file_name))
  85.  
  86. def load_dictionary(file: str) -> Set[str]:
  87.     """
  88.    Open a text file & turn contents into a set of lowercase strings.
  89.    
  90.    Parameters:
  91.        file (str): The name of the file to open.
  92.    
  93.    Returns:
  94.        Set[str]: A set of lowercase strings containing the words from the file.
  95.    """
  96.     print("\nLoading dictionary file...")
  97.     try:
  98.         with open(file, encoding='utf-8') as in_file:
  99.             loaded_txt = in_file.read().strip().split('\n')
  100.             loaded_set = {x.lower() for x in loaded_txt}
  101.             return loaded_set
  102.     except IOError as e:
  103.         print("{}\nError opening {}. Terminating program.".format(e, file))
  104.         sys.exit(1)
  105.  
  106. def ensure_word_in_dictionary(word: str, dict_file: str, words: Set[str]) -> Set[str]:
  107.     """
  108.    Ensure a specific word is in the dictionary, adding it if necessary.
  109.    
  110.    Parameters:
  111.        word (str): The word to ensure is in the dictionary.
  112.        dict_file (str): The name of the dictionary file.
  113.        words (Set[str]): The set of words loaded from the dictionary.
  114.    
  115.    Returns:
  116.        Set[str]: The updated set of words including the specified word.
  117.    """
  118.     if word.lower() not in words:
  119.         words.add(word.lower())
  120.         sorted_words = sorted(words)
  121.         with open(dict_file, 'w', encoding='utf-8') as out_file:
  122.             out_file.write('\n'.join(sorted_words))
  123.         print(f"Added '{word}' to the dictionary and sorted it alphabetically.")
  124.         print("\nGathering Bigrams From Input Name...")
  125.     return words
  126.  
  127. def generate_bigrams(name: str) -> Set[str]:
  128.     """
  129.    Generate unique letter pairs (bigrams) from the given name.
  130.    
  131.    Parameters:
  132.        name (str): The name to generate bigrams from.
  133.    
  134.    Returns:
  135.        Set[str]: A set of unique bigrams.
  136.    """
  137.     name = name.lower()
  138.     bigrams = set()
  139.     perms = {''.join(i) for i in permutations(name)}
  140.     for perm in perms:
  141.         for i in range(0, len(perm) - 1):
  142.             bigrams.add(perm[i] + perm[i + 1])
  143.     return bigrams
  144.  
  145. def count_bigram_frequency(bigrams: Set[str], words: Set[str]) -> DefaultDict[str, int]:
  146.     """
  147.    Count the frequency of bigrams in a list of words.
  148.    
  149.    Parameters:
  150.        bigrams (Set[str]): A set of bigrams to count.
  151.        words (Set[str]): A set of words to search for bigrams.
  152.    
  153.    Returns:
  154.        DefaultDict[str, int]: A dictionary with bigram frequencies.
  155.    """
  156.     mapped: DefaultDict[str, int] = defaultdict(int)
  157.     for word in words:
  158.         word = word.lower()
  159.         for bigram in bigrams:
  160.             if re.search(bigram, word):
  161.                 mapped[bigram] += 1
  162.     return mapped
  163.  
  164. def main():
  165.     """
  166.    Main function to run the script with an options menu.
  167.    """
  168.     # Define dictionary URL and file name
  169.     dictionary_url = "https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt"
  170.     dictionary_file = "dictionary.txt"
  171.  
  172.     # Download dictionary if it doesn't exist
  173.     download_dictionary(dictionary_url, dictionary_file)
  174.  
  175.     # Load dictionary into a set
  176.     dictionary_set = load_dictionary(dictionary_file)
  177.  
  178.     print("\n:: BIGRAMS OPTION MENU ::\n")
  179.     print("1. Run demo with 'Voldemort'")
  180.     print("2. Input your own name")
  181.  
  182.     choice = input("\nEnter your choice (1 or 2): ")
  183.  
  184.     if choice == '1':
  185.         name = 'Voldemort'
  186.         dictionary_set = ensure_word_in_dictionary(name, dictionary_file, dictionary_set)
  187.     elif choice == '2':
  188.         name = input("\nEnter your name: ").strip()
  189.         dictionary_set = ensure_word_in_dictionary(name, dictionary_file, dictionary_set)
  190.     else:
  191.         print("Invalid choice. Exiting.")
  192.         sys.exit(1)
  193.        
  194.     # Generate Bigrams
  195.     bigrams_set = generate_bigrams(name)
  196.     print("\nGenerated bigrams:\n")
  197.     print(*bigrams_set, sep='\n')
  198.     print("\nNumber of bigrams = {}\n".format(len(bigrams_set)))
  199.    
  200.     # Get Bigram Frequency
  201.     print("Counting Bigram Frequency...")
  202.     print("\nThis may take some time depending on the length of the input.\n\n")
  203.     bigram_frequencies = count_bigram_frequency(bigrams_set, dictionary_set)
  204.     print("\nBigram frequency count:\n")
  205.     for k in bigram_frequencies:
  206.         print("{} {}".format(k, bigram_frequencies[k]))
  207.  
  208. if __name__ == "__main__":
  209.     main()
  210.     print("\nExiting Program...   GoodBye!\n")
  211.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement