# "Untitled" -- Pastebin paste by AquaBlitz11, Jul 14th, 2021 (Python, 8.96 KB).
import doctest
import math
from collections import Counter

"""
This time I'm introducing a new testing tool. Instead of using asserts, I now write code directly into each function's docstring.
Lines that start with >>> represent the code I want to run. The following lines without >>> are the outputs I would expect
if I were to run this code in the shell. This is called a "doctest."

On the final line of this code file, I run `doctest.testmod()`, which triggers all of the tests.
You will be able to see the results of every function's tests. Even if the tests for some functions fail,
the tests for the other functions are still run. So, doctest is a very flexible tool for testing code.

---

This homework builds directly on the previous one.

Read the docstring of each function here and copy the correct function body from your previous homework.
You might have to modify your code a bit to fit the new function description.

I've already done a few functions for you. You should study the solutions.
"""

  23. # already done for you
  24. def get_text_tf(text):
  25.     """
  26.    Arguments:
  27.        text - A string representing a cleaned text.
  28.    Returns:
  29.        dict from str -> float.
  30.        Each word is mapped to the fraction of times it appears in the text.
  31.  
  32.    There are 12 words in this example, so we must divide by 12.
  33.    >>> example = 'hi my name is my name is gunn gunn hi gunn yo'
  34.    >>> get_text_tf(example) == {'hi': 2/12, 'my': 2/12, 'name': 2/12, 'is': 2/12, 'gunn': 3/12, 'yo': 1/12}
  35.    True
  36.    """
  37.  
  38.     words = text.split()
  39.     cnt = {}
  40.     for word in words:
  41.         if word in cnt:
  42.             cnt[word] += 1
  43.         else:
  44.             cnt[word] = 1
  45.        
  46.     # instead of returning `cnt` directly, i constructed a new dictionary using list comprehension syntax
  47.     # so that for each (word, freq) key-value pair in `cnt`,
  48.     # the new dictionary has (word, fraction) key-value pair where fraction is freq/len(words)
  49.  
  50.     # (alternatively you could just add +1/len(words) every time you encounter a word,
  51.     # but that might result in accumulation of numerical imprecision. dividing at the end is much better.)
  52.  
  53.     return {word: freq/len(words) for word, freq in cnt.items()}
  54.  
  55. # already done for you
  56. def get_text_words(text):
  57.     """
  58.    Arguments:
  59.        text - A string representing the cleaned text.
  60.    Returns:
  61.        set of str containing all words that appears in the text.
  62.  
  63.    >>> example = 'hi my name is my name is gunn gunn hi gunn yo'
  64.    >>> get_text_words(example) == {'hi', 'is', 'yo', 'my', 'gunn', 'name'}
  65.    True
  66.    """
  67.  
  68.     # this is still the same
  69.     return set(text.split())
  70.  
  71. # your homework
  72. def get_corpus_words(corpus):
  73.     """
  74.    Arguments:
  75.        corpus - dict: str -> str. Key is document file name and value is the document's cleaned contents.
  76.    Returns:
  77.        set of str containing all words that appears in any document.
  78.  
  79.    >>> corpus = {
  80.    ...     'gunn.txt': 'hi my name is my name is gunn gunn hi gunn yo',
  81.    ...     'pfizer.txt': 'hi i want pfizer vaccine why cant thai government give me',
  82.    ...     'thai.txt': 'hi i am thai'
  83.    ... }
  84.    >>> get_corpus_words(corpus) == {
  85.    ...     'vaccine', 'thai', 'name', 'me', 'i', 'cant', 'yo', 'give', 'am',
  86.    ...     'hi', 'my', 'gunn', 'why', 'want', 'pfizer', 'is', 'government'
  87.    ... }
  88.    True
  89.    """
  90.  
  91.     # you have to change this function because corpus is now not a list.
  92.     # recall that you can loop over corpus.items() (like `enumerate`), corpus.keys() or corpus.values().
  93.     # therefore, you actually need to change just one or two lines from previous code.
  94.     pass
  95.  
  96. # your homework
  97. def get_corpus_df(corpus):
  98.     """
  99.    Arguments:
  100.        corpus - dict: str -> str. Key is document file name and value is the document's cleaned contents.
  101.    Returns:
  102.        dict: str -> float. Key is word and value is the fraction of documents which contain that word.
  103.  
  104.    >>> corpus = {
  105.    ...     'gunn.txt': 'hi my name is my name is gunn gunn hi gunn yo',
  106.    ...     'pfizer.txt': 'hi i want pfizer vaccine why cant thai government give me',
  107.    ...     'thai.txt': 'hi i am thai'
  108.    ... }
  109.    >>> get_corpus_df(corpus) == {
  110.    ...     'hi': 3/3, 'pfizer': 1/3, 'want': 1/3, 'name': 1/3, 'i': 2/3,
  111.    ...     'why': 1/3, 'give': 1/3, 'me': 1/3, 'thai': 2/3, 'is': 1/3, 'gunn': 1/3, 'cant': 1/3,
  112.    ...     'government': 1/3, 'my': 1/3, 'am': 1/3, 'vaccine': 1/3, 'yo': 1/3
  113.    ... }
  114.    True
  115.    """
  116.  
  117.     # this will require the same change as in `get_corpus_words`. should be very simple.
  118.     pass
  119.  
  120. def get_tf_table(corpus):
  121.     """
  122.    Arguments:
  123.        corpus - dict: str -> str. Key is document file name and value is the document's cleaned contents.
  124.    Returns:
  125.        dict: str -> (dict: str -> float).
  126.        This is a nested dictionary. The outer dictionary maps from file name to inner dictionary.
  127.        The inner dictionary contains the dictionary obtained by calling `get_text_tf` on that file's contents.
  128.  
  129.        This basically allows us to compute given any document D and word w,
  130.        tf(D, w): the term frequency of w in D (the fraction of times that w appears in D).
  131.        (See the doctest for examples on how you access tf_table.)
  132.  
  133.        Note that this is different from df which only takes a word as an input, because document frequency of
  134.        a word is a property of the entire corpus, not a single document.
  135.        df(w): the fraction of documents in the corpus that contains the word w.
  136.  
  137.    >>> minicorpus = {
  138.    ...     'a.txt': 'hi hi gunn',
  139.    ...     'b.txt': 'gunn yo hi meh'
  140.    ... }
  141.    >>> tf_table = get_tf_table(minicorpus)
  142.    >>> tf_table['a.txt']['hi'] == 2/3
  143.    True
  144.    >>> tf_table['b.txt']['yo'] == 1/4
  145.    True
  146.    >>> tf_table == {
  147.    ...     'a.txt': {'hi': 2/3, 'gunn': 1/3},
  148.    ...     'b.txt': {'gunn': 1/4, 'yo': 1/4, 'hi': 1/4, 'meh': 1/4}
  149.    ... }
  150.    True
  151.    """
  152.  
  153.     # i recommend starting with an empty tf_table = {}
  154.     # and then as you look through each filename, text in corpus.items()
  155.     # you set tf_table[filename] to the result you get from calling get_text_tf on that text.
  156.     pass
  157.  
  158. def get_tfidf_table(corpus):
  159.     """
  160.    Arguments:
  161.        corpus - dict: str -> str. Key is document file name and value is the document's cleaned contents.
  162.    Returns:
  163.        dict: str -> (dict: str -> float).
  164.        This is a nested dictionary. The outer dictionary maps from file name to inner dictionary.
  165.        The inner dictionary contains the dictionary that maps from each of that document's word to its tf-idf value.
  166.  
  167.        tf-idf value of a word `w` in a document `D` is computed as:
  168.            tf-idf(w, D) = tf(w, D) * log(1 / df(w))
  169.        In Python you can apply logarithm function via math.log(x).
  170.  
  171.    >>> minicorpus = {
  172.    ...     'a.txt': 'hi hi gunn',
  173.    ...     'b.txt': 'gunn yo hi meh'
  174.    ... }
  175.    >>> tfidf_table = get_tfidf_table(minicorpus)
  176.    >>> tfidf_table['b.txt']['yo']
  177.    0.17328679513998632
  178.    >>> tfidf_table == {
  179.    ...     'a.txt': {'hi': 0.0, 'gunn': 0.0},
  180.    ...     'b.txt': {'gunn': 0.0, 'yo': 0.17328679513998632, 'hi': 0.0, 'meh': 0.17328679513998632}
  181.    ... }
  182.    True
  183.    """
  184.  
  185.     tf_table = get_tf_table(corpus)
  186.     df = get_corpus_df(corpus)
  187.  
  188.     tfidf_table = {}
  189.     # fill in your code here
  190.    
  191.     return tfidf_table
  192.  
  193. # already done for you
  194. def get_best_words_table(corpus, max=None):
  195.     """
  196.    Arguments:
  197.        corpus - dict: str -> str. Key is document file name and value is the document's cleaned contents.
  198.        max - int. The number of important words we want to take. If `max` is None, all words are taken.
  199.    Returns:
  200.        dict: str -> list of str.
  201.        The dictionary maps from filename to the list of at most `max` important words, sorted in decreasing order of importance.
  202.  
  203.    >>> corpus = {
  204.    ...     'gunn.txt': 'hi my name is my name is gunn gunn hi gunn yo',
  205.    ...     'pfizer.txt': 'hi i want pfizer vaccine why cant thai government give me',
  206.    ...     'thai.txt': 'hi i am thai'
  207.    ... }
  208.    >>> get_best_words_table(corpus, 3) == {
  209.    ...     'gunn.txt': ['gunn', 'my', 'name'],
  210.    ...     'pfizer.txt': ['want', 'pfizer', 'vaccine'],
  211.    ...     'thai.txt': ['am', 'i', 'thai']
  212.    ... }
  213.    True
  214.    """
  215.  
  216.     # you don't have to understand this code yet. i will explain in class.
  217.     tfidf_table = get_tfidf_table(corpus)
  218.     best_table = {}
  219.     for filename, tfidf in tfidf_table.items():
  220.         best_table[filename] = list(sorted(tfidf.keys(), key=lambda word: tfidf[word], reverse=True)[:max])
  221.     return best_table
  222.  
  223. # variables defined for you so you can test with the shell easily
  224. example = 'hi my name is my name is gunn gunn hi gunn yo'
  225. minicorpus = {
  226.     'a.txt': 'hi hi gunn',
  227.     'b.txt': 'gunn yo hi meh'
  228. }
  229. corpus = {
  230.     'gunn.txt': 'hi my name is my name is gunn gunn hi gunn yo',
  231.     'pfizer.txt': 'hi i want pfizer vaccine why cant thai government give me',
  232.     'thai.txt': 'hi i am thai'
  233. }
  234.  
  235. # perform all tests
  236. doctest.testmod()
  237.  
# (Pastebin page footer -- advertisement and sign-in prompts -- is not part of the program.)