Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import math
- import doctest
- """
- This time I'm introducing a new testing tool. Instead of using asserts, I now write code directly into each function's docstring.
- Lines that start with >>> represent the code I want to run. The following lines without >>> represent the output I would expect
- if I were to run this code in the shell. This is called a "doctest."
- In the final line of this code file, I run `doctest.testmod()` which triggers all of the tests.
- You will be able to see the result of all functions' tests. Even if assertions for some functions fail,
- assertions for other functions are still run. So, doctest is a very flexible tool for testing code.
- ---
- This homework builds directly on the previous one.
- Read the docstring of each function here and copy the correct function body from your previous homework.
- You might have to modify your code a bit to fit the new function description.
- I've already done a few functions for you. You should study the solutions.
- """
# already done for you
def get_text_tf(text):
    """
    Arguments:
        text - A string representing a cleaned text.
    Returns:
        dict from str -> float.
        Each word is mapped to the fraction of times it appears in the text.
        There are 12 words in this example, so we must divide by 12.
    >>> example = 'hi my name is my name is gunn gunn hi gunn yo'
    >>> get_text_tf(example) == {'hi': 2/12, 'my': 2/12, 'name': 2/12, 'is': 2/12, 'gunn': 3/12, 'yo': 1/12}
    True
    """
    tokens = text.split()
    total = len(tokens)
    # Tally raw occurrences first; dict.get supplies 0 for unseen words.
    counts = {}
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1
    # Convert raw counts to fractions in a single dict comprehension.
    # Dividing once at the end avoids the numerical imprecision that could
    # accumulate if we added 1/total on every occurrence instead.
    return {token: occurrences / total for token, occurrences in counts.items()}
# already done for you
def get_text_words(text):
    """
    Arguments:
        text - A string representing the cleaned text.
    Returns:
        set of str containing all words that appears in the text.
    >>> example = 'hi my name is my name is gunn gunn hi gunn yo'
    >>> get_text_words(example) == {'hi', 'is', 'yo', 'my', 'gunn', 'name'}
    True
    """
    # A set comprehension over the split words drops duplicates automatically.
    return {word for word in text.split()}
# your homework
def get_corpus_words(corpus):
    """
    Arguments:
        corpus - dict: str -> str. Key is document file name and value is the document's cleaned contents.
    Returns:
        set of str containing all words that appears in any document.
    >>> corpus = {
    ...     'gunn.txt': 'hi my name is my name is gunn gunn hi gunn yo',
    ...     'pfizer.txt': 'hi i want pfizer vaccine why cant thai government give me',
    ...     'thai.txt': 'hi i am thai'
    ... }
    >>> get_corpus_words(corpus) == {
    ...     'vaccine', 'thai', 'name', 'me', 'i', 'cant', 'yo', 'give', 'am',
    ...     'hi', 'my', 'gunn', 'why', 'want', 'pfizer', 'is', 'government'
    ... }
    True
    """
    # corpus is now a dict, so we loop over corpus.values() (the document texts)
    # and union every document's words into a single set.
    words = set()
    for text in corpus.values():
        words.update(text.split())
    return words
# your homework
def get_corpus_df(corpus):
    """
    Arguments:
        corpus - dict: str -> str. Key is document file name and value is the document's cleaned contents.
    Returns:
        dict: str -> float. Key is word and value is the fraction of documents which contain that word.
    >>> corpus = {
    ...     'gunn.txt': 'hi my name is my name is gunn gunn hi gunn yo',
    ...     'pfizer.txt': 'hi i want pfizer vaccine why cant thai government give me',
    ...     'thai.txt': 'hi i am thai'
    ... }
    >>> get_corpus_df(corpus) == {
    ...     'hi': 3/3, 'pfizer': 1/3, 'want': 1/3, 'name': 1/3, 'i': 2/3,
    ...     'why': 1/3, 'give': 1/3, 'me': 1/3, 'thai': 2/3, 'is': 1/3, 'gunn': 1/3, 'cant': 1/3,
    ...     'government': 1/3, 'my': 1/3, 'am': 1/3, 'vaccine': 1/3, 'yo': 1/3
    ... }
    True
    """
    # Count, for each word, how many documents contain it at least once.
    # Using set(text.split()) ensures a word is counted once per document
    # no matter how many times it repeats inside that document.
    doc_counts = {}
    for text in corpus.values():
        for word in set(text.split()):
            doc_counts[word] = doc_counts.get(word, 0) + 1
    total_docs = len(corpus)
    # Divide once at the end; an empty corpus yields {} with no division by zero.
    return {word: count / total_docs for word, count in doc_counts.items()}
def get_tf_table(corpus):
    """
    Arguments:
        corpus - dict: str -> str. Key is document file name and value is the document's cleaned contents.
    Returns:
        dict: str -> (dict: str -> float).
        This is a nested dictionary. The outer dictionary maps from file name to inner dictionary.
        The inner dictionary contains the dictionary obtained by calling `get_text_tf` on that file's contents.
        This basically allows us to compute given any document D and word w,
            tf(D, w): the term frequency of w in D (the fraction of times that w appears in D).
        (See the doctest for examples on how you access tf_table.)
        Note that this is different from df which only takes a word as an input, because document frequency of
        a word is a property of the entire corpus, not a single document.
            df(w): the fraction of documents in the corpus that contains the word w.
    >>> minicorpus = {
    ...     'a.txt': 'hi hi gunn',
    ...     'b.txt': 'gunn yo hi meh'
    ... }
    >>> tf_table = get_tf_table(minicorpus)
    >>> tf_table['a.txt']['hi'] == 2/3
    True
    >>> tf_table['b.txt']['yo'] == 1/4
    True
    >>> tf_table == {
    ...     'a.txt': {'hi': 2/3, 'gunn': 1/3},
    ...     'b.txt': {'gunn': 1/4, 'yo': 1/4, 'hi': 1/4, 'meh': 1/4}
    ... }
    True
    """
    # Delegate the per-document work to get_text_tf; this function only
    # organizes the results into the outer filename -> tf dict.
    tf_table = {}
    for filename, text in corpus.items():
        tf_table[filename] = get_text_tf(text)
    return tf_table
def get_tfidf_table(corpus):
    """
    Arguments:
        corpus - dict: str -> str. Key is document file name and value is the document's cleaned contents.
    Returns:
        dict: str -> (dict: str -> float).
        This is a nested dictionary. The outer dictionary maps from file name to inner dictionary.
        The inner dictionary contains the dictionary that maps from each of that document's word to its tf-idf value.
        tf-idf value of a word `w` in a document `D` is computed as:
            tf-idf(w, D) = tf(w, D) * log(1 / df(w))
        In Python you can apply logarithm function via math.log(x).
    >>> minicorpus = {
    ...     'a.txt': 'hi hi gunn',
    ...     'b.txt': 'gunn yo hi meh'
    ... }
    >>> tfidf_table = get_tfidf_table(minicorpus)
    >>> tfidf_table['b.txt']['yo']
    0.17328679513998632
    >>> tfidf_table == {
    ...     'a.txt': {'hi': 0.0, 'gunn': 0.0},
    ...     'b.txt': {'gunn': 0.0, 'yo': 0.17328679513998632, 'hi': 0.0, 'meh': 0.17328679513998632}
    ... }
    True
    """
    tf_table = get_tf_table(corpus)
    df = get_corpus_df(corpus)
    tfidf_table = {}
    for filename, tf in tf_table.items():
        # Every word in this document appears in at least one document,
        # so df[word] > 0 and the division/log below are always safe.
        tfidf_table[filename] = {
            word: value * math.log(1 / df[word]) for word, value in tf.items()
        }
    return tfidf_table
# already done for you
def get_best_words_table(corpus, max=None):
    """
    Arguments:
        corpus - dict: str -> str. Key is document file name and value is the document's cleaned contents.
        max - int. The number of important words we want to take. If `max` is None, all words are taken.
    Returns:
        dict: str -> list of str.
        The dictionary maps from filename to the list of at most `max` important words, sorted in decreasing order of importance.
    >>> corpus = {
    ...     'gunn.txt': 'hi my name is my name is gunn gunn hi gunn yo',
    ...     'pfizer.txt': 'hi i want pfizer vaccine why cant thai government give me',
    ...     'thai.txt': 'hi i am thai'
    ... }
    >>> get_best_words_table(corpus, 3) == {
    ...     'gunn.txt': ['gunn', 'my', 'name'],
    ...     'pfizer.txt': ['want', 'pfizer', 'vaccine'],
    ...     'thai.txt': ['am', 'i', 'thai']
    ... }
    True
    """
    # you don't have to understand this code yet. i will explain in class.
    tfidf_table = get_tfidf_table(corpus)
    best_table = {}
    for filename, tfidf in tfidf_table.items():
        # Rank this document's words by descending tf-idf score,
        # then keep only the first `max` of them ([:None] keeps everything).
        ranked = sorted(tfidf, key=tfidf.get, reverse=True)
        best_table[filename] = ranked[:max]
    return best_table
# variables defined for you so you can test with the shell easily
# `example` is one cleaned text; `minicorpus` and `corpus` map file names to
# cleaned document contents — the same fixtures used in the doctests above.
example = 'hi my name is my name is gunn gunn hi gunn yo'
minicorpus = {
    'a.txt': 'hi hi gunn',
    'b.txt': 'gunn yo hi meh'
}
corpus = {
    'gunn.txt': 'hi my name is my name is gunn gunn hi gunn yo',
    'pfizer.txt': 'hi i want pfizer vaccine why cant thai government give me',
    'thai.txt': 'hi i am thai'
}
# perform all tests
# testmod() scans every docstring in this module and runs each >>> example,
# reporting failures per function without stopping at the first one.
doctest.testmod()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement