Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import math
- import doctest
- """
- This time I'm introducing a new testing tool. Instead of using asserts, I now write code directly into each function's docstring.
- Lines that start with >>> represent the code I want to run. The following lines without >>> represent the output I would expect
- if I were to run this code in the shell. This is called a "doctest."
- In the final line of this code file, I run `doctest.testmod()` which triggers all of the tests.
- You will be able to see the result of all functions' tests. Even if assertions for some functions fail,
- assertions for other functions are still run. So, doctest is a very flexible tool for testing code.
- ---
- This homework builds directly on the previous one.
- Read the docstring of each function here and copy the correct function body from your previous homework.
- You might have to modify your code a bit to fit the new function description.
- I've already done a few functions for you. You should study the solutions.
- """
# already done for you
def get_text_tf(text):
    """
    Arguments:
        text - A string representing a cleaned text.
    Returns:
        dict from str -> float.
        Each word is mapped to the fraction of times it appears in the text.
        There are 12 words in this example, so we must divide by 12.
    >>> example = 'hi my name is my name is gunn gunn hi gunn yo'
    >>> get_text_tf(example) == {'hi': 2/12, 'my': 2/12, 'name': 2/12, 'is': 2/12, 'gunn': 3/12, 'yo': 1/12}
    True
    """
    tokens = text.split()
    total = len(tokens)
    # Tally raw occurrences first; dict.get supplies 0 for unseen words.
    counts = {}
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1
    # Convert raw counts to fractions in a single dict comprehension.
    # Dividing once at the end avoids the numerical imprecision that could
    # accumulate if we added 1/total on every occurrence instead.
    return {token: occurrences / total for token, occurrences in counts.items()}
# already done for you
def get_text_words(text):
    """
    Arguments:
        text - A string representing the cleaned text.
    Returns:
        set of str containing all words that appears in the text.
    >>> example = 'hi my name is my name is gunn gunn hi gunn yo'
    >>> get_text_words(example) == {'hi', 'is', 'yo', 'my', 'gunn', 'name'}
    True
    """
    # A set comprehension over the split words drops duplicates automatically.
    return {word for word in text.split()}
# your homework
def get_corpus_words(corpus):
    """
    Arguments:
        corpus - dict: str -> str. Key is document file name and value is the document's cleaned contents.
    Returns:
        set of str containing all words that appears in any document.
    >>> corpus = {
    ...     'gunn.txt': 'hi my name is my name is gunn gunn hi gunn yo',
    ...     'pfizer.txt': 'hi i want pfizer vaccine why cant thai government give me',
    ...     'thai.txt': 'hi i am thai'
    ... }
    >>> get_corpus_words(corpus) == {
    ...     'vaccine', 'thai', 'name', 'me', 'i', 'cant', 'yo', 'give', 'am',
    ...     'hi', 'my', 'gunn', 'why', 'want', 'pfizer', 'is', 'government'
    ... }
    True
    """
    # corpus is now a dict, so we loop over corpus.values() (the document texts)
    # and union every document's words into a single set.
    words = set()
    for text in corpus.values():
        words.update(text.split())
    return words
# your homework
def get_corpus_df(corpus):
    """
    Arguments:
        corpus - dict: str -> str. Key is document file name and value is the document's cleaned contents.
    Returns:
        dict: str -> float. Key is word and value is the fraction of documents which contain that word.
    >>> corpus = {
    ...     'gunn.txt': 'hi my name is my name is gunn gunn hi gunn yo',
    ...     'pfizer.txt': 'hi i want pfizer vaccine why cant thai government give me',
    ...     'thai.txt': 'hi i am thai'
    ... }
    >>> get_corpus_df(corpus) == {
    ...     'hi': 3/3, 'pfizer': 1/3, 'want': 1/3, 'name': 1/3, 'i': 2/3,
    ...     'why': 1/3, 'give': 1/3, 'me': 1/3, 'thai': 2/3, 'is': 1/3, 'gunn': 1/3, 'cant': 1/3,
    ...     'government': 1/3, 'my': 1/3, 'am': 1/3, 'vaccine': 1/3, 'yo': 1/3
    ... }
    True
    """
    # Count, for each word, how many documents contain it at least once.
    # Using set(text.split()) ensures a word is counted once per document
    # no matter how many times it repeats inside that document.
    doc_counts = {}
    for text in corpus.values():
        for word in set(text.split()):
            doc_counts[word] = doc_counts.get(word, 0) + 1
    total_docs = len(corpus)
    # Divide once at the end; an empty corpus yields {} with no division by zero.
    return {word: count / total_docs for word, count in doc_counts.items()}
def get_tf_table(corpus):
    """
    Arguments:
        corpus - dict: str -> str. Key is document file name and value is the document's cleaned contents.
    Returns:
        dict: str -> (dict: str -> float).
        This is a nested dictionary. The outer dictionary maps from file name to inner dictionary.
        The inner dictionary contains the dictionary obtained by calling `get_text_tf` on that file's contents.
        This basically allows us to compute given any document D and word w,
            tf(D, w): the term frequency of w in D (the fraction of times that w appears in D).
        (See the doctest for examples on how you access tf_table.)
        Note that this is different from df which only takes a word as an input, because document frequency of
        a word is a property of the entire corpus, not a single document.
            df(w): the fraction of documents in the corpus that contains the word w.
    >>> minicorpus = {
    ...     'a.txt': 'hi hi gunn',
    ...     'b.txt': 'gunn yo hi meh'
    ... }
    >>> tf_table = get_tf_table(minicorpus)
    >>> tf_table['a.txt']['hi'] == 2/3
    True
    >>> tf_table['b.txt']['yo'] == 1/4
    True
    >>> tf_table == {
    ...     'a.txt': {'hi': 2/3, 'gunn': 1/3},
    ...     'b.txt': {'gunn': 1/4, 'yo': 1/4, 'hi': 1/4, 'meh': 1/4}
    ... }
    True
    """
    # Delegate the per-document work to get_text_tf; this function only
    # organizes the results into the outer filename -> tf dict.
    tf_table = {}
    for filename, text in corpus.items():
        tf_table[filename] = get_text_tf(text)
    return tf_table
def get_tfidf_table(corpus):
    """
    Arguments:
        corpus - dict: str -> str. Key is document file name and value is the document's cleaned contents.
    Returns:
        dict: str -> (dict: str -> float).
        This is a nested dictionary. The outer dictionary maps from file name to inner dictionary.
        The inner dictionary contains the dictionary that maps from each of that document's word to its tf-idf value.
        tf-idf value of a word `w` in a document `D` is computed as:
            tf-idf(w, D) = tf(w, D) * log(1 / df(w))
        In Python you can apply logarithm function via math.log(x).
    >>> minicorpus = {
    ...     'a.txt': 'hi hi gunn',
    ...     'b.txt': 'gunn yo hi meh'
    ... }
    >>> tfidf_table = get_tfidf_table(minicorpus)
    >>> tfidf_table['b.txt']['yo']
    0.17328679513998632
    >>> tfidf_table == {
    ...     'a.txt': {'hi': 0.0, 'gunn': 0.0},
    ...     'b.txt': {'gunn': 0.0, 'yo': 0.17328679513998632, 'hi': 0.0, 'meh': 0.17328679513998632}
    ... }
    True
    """
    tf_table = get_tf_table(corpus)
    df = get_corpus_df(corpus)
    tfidf_table = {}
    for filename, tf in tf_table.items():
        # Every word in this document appears in at least one document,
        # so df[word] > 0 and the division/log below are always safe.
        tfidf_table[filename] = {
            word: value * math.log(1 / df[word]) for word, value in tf.items()
        }
    return tfidf_table
# already done for you
def get_best_words_table(corpus, max=None):
    """
    Arguments:
        corpus - dict: str -> str. Key is document file name and value is the document's cleaned contents.
        max - int. The number of important words we want to take. If `max` is None, all words are taken.
    Returns:
        dict: str -> list of str.
        The dictionary maps from filename to the list of at most `max` important words, sorted in decreasing order of importance.
    >>> corpus = {
    ...     'gunn.txt': 'hi my name is my name is gunn gunn hi gunn yo',
    ...     'pfizer.txt': 'hi i want pfizer vaccine why cant thai government give me',
    ...     'thai.txt': 'hi i am thai'
    ... }
    >>> get_best_words_table(corpus, 3) == {
    ...     'gunn.txt': ['gunn', 'my', 'name'],
    ...     'pfizer.txt': ['want', 'pfizer', 'vaccine'],
    ...     'thai.txt': ['am', 'i', 'thai']
    ... }
    True
    """
    # you don't have to understand this code yet. i will explain in class.
    tfidf_table = get_tfidf_table(corpus)
    best_table = {}
    for filename, tfidf in tfidf_table.items():
        # Rank this document's words by descending tf-idf score,
        # then keep only the first `max` of them ([:None] keeps everything).
        ranked = sorted(tfidf, key=tfidf.get, reverse=True)
        best_table[filename] = ranked[:max]
    return best_table
# variables defined for you so you can test with the shell easily
# `example` is one cleaned text; `minicorpus` and `corpus` map file names to
# cleaned document contents — the same fixtures used in the doctests above.
example = 'hi my name is my name is gunn gunn hi gunn yo'
minicorpus = {
    'a.txt': 'hi hi gunn',
    'b.txt': 'gunn yo hi meh'
}
corpus = {
    'gunn.txt': 'hi my name is my name is gunn gunn hi gunn yo',
    'pfizer.txt': 'hi i want pfizer vaccine why cant thai government give me',
    'thai.txt': 'hi i am thai'
}
# perform all tests
# testmod() scans every docstring in this module and runs each >>> example,
# reporting failures per function without stopping at the first one.
doctest.testmod()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement