Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import os
- import random
- import re
- import sys
- import numpy as np
# Probability of following one of the current page's links; with probability
# (1 - DAMPING) the surfer jumps to a uniformly random page instead.
DAMPING = 0.85
# Number of random-walk samples drawn by sample_pagerank.
SAMPLES = 10000
TOLERANCE = 1e-3  # Error tolerance = ±0.001 when comparing sample and iterate results
def main():
    """Compute and print PageRank for a corpus by sampling and by iteration."""
    if len(sys.argv) != 2:
        sys.exit("Usage: python pagerank.py corpus")
    corpus = crawl(sys.argv[1])

    sampled = sample_pagerank(corpus, DAMPING, SAMPLES)
    print(f"PageRank Results from Sampling (n = {SAMPLES})")
    for page in sorted(sampled):
        print(f" {page}: {sampled[page]:.4f}")

    iterated = iterate_pagerank(corpus, DAMPING)
    print(f"PageRank Results from Iteration")
    for page in sorted(iterated):
        print(f" {page}: {iterated[page]:.4f}")
def crawl(directory):
    """
    Parse a directory of HTML pages and check for links to other pages.

    Return a dictionary where each key is a page (filename), and values are
    a set of all other pages in the corpus that are linked to by the page.
    Self-links and links to pages outside the corpus are discarded.
    """
    pages = dict()

    # Extract all links from HTML files
    for filename in os.listdir(directory):
        if not filename.endswith(".html"):
            continue
        with open(os.path.join(directory, filename)) as f:
            contents = f.read()
            links = re.findall(r"<a\s+(?:[^>]*?)href=\"([^\"]*)\"", contents)
            # Drop self-links: a page linking to itself carries no rank
            # information ("all OTHER pages" per the contract above).
            pages[filename] = set(links) - {filename}

    # Only include links to other pages in the corpus
    for filename in pages:
        pages[filename] = set(
            link for link in pages[filename]
            if link in pages
        )

    return pages
def transition_model(corpus, page, damping_factor):
    """
    Return a probability distribution over which page to visit next,
    given a current page.

    With probability `damping_factor`, choose a link at random
    linked to by `page`. With probability `1 - damping_factor`, choose
    a link at random chosen from all pages in the corpus.

    Returns a dict mapping every page in the corpus to a probability;
    the values sum to 1.
    """
    N = len(corpus)        # number of pages in corpus
    links = corpus[page]   # outgoing links of the current page
                           # (direct lookup; no need to scan corpus.items())

    if not links:
        # A page with no outgoing links is treated as linking to every
        # page in the corpus, so the next page is uniformly random.
        return {p: 1 / N for p in corpus}

    # Base probability of reaching any page via the random jump.
    jump = (1 - damping_factor) / N
    tm = {p: jump for p in corpus}
    # Each outgoing link additionally gets an equal share of damping_factor.
    share = damping_factor / len(links)
    for link in links:
        tm[link] += share
    return tm
def sample_pagerank(corpus, damping_factor, nb_samples):
    """
    Return PageRank values for each page by sampling `nb_samples` pages
    according to the transition model, starting with a page at random.

    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """
    samples = []
    # Pick a uniformly random starting page for the random walk.
    first_page = random.choice(list(corpus.keys()))
    generate_samples(corpus, damping_factor, nb_samples, samples, first_page)
    # Rank of a page = fraction of walk steps that landed on it.
    return compute_probabilities(corpus, nb_samples, samples)
def iterate_pagerank(corpus, damping_factor):
    """
    Return PageRank values for each page by iteratively updating
    PageRank values until convergence.

    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """
    N = len(corpus)  # number of pages in corpus
    # Initial rank: all pages equally likely.
    previous = {page: 1 / N for page in corpus}
    # Use the O(N * M) update consistently (the original mixed calculate_pr
    # for the first step and calculate_pr_fast afterwards; both compute the
    # same values). Avoids shadowing the builtin `next`.
    current = calculate_pr_fast(corpus, damping_factor, previous, N)
    # Iterate until no page's rank changes by more than TOLERANCE.
    while has_not_converged(previous, current):
        previous = current
        current = calculate_pr_fast(corpus, damping_factor, previous, N)
    return current
- ################## HELPER FUNCTIONS ####################
def generate_trans_models(corpus, damping_factor):
    """Return a dict mapping each corpus page to its transition model."""
    return {
        page: transition_model(corpus, page, damping_factor)
        for page in corpus
    }
def generate_samples(corpus, damping_factor, nb_samples, generated_samples, page):
    """
    Append `nb_samples` pages to `generated_samples` by walking the Markov
    chain defined by the transition models, starting from `page`.

    Mutates `generated_samples` in place (the caller owns the list);
    returns None.
    """
    tm = generate_trans_models(corpus, damping_factor)
    # The transition models are fixed, so build each page's key/weight lists
    # once instead of rebuilding them on every draw (the original did
    # list(tm[page].keys()) / .values() inside the sampling loop).
    choices = {p: (list(m.keys()), list(m.values())) for p, m in tm.items()}
    for _ in range(nb_samples):  # compute nb_samples samples
        # Draw the next page according to the current page's distribution.
        pages, weights = choices[page]
        page = random.choices(pages, weights, k=1).pop()
        generated_samples.append(page)
def compute_probabilities(corpus, n, samples):
    """
    Estimate PageRank as the fraction of the `n` samples that landed on
    each page of `corpus`. Returns a dict page -> probability in [0, 1].
    """
    # Tally in a single pass; samples.count(k) per page would rescan the
    # whole sample list for every page (O(pages * samples)).
    counts = {page: 0 for page in corpus}
    for sample in samples:
        counts[sample] += 1
    return {page: counts[page] / n for page in corpus}
def has_not_converged(previous, current):
    """
    Return True while any page's rank still differs from the previous
    iteration by more than TOLERANCE (i.e. iteration must continue).
    """
    # any() over a generator short-circuits at the first unconverged page,
    # instead of materializing a 0/1 list for the whole corpus; builtin abs
    # suffices for scalars (no need for np.abs).
    return any(
        abs(current[page] - previous[page]) > TOLERANCE
        for page in current
    )
def calculate_pr(corpus, damping_factor, previous_pr, N):
    """
    Compute one PageRank iteration step from `previous_pr`.

    Returns a new dict page -> rank using the standard formula:
    (1 - d)/N plus d times the rank each page inherits from pages
    linking to it.
    """
    # First term of the formula: the random-jump share, equal for all pages.
    next_pr = {page: (1 - damping_factor) / N for page in corpus}
    for page, links in corpus.items():
        if links:
            # Distribute this page's rank evenly over its outgoing links.
            # (Iterate the links directly; scanning every corpus key and
            # testing membership was O(N) per page for no benefit.)
            share = damping_factor * previous_pr[page] / len(links)
            for link in links:
                next_pr[link] += share
        else:
            # Page has no outgoing links: pretend it links to all N pages
            # (including itself), so every page gets an equal share.
            share = damping_factor * previous_pr[page] / N
            for key in corpus:
                next_pr[key] += share
    return next_pr
# Instead of examining every pair of pages, each page only walks its incoming
# links, so one iteration costs O(N * M), with M the average number of links
# per page — much cheaper than O(N^2) when M << N.
def calculate_pr_fast(corpus, damping_factor, previous_pr, N):
    """
    Compute one PageRank iteration step using precomputed link tables.
    Produces the same values as calculate_pr.
    """
    out_degree = precompute_outgoing_links(corpus, N)
    incoming = precompute_incoming_links(corpus)

    # First term of the formula: the uniform random-jump share.
    next_pr = {page: (1 - damping_factor) / N for page in corpus}
    # Second term: each page inherits a share of every source page's rank,
    # divided by that source's out-degree.
    for page in corpus:
        for source in incoming[page]:
            next_pr[page] += damping_factor * previous_pr[source] / out_degree[source]
    return next_pr
def precompute_incoming_links(corpus):
    """
    Map each page to the set of pages that link TO it.

    A page with no outgoing links is treated as linking to every page in
    the corpus (itself included), per the PageRank dangling-page rule.
    """
    incoming = {page: set() for page in corpus}
    for source, links in corpus.items():
        # Dangling source: counts as linking to every page in the corpus.
        targets = links if links else corpus.keys()
        for target in targets:
            incoming[target].add(source)
    return incoming
def precompute_outgoing_links(corpus, N):
    """
    Map each page to its out-degree.

    A page with no outgoing links counts as linking to all N pages in the
    corpus (itself included), per the PageRank dangling-page rule.
    """
    return {
        page: len(links) if links else N
        for page, links in corpus.items()
    }
- if __name__ == "__main__":
- main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement