Advertisement
jules0707

pagerank.py

Dec 12th, 2023 (edited)
7
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 8.30 KB | None | 0 0
  1. import os
  2. import random
  3. import re
  4. import sys
  5. import numpy as np
  6.  
  7. DAMPING = 0.85
  8. SAMPLES = 10000
  9. TOLERANCE = 1e-3  # Error tolerance = ±0.001 when comparing sample and iterate results
  10.  
  11.  
  12. def main():
  13.     if len(sys.argv) != 2:
  14.         sys.exit("Usage: python pagerank.py corpus")
  15.     corpus = crawl(sys.argv[1])
  16.     ranks = sample_pagerank(corpus, DAMPING, SAMPLES)
  17.     print(f"PageRank Results from Sampling (n = {SAMPLES})")
  18.     for page in sorted(ranks):
  19.         print(f"  {page}: {ranks[page]:.4f}")
  20.     ranks = iterate_pagerank(corpus, DAMPING)
  21.     print(f"PageRank Results from Iteration")
  22.     for page in sorted(ranks):
  23.         print(f"  {page}: {ranks[page]:.4f}")
  24.  
  25.  
  26. def crawl(directory):
  27.     """
  28.    Parse a directory of HTML pages and check for links to other pages.
  29.    Return a dictionary where each key is a page, and values are
  30.    a list of all other pages in the corpus that are linked to by the page.
  31.    """
  32.     pages = dict()
  33.  
  34.     # Extract all links from HTML files
  35.     for filename in os.listdir(directory):
  36.         if not filename.endswith(".html"):
  37.             continue
  38.         with open(os.path.join(directory, filename)) as f:
  39.             contents = f.read()
  40.             links = re.findall(r"<a\s+(?:[^>]*?)href=\"([^\"]*)\"", contents)
  41.             pages[filename] = set(links) - {filename}
  42.  
  43.     # Only include links to other pages in the corpus
  44.     for filename in pages:
  45.         pages[filename] = set(
  46.             link for link in pages[filename]
  47.             if link in pages
  48.         )
  49.  
  50.     return pages
  51.  
  52.  
  53. def transition_model(corpus, page, damping_factor):
  54.     """
  55.    Return a probability distribution over which page to visit next,
  56.    given a current page.
  57.  
  58.    With probability `damping_factor`, choose a link at random
  59.    linked to by `page`. With probability `1 - damping_factor`, choose
  60.    a link at random chosen from all pages in the corpus.
  61.    """
  62.     n = len(corpus[page])  # number of out going links of page
  63.     N = len(corpus)  # number of pages in corpus
  64.     # transiton model for page
  65.  
  66.     for k, v in corpus.items():
  67.         if k == page:  # we are only intetrested in transition model for page page!
  68.             if not v:  # if a page has no link we pretend it has links to all pages in the corpus
  69.                 tm = {k: 1/N for k in corpus.keys()}
  70.  
  71.             else:
  72.                 # probability to choose any single page
  73.                 p = (1-damping_factor)/N
  74.                 tm = {k: p for k in corpus.keys()}
  75.                 for link in v:
  76.                     # we add the probability of choosing link from k page
  77.                     tm[link] += damping_factor/n
  78.     return tm
  79.  
  80.  
  81. def sample_pagerank(corpus, damping_factor, nb_samples):
  82.     """
  83.    Return PageRank values for each page by sampling `n` pages
  84.    according to transition model, starting with a page at random.
  85.  
  86.    Return a dictionary where keys are page names, and values are
  87.    their estimated PageRank value (a value between 0 and 1). All
  88.    PageRank values should sum to 1.
  89.    """
  90.  
  91.     generated_samples = []
  92.     starting_page = random.choice(list(corpus.keys()))
  93.  
  94.     generate_samples(corpus, damping_factor, nb_samples,
  95.                      generated_samples, starting_page)
  96.     return compute_probabilities(corpus, nb_samples, generated_samples)
  97.  
  98.  
  99. def iterate_pagerank(corpus, damping_factor):
  100.     """
  101.    Return PageRank values for each page by iteratively updating
  102.    PageRank values until convergence.
  103.  
  104.    Return a dictionary where keys are page names, and values are
  105.    their estimated PageRank value (a value between 0 and 1). All
  106.    PageRank values should sum to 1.
  107.    """
  108.  
  109.     N = len(corpus)  # number of pages in corpus
  110.     p = (1-damping_factor)/N  # probability to choose any single page
  111.     # transiton model for page
  112.     # initial pr value : all pages have same probability to be chosen
  113.     initial = {k: 1/N for k in corpus.keys()}
  114.     next = calculate_pr(corpus, damping_factor, initial, N)
  115.  
  116.     while has_not_converged(initial, next):
  117.         initial = next
  118.         # calculate new pr values
  119.         next = calculate_pr_fast(corpus, damping_factor, initial, N)
  120.  
  121.        # next = calculate_pr_power_iteration(corpus, damping_factor, initial, N)
  122.  
  123.     return next
  124.  
  125.  
  126. ################## HELPER FUNCTIONS ####################
  127.  
  128.  
  129. def generate_trans_models(corpus, damping_factor):
  130.     trans_mod = {}  # dictionary for transition models of corpus pages
  131.     for page in corpus:
  132.         tmp = transition_model(corpus, page, damping_factor)
  133.         trans_mod[page] = tmp
  134.     return trans_mod
  135.  
  136.  
  137. def generate_samples(corpus, damping_factor, nb_samples, generated_samples, page):
  138.  
  139.     tm = generate_trans_models(corpus, damping_factor)
  140.  
  141.     for i in range(nb_samples):  # compute n samples
  142.         # returns a random page from list of keys according to
  143.         # probability distribution of transition model for page
  144.         kys = list(tm[page].keys())
  145.         vals = list(tm[page].values())
  146.         next_page = random.choices(kys, vals, k=1).pop()
  147.         generated_samples.append(next_page)
  148.         page = next_page
  149.  
  150.  
  151. def compute_probabilities(corpus, n, samples):
  152.     pr = {}
  153.     for k in corpus.keys():
  154.         n_k = samples.count(k)
  155.         pr[k] = n_k/n
  156.     return pr
  157.  
  158.  
  159. def has_not_converged(previous, current):
  160.     res = []
  161.     for page in current:
  162.         if np.abs(current[page]-previous[page]) > TOLERANCE:
  163.             res.append(1)
  164.         else:
  165.             res.append(0)
  166.  
  167.     return any(res) == 1
  168.  
  169.  
  170. def calculate_pr(corpus, damping_factor, previous_pr, N):
  171.     # first terme of formula
  172.     next_pr = {k: (1-damping_factor)/N for k in corpus.keys()}
  173.  
  174.     for page, links in corpus.items():
  175.         if links:
  176.             for key in corpus.keys():
  177.                 if key in links:  # we add that link
  178.                     next_pr[key] += damping_factor * \
  179.                         previous_pr[page]/len(links)
  180.  
  181.         else:  # page has no outgoing links.
  182.             # We can pretend it has a link to all other N pages including itself.
  183.             for key in corpus.keys():
  184.                 # we add an extra and artificial incoming link to all N pages
  185.                 next_pr[key] += damping_factor * previous_pr[page] / N
  186.     return next_pr
  187.  
  188.  
  189. # CS50 Duck : you're not checking every possible pair of pages, but rather,
  190. # for each page, you're only looking at its outgoing links.
  191. # This process has a time complexity of O(N * M), where N is the number of pages
  192. # and M is the average number of links per page. If M is much smaller than N,
  193. # this can be significantly faster than a quadratic time complexity.
  194. def calculate_pr_fast(corpus, damping_factor, previous_pr, N):
  195.     page_links_outgoing = precompute_outgoing_links(corpus, N)
  196.     page_links_incoming = precompute_incoming_links(corpus)
  197.                    
  198.     # first terme of formula
  199.     next_pr = {k: (1-damping_factor)/N for k in corpus.keys()}
  200.     # second terme of formula
  201.     for page in corpus.keys():
  202.         for l in page_links_incoming[page]:
  203.             next_pr[page]+= damping_factor * previous_pr[l]/page_links_outgoing[l]
  204.  
  205.     return next_pr
  206.  
  207. def precompute_incoming_links(corpus):
  208.     page_links_incoming = {k: set() for k in corpus.keys()}
  209.     for page, links in corpus.items():
  210.         # Initialize an empty set for that page in the dictionary
  211.         if links:    
  212.             for l in links:  # For each link on that page:
  213.                 # Add the page to the set of the linked page in the dictionary.
  214.                 page_links_incoming[l].add(page)
  215.         else: # page has no links so a link to every page including itself
  216.             for key in corpus.keys(): # if a page has no links,
  217.                 # we can pretend it has links to all pages in the corpus, including itself.)
  218.                 page_links_incoming[key].add(page)
  219.     return page_links_incoming
  220.  
  221. def precompute_outgoing_links(corpus, N):
  222.     page_links_outgoing = {}
  223.     # Precompute number of outgoing links from each page
  224.     for p in corpus.keys():
  225.         if corpus[p]:
  226.             page_links_outgoing[p] = len(corpus[p])
  227.         # if a page has no links,
  228.         # we can pretend it has links to all pages in the corpus, including itself.)
  229.         else:
  230.             page_links_outgoing[p] = N
  231.     return page_links_outgoing
  232.  
  233.  
  234. if __name__ == "__main__":
  235.     main()
  236.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement