pagerank.py

import os
import random
import re
import sys
import numpy as np

DAMPING = 0.85
SAMPLES = 10000
TOLERANCE = 1e-3  # Error tolerance = ±0.001 when comparing sample and iterate results


def main():
    if len(sys.argv) != 2:
        sys.exit("Usage: python pagerank.py corpus")
    corpus = crawl(sys.argv[1])
    ranks = sample_pagerank(corpus, DAMPING, SAMPLES)
    print(f"PageRank Results from Sampling (n = {SAMPLES})")
    for page in sorted(ranks):
        print(f"  {page}: {ranks[page]:.4f}")
    ranks = iterate_pagerank(corpus, DAMPING)
    print(f"PageRank Results from Iteration")
    for page in sorted(ranks):
        print(f"  {page}: {ranks[page]:.4f}")


def crawl(directory):
    """
    Parse a directory of HTML pages and check for links to other pages.
    Return a dictionary where each key is a page, and values are
    a list of all other pages in the corpus that are linked to by the page.
    """
    pages = dict()

    # Extract all links from HTML files
    for filename in os.listdir(directory):
        if not filename.endswith(".html"):
            continue
        with open(os.path.join(directory, filename)) as f:
            contents = f.read()
            links = re.findall(r"<a\s+(?:[^>]*?)href=\"([^\"]*)\"", contents)
            pages[filename] = set(links) - {filename}

    # Only include links to other pages in the corpus
    for filename in pages:
        pages[filename] = set(
            link for link in pages[filename]
            if link in pages
        )

    return pages


def transition_model(corpus, page, damping_factor):
    """
    Return a probability distribution over which page to visit next,
    given a current page.

    With probability `damping_factor`, choose a link at random
    linked to by `page`. With probability `1 - damping_factor`, choose
    a link at random chosen from all pages in the corpus.
    """
    n = len(corpus[page])  # number of out going links of page
    N = len(corpus)  # number of pages in corpus
    # transiton model for page

    for k, v in corpus.items():
        if k == page:  # we are only intetrested in transition model for page page!
            if not v:  # if a page has no link we pretend it has links to all pages in the corpus
                tm = {k: 1/N for k in corpus.keys()}

            else:
                # probability to choose any single page
                p = (1-damping_factor)/N
                tm = {k: p for k in corpus.keys()}
                for link in v:
                    # we add the probability of choosing link from k page
                    tm[link] += damping_factor/n
    return tm


def sample_pagerank(corpus, damping_factor, nb_samples):
    """
    Return PageRank values for each page by sampling `n` pages
    according to transition model, starting with a page at random.

    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """

    generated_samples = []
    starting_page = random.choice(list(corpus.keys()))

    generate_samples(corpus, damping_factor, nb_samples,
                     generated_samples, starting_page)
    return compute_probabilities(corpus, nb_samples, generated_samples)


def iterate_pagerank(corpus, damping_factor):
    """
    Return PageRank values for each page by iteratively updating
    PageRank values until convergence.

    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """

    N = len(corpus)  # number of pages in corpus
    p = (1-damping_factor)/N  # probability to choose any single page
    # transiton model for page
    # initial pr value : all pages have same probability to be chosen
    initial = {k: 1/N for k in corpus.keys()}
    next = calculate_pr(corpus, damping_factor, initial, N)

    while has_not_converged(initial, next):
        initial = next
        # calculate new pr values
        next = calculate_pr_fast(corpus, damping_factor, initial, N)

       # next = calculate_pr_power_iteration(corpus, damping_factor, initial, N)

    return next


################## HELPER FUNCTIONS ####################


def generate_trans_models(corpus, damping_factor):
    trans_mod = {}  # dictionary for transition models of corpus pages
    for page in corpus:
        tmp = transition_model(corpus, page, damping_factor)
        trans_mod[page] = tmp
    return trans_mod


def generate_samples(corpus, damping_factor, nb_samples, generated_samples, page):

    tm = generate_trans_models(corpus, damping_factor)

    for i in range(nb_samples):  # compute n samples
        # returns a random page from list of keys according to
        # probability distribution of transition model for page
        kys = list(tm[page].keys())
        vals = list(tm[page].values())
        next_page = random.choices(kys, vals, k=1).pop()
        generated_samples.append(next_page)
        page = next_page


def compute_probabilities(corpus, n, samples):
    pr = {}
    for k in corpus.keys():
        n_k = samples.count(k)
        pr[k] = n_k/n
    return pr


def has_not_converged(previous, current):
    res = []
    for page in current:
        if np.abs(current[page]-previous[page]) > TOLERANCE:
            res.append(1)
        else:
            res.append(0)

    return any(res) == 1


def calculate_pr(corpus, damping_factor, previous_pr, N):
    # first terme of formula
    next_pr = {k: (1-damping_factor)/N for k in corpus.keys()}

    for page, links in corpus.items():
        if links:
            for key in corpus.keys():
                if key in links:  # we add that link
                    next_pr[key] += damping_factor * \
                        previous_pr[page]/len(links)

        else:  # page has no outgoing links.
            # We can pretend it has a link to all other N pages including itself.
            for key in corpus.keys():
                # we add an extra and artificial incoming link to all N pages
                next_pr[key] += damping_factor * previous_pr[page] / N
    return next_pr


# CS50 Duck : you're not checking every possible pair of pages, but rather,
# for each page, you're only looking at its outgoing links.
# This process has a time complexity of O(N * M), where N is the number of pages
# and M is the average number of links per page. If M is much smaller than N,
# this can be significantly faster than a quadratic time complexity.
def calculate_pr_fast(corpus, damping_factor, previous_pr, N):
    page_links_outgoing = precompute_outgoing_links(corpus, N)
    page_links_incoming = precompute_incoming_links(corpus)

    # first terme of formula
    next_pr = {k: (1-damping_factor)/N for k in corpus.keys()}
    # second terme of formula
    for page in corpus.keys():
        for l in page_links_incoming[page]:
            next_pr[page]+= damping_factor * previous_pr[l]/page_links_outgoing[l]

    return next_pr

def precompute_incoming_links(corpus):
    page_links_incoming = {k: set() for k in corpus.keys()}
    for page, links in corpus.items():
        # Initialize an empty set for that page in the dictionary
        if links:
            for l in links:  # For each link on that page:
                # Add the page to the set of the linked page in the dictionary.
                page_links_incoming[l].add(page)
        else: # page has no links so a link to every page including itself
            for key in corpus.keys(): # if a page has no links,
                # we can pretend it has links to all pages in the corpus, including itself.)
                page_links_incoming[key].add(page)
    return page_links_incoming

def precompute_outgoing_links(corpus, N):
    page_links_outgoing = {}
    # Precompute number of outgoing links from each page
    for p in corpus.keys():
        if corpus[p]:
            page_links_outgoing[p] = len(corpus[p])
        # if a page has no links,
        # we can pretend it has links to all pages in the corpus, including itself.)
        else:
            page_links_outgoing[p] = N
    return page_links_outgoing


if __name__ == "__main__":
    main()