heredity.py

import csv
import itertools
import sys
import numpy

PROBS = {

    # Unconditional probabilities for having gene
    "gene": {
        2: 0.01,
        1: 0.03,
        0: 0.96
    },

    "trait": {

        # Probability of trait given two copies of gene
        2: {
            True: 0.65,
            False: 0.35
        },

        # Probability of trait given one copy of gene
        1: {
            True: 0.56,
            False: 0.44
        },

        # Probability of trait given no gene
        0: {
            True: 0.01,
            False: 0.99
        }
    },

    # Mutation probability
    "mutation": 0.01
}


def main():

    # Check for proper usage
    if len(sys.argv) != 2:
        sys.exit("Usage: python heredity.py data.csv")
    people = load_data(sys.argv[1])

    # Keep track of gene and trait probabilities for each person
    probabilities = {
        person: {
            "gene": {
                2: 0,
                1: 0,
                0: 0
            },
            "trait": {
                True: 0,
                False: 0
            }
        }
        for person in people
    }

    # Loop over all sets of people who might have the trait
    names = set(people)
    for have_trait in powerset(names):

        # Check if current set of people violates known information
        fails_evidence = any(
            (people[person]["trait"] is not None and
             people[person]["trait"] != (person in have_trait))
            for person in names
        )
        if fails_evidence:
            continue

        # Loop over all sets of people who might have the gene
        for one_gene in powerset(names):
            for two_genes in powerset(names - one_gene):

                # Update probabilities with new joint probability
                p = joint_probability(people, one_gene, two_genes, have_trait)
                update(probabilities, one_gene, two_genes, have_trait, p)

    # Ensure probabilities sum to 1
    normalize(probabilities)

    # Print results
    for person in people:
        print(f"{person}:")
        for field in probabilities[person]:
            print(f"  {field.capitalize()}:")
            for value in probabilities[person][field]:
                p = probabilities[person][field][value]
                print(f"    {value}: {p:.4f}")


def load_data(filename):
    """
    Load gene and trait data from a file into a dictionary.
    File assumed to be a CSV containing fields name, mother, father, trait.
    mother, father must both be blank, or both be valid names in the CSV.
    trait should be 0 or 1 if trait is known, blank otherwise.
    """
    data = dict()
    with open(filename) as f:
        reader = csv.DictReader(f)
        for row in reader:
            name = row["name"]
            data[name] = {
                "name": name,
                "mother": row["mother"] or None,
                "father": row["father"] or None,
                "trait": (True if row["trait"] == "1" else
                          False if row["trait"] == "0" else None)
            }
    return data


def powerset(s):
    """
    Return a list of all possible subsets of set s.
    """
    s = list(s)
    return [
        set(s) for s in itertools.chain.from_iterable(
            itertools.combinations(s, r) for r in range(len(s) + 1)
        )
    ]


def joint_probability(people, one, two, trait):
    """
    Compute and return a joint probability.

    The probability returned should be the probability that
        * everyone in set `one_gene` has one copy of the gene, and
        * everyone in set `two_genes` has two copies of the gene, and
        * everyone not in `one_gene` or `two_gene` does not have the gene, and
        * everyone in set `have_trait` has the trait, and
        * everyone not in set` have_trait` does not have the trait.
    """

    # joint probability #
    p = []

    # set of persons with no copy of the mutated gene
    zero = set(people) - one - two
    no_trait = set(people)-trait

    for person in one:
        if hasNoParents(people, person):
            p_one_gene = PROBS["gene"][1]
        else:
            parent_A = people[person]["father"]
            parent_B = people[person]["mother"]
            p_one_gene = prob_one_gene(
                parent_A, parent_B, zero, one, two)
        p.append(p_one_gene)

    for person in two:
        if hasNoParents(people, person):
            p_two_genes = PROBS["gene"][2]
        else:
            parent_A = people[person]["father"]
            parent_B = people[person]["mother"]
            p_two_genes = prob_two_genes(
                parent_A, parent_B, zero, one, two)
        p.append(p_two_genes)

    for person in zero:
        if hasNoParents(people, person):
            p_zero_gene = PROBS["gene"][0]
        else:
            parent_A = people[person]["father"]
            parent_B = people[person]["mother"]
            p_zero_gene = prob_zero_gene(
                parent_A, parent_B, zero, one, two)
        p.append(p_zero_gene)

    for person in trait:
        if person in one:
            p_trait = PROBS["trait"][1][True]
        elif person in two:
            p_trait = PROBS["trait"][2][True]
        else:
            p_trait = PROBS["trait"][0][True]
        p.append(p_trait)

    for person in no_trait:
        if person in one:
            p_no_trait = PROBS["trait"][1][False]
        elif person in two:
            p_no_trait = PROBS["trait"][2][False]
        else:
            p_no_trait = PROBS["trait"][0][False]
        p.append(p_no_trait)

    return numpy.prod(p)


def update(probabilities, one_gene, two_genes, have_trait, p):
    """
    Add to `probabilities` a new joint probability `p`.
    Each person should have their "gene" and "trait" distributions updated.
    Which value for each distribution is updated depends on whether
    the person is in `have_gene` and `have_trait`, respectively.
    """

    for person in probabilities.keys():
        if person in one_gene:
            probabilities[person]["gene"][1] += p
        elif person in two_genes:
            probabilities[person]["gene"][2] += p
        else:  # Person has zero gene
            probabilities[person]["gene"][0] += p

        if person in have_trait:
            probabilities[person]["trait"][True] += p
        else:
            probabilities[person]["trait"][False] += p


def normalize(probabilities):
    """
    Update `probabilities` such that each probability distribution
    is normalized (i.e., sums to 1, with relative proportions the same).
    """
    people = probabilities.keys()

    for person in people:
        s1 = sum(probabilities[person]["gene"].values())
        coef1 = 1/s1
        for (k1, value1) in probabilities[person]["gene"].items():
            new_value_1 = value1*coef1
            probabilities[person]['gene'][k1] = new_value_1

        s2 = sum(probabilities[person]["trait"].values())
        coef2 = 1/s2
        for (k2, value2) in probabilities[person]['trait'].items():
            new_value_2 = value2*coef2
            probabilities[person]['trait'][k2] = new_value_2


def hasNoParents(people, person):
    return people[person]["father"] is None


################     ONE GENE    ################################
# everyone in set `one_gene` has one copy of the gene
################     ONE GENE    ################################

def prob_one_gene(A, B, zero, one, two):

    if (A in zero) and (B in zero):
        p_one_gene = (
            1*1
            * PROBS["mutation"]
            * (1-PROBS["mutation"]))

    elif ((A in zero) and (B in one)) or ((A in one) and (B in zero)):
        p_zero_one = []
        # CASE 1 - Both parent pass a non mutated gene
        # ---------and one mutation.
        p_zero_one.append(
            1*0.5
            * PROBS["mutation"]
            * (1-PROBS["mutation"]))

        # CASE 2: One parent passes the mutated gene and the other not
        # ---------and no mutation.
        p_zero_one.append(
            1*0.5
            * (1-PROBS["mutation"])
            * (1-PROBS["mutation"]))

        p_one_gene = numpy.sum(p_zero_one)

    elif ((A in zero) and (B in two)) or ((A in two) and (B in zero)):
        # One parent passes the mutated gene and the other not
        #  --no mutation
        p_one_gene = (
            1*1
            * (1-PROBS["mutation"])
            * (1-PROBS["mutation"]))

    elif ((A in one) and (B in one)):

        p_one_one = []
        # CASE 1 - Both parent pass a non mutated gene
        # ---------and one mutation

        # CASE 2: One parent passes the mutated gene and the other not
        # ---------and no mutation

        # CASE 3: Both parents pass on the mutated gene
        # --------and one mutation

        # CASE 1
        p_one_one.append(
            0.5*0.5
            * PROBS["mutation"]
            * (1-PROBS["mutation"]))

        # CASE 2
        p_one_one.append(
            0.5*0.5
            * (1-PROBS["mutation"])
            * (1-PROBS["mutation"]))

        # CASE 3
        p_one_one.append(
            0.5*0.5
            * PROBS["mutation"]
            * (1-PROBS["mutation"]))

        p_one_gene = numpy.sum(p_one_one)

    elif ((A in one) and (B in two)) or ((A in two) and (B in one)):

        p_one_two = []  # CASE 1 OR CASE 2 can happen

        # CASE 2: One parent passes the mutated gene and the other not
        # ---------and no mutation occurs
        p_one_two.append(
            0.5*1
            * (1-PROBS["mutation"])
            * (1-PROBS["mutation"]))

        # CASE 3: Both parents pass on the mutated gene
        # --------and one mutation occurs
        p_one_two.append(
            0.5*1
            * PROBS["mutation"]
            * (1-PROBS["mutation"]))

        p_one_gene = numpy.sum(p_one_two)

    elif ((A in two) and (B in two)):
        # Both parents pass on the mutated gene
        # --------and one mutation
        p_one_gene = (
            1*1
            * PROBS["mutation"]
            * (1-PROBS["mutation"]))


#################### END ONE GENE ##############################
    return p_one_gene
################################################################
################################################################


################     TWO GENES    ###############################
# everyone in set `two_genes` has two copies of the gene
################     2 GENES    ################################

def prob_two_genes(A, B, zero, one, two):

    ##############################################################
    # CASE 0 - 0
    ##############################################################

    if (A in zero) and (B in zero):
        # Both parent pass a non-mutated gene
        # and both genes mutate
        p_two_genes = (
            1*1
            * PROBS["mutation"]
            * PROBS["mutation"])

    ###############################################################
    # CASE 0 - 1 / 1 - 0
    ###############################################################
    elif ((A in zero) and (B in one)) or ((A in one) and (B in zero)):
        p_zero_one = []  # CASE 1 OR CASE 2 can happen

        # CASE 1: One parent passes the mutated gene the other no
        # ------- and one mutation
        p_zero_one.append(
            1*0.5
            * PROBS["mutation"]
            * (1-PROBS["mutation"]))

        # CASE 2: Both parents pass a non mutated gene
        # --------and two mutations.
        p_zero_one.append(
            1*0.5
            * PROBS["mutation"]
            * PROBS["mutation"])

        p_two_genes = numpy.sum(p_zero_one)

    ########################################################################
    # CASE 0 - 2 / 2 - 0
    ########################################################################
    elif ((A in zero) and (B in two)) or ((A in two) and (B in zero)):
        # One parent passes the mutated gene the other not
        # --- and one mutation.
        p_two_genes = (
            1*1
            * PROBS["mutation"]
            * (1-PROBS["mutation"]))

    ########################################################################
    # CASE 1 - 1
    ########################################################################
    elif ((A in one) and (B in one)):
        p_one_one = []  # CASE 1 OR CASE 2 OR CASE 3 can happen

        # CASE 1 - Both parent pass a mutated gene
        # ---------and no mutation occurs.
        p_one_one.append(
            0.5*0.5
            * (1-PROBS["mutation"])
            * (1-PROBS["mutation"]))

        # CASE 2: One parent passes the mutated gene the other not
        # ------- and one mutation.
        p_one_one.append(
            0.5*0.5
            * PROBS["mutation"]
            * (1-PROBS["mutation"]))

        # CASE 3: Both parents pass a non mutated gene
        # --------and two mutations
        p_one_one.append(
            0.5*0.5
            * PROBS["mutation"]
            * PROBS["mutation"])

        p_two_genes = numpy.sum(p_one_one)

    ########################################################################
    # CASE 1 - 2 / 2 - 1
    ########################################################################
    elif ((A in one) and (B in two)) or ((A in two) and (B in one)):

        p_one_two = []  # CASE 1 OR CASE 2 can happen

        # CASE 1 - Both parent pass a mutated gene
        # ---------and no mutation occurs.
        p_one_two.append(
            0.5*1
            * (1-PROBS["mutation"])
            * (1-PROBS["mutation"]))

        # CASE 2: One parent passes the mutated gene the other not
        # ------- and one mutation occurs.
        p_one_two.append(
            0.5*1
            * PROBS["mutation"]
            * (1-PROBS["mutation"]))

        p_two_genes = numpy.sum(p_one_two)

    ########################################################################
    # CASE 2 - 2
    ########################################################################

    elif ((A in two) and (B in two)):

        # Both parent pass a mutated gene
        # ---------and no mutation occurs.
        p_two_genes = (
            1*1
            * (1-PROBS["mutation"])
            * (1-PROBS["mutation"]))

#################### END TWO GENES #############################
    return p_two_genes
################################################################
################################################################


################     ZERO GENE    ###############################
#  everyone not in `one_gene` or `two_gene` does not have the gene
################    ZERO GENE    ################################

def prob_zero_gene(A, B, zero, one, two):

    ########################################################################
    # CASE 0 - 0
    ########################################################################
    if (A in zero) and (B in zero):
        # Both parent pass a non-mutated gene
        # and no mutation
        p_zero_gene = (
            1*1
            * (1-PROBS["mutation"])
            * (1-PROBS["mutation"]))


    ########################################################################
    # CASE 0 - 1 / 1 - 0
    ########################################################################
    elif ((A in zero) and (B in one)) or ((A in one) and (B in zero)):

        p_zero_one = [] # CASE 1 OR CASE 2

        # CASE 1
        # Both parents pass a non mutated gene
        # .....and no mutation
        p_zero_one.append(
            1*0.5
            *(1-PROBS["mutation"])
            *(1-PROBS["mutation"]))

        # CASE 2
        # One parent passes the mutated gene, the other not
        # -----and one mutation
        p_zero_one.append(
            1*0.5
            * PROBS["mutation"]
            * (1-PROBS["mutation"]))

        p_zero_gene = numpy.sum(p_zero_one)

    ########################################################################
    # CASE 0 - 2 / 2 - 0
    ########################################################################
    elif ((A in zero) and (B in two)) or ((A in two) and (B in zero)):
        p_zero_gene = (
            1*1
            * PROBS["mutation"]
            * (1-PROBS["mutation"]))


    ########################################################################
    # CASE 1 - 1
    ########################################################################
    elif ((A in one) and (B in one)):

        p_one_one = []  # CASE 1 OR CASE 2 OR CASE 3 can happen

        # CASE 1 - Both parent pass a non-mutated gene
        # ---------and no mutation occurs after transmission.
        p_one_one.append(
            0.5*0.5
            * (1-PROBS["mutation"])
            * (1-PROBS["mutation"]))

        # CASE 2: One parent passes the mutated gene, the other passes a non-mutated gene
        # ------- and one mutation
        p_one_one.append(
            0.5*0.5
            * PROBS["mutation"]
            * (1-PROBS["mutation"]))

        # CASE 3: Both parents pass a mutated gene
        # --------and two mutations
        p_one_one.append(
            0.5*0.5
            * PROBS["mutation"]
            * PROBS["mutation"])

        p_zero_gene = numpy.sum(p_one_one)

    ########################################################################
    # CASE 1 - 2 / 2 - 1
    ########################################################################
    elif ((A in one) and (B in two)) or ((A in two) and (B in one)):
        p_one_two = []  # CASE 1 OR CASE 2 OR CASE 3 can happen

        # CASE 2: One parent passes the mutated gene, the other passes a non-mutated gene
        # ------- and one mutation

        # CASE 3: Both parents pass a mutated gene
        # --------and two mutations

        # CASE 2
        p_one_two.append(
            0.5*1
            * PROBS["mutation"]
            * (1-PROBS["mutation"]))

        # CASE 3
        p_one_two.append(
            1*0.5
            * PROBS["mutation"]
            * PROBS["mutation"])

        p_zero_gene = numpy.sum(p_one_two)

    ########################################################################
    # CASE 2 - 2
    ########################################################################

    elif ((A in two) and (B in two)):
        # Both parents pass a mutated gene
        # --------and two mutations
        p_zero_gene = (
            1*1
            * PROBS["mutation"]
            * PROBS["mutation"])

#################### END ZERO GENES #############################
    return p_zero_gene
################################################################
################################################################


if __name__ == "__main__":
    main()