Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import csv
- import itertools
- import sys
- import numpy
- PROBS = {
- # Unconditional probabilities for having gene
- "gene": {
- 2: 0.01,
- 1: 0.03,
- 0: 0.96
- },
- "trait": {
- # Probability of trait given two copies of gene
- 2: {
- True: 0.65,
- False: 0.35
- },
- # Probability of trait given one copy of gene
- 1: {
- True: 0.56,
- False: 0.44
- },
- # Probability of trait given no gene
- 0: {
- True: 0.01,
- False: 0.99
- }
- },
- # Mutation probability
- "mutation": 0.01
- }
- def main():
- # Check for proper usage
- if len(sys.argv) != 2:
- sys.exit("Usage: python heredity.py data.csv")
- people = load_data(sys.argv[1])
- # Keep track of gene and trait probabilities for each person
- probabilities = {
- person: {
- "gene": {
- 2: 0,
- 1: 0,
- 0: 0
- },
- "trait": {
- True: 0,
- False: 0
- }
- }
- for person in people
- }
- # Loop over all sets of people who might have the trait
- names = set(people)
- for have_trait in powerset(names):
- # Check if current set of people violates known information
- fails_evidence = any(
- (people[person]["trait"] is not None and
- people[person]["trait"] != (person in have_trait))
- for person in names
- )
- if fails_evidence:
- continue
- # Loop over all sets of people who might have the gene
- for one_gene in powerset(names):
- for two_genes in powerset(names - one_gene):
- # Update probabilities with new joint probability
- p = joint_probability(people, one_gene, two_genes, have_trait)
- update(probabilities, one_gene, two_genes, have_trait, p)
- # Ensure probabilities sum to 1
- normalize(probabilities)
- # Print results
- for person in people:
- print(f"{person}:")
- for field in probabilities[person]:
- print(f" {field.capitalize()}:")
- for value in probabilities[person][field]:
- p = probabilities[person][field][value]
- print(f" {value}: {p:.4f}")
- def load_data(filename):
- """
- Load gene and trait data from a file into a dictionary.
- File assumed to be a CSV containing fields name, mother, father, trait.
- mother, father must both be blank, or both be valid names in the CSV.
- trait should be 0 or 1 if trait is known, blank otherwise.
- """
- data = dict()
- with open(filename) as f:
- reader = csv.DictReader(f)
- for row in reader:
- name = row["name"]
- data[name] = {
- "name": name,
- "mother": row["mother"] or None,
- "father": row["father"] or None,
- "trait": (True if row["trait"] == "1" else
- False if row["trait"] == "0" else None)
- }
- return data
- def powerset(s):
- """
- Return a list of all possible subsets of set s.
- """
- s = list(s)
- return [
- set(s) for s in itertools.chain.from_iterable(
- itertools.combinations(s, r) for r in range(len(s) + 1)
- )
- ]
- def joint_probability(people, one, two, trait):
- """
- Compute and return a joint probability.
- The probability returned should be the probability that
- * everyone in set `one_gene` has one copy of the gene, and
- * everyone in set `two_genes` has two copies of the gene, and
- * everyone not in `one_gene` or `two_gene` does not have the gene, and
- * everyone in set `have_trait` has the trait, and
- * everyone not in set` have_trait` does not have the trait.
- """
- # joint probability #
- p = []
- # set of persons with no copy of the mutated gene
- zero = set(people) - one - two
- no_trait = set(people)-trait
- for person in one:
- if hasNoParents(people, person):
- p_one_gene = PROBS["gene"][1]
- else:
- parent_A = people[person]["father"]
- parent_B = people[person]["mother"]
- p_one_gene = prob_one_gene(
- parent_A, parent_B, zero, one, two)
- p.append(p_one_gene)
- for person in two:
- if hasNoParents(people, person):
- p_two_genes = PROBS["gene"][2]
- else:
- parent_A = people[person]["father"]
- parent_B = people[person]["mother"]
- p_two_genes = prob_two_genes(
- parent_A, parent_B, zero, one, two)
- p.append(p_two_genes)
- for person in zero:
- if hasNoParents(people, person):
- p_zero_gene = PROBS["gene"][0]
- else:
- parent_A = people[person]["father"]
- parent_B = people[person]["mother"]
- p_zero_gene = prob_zero_gene(
- parent_A, parent_B, zero, one, two)
- p.append(p_zero_gene)
- for person in trait:
- if person in one:
- p_trait = PROBS["trait"][1][True]
- elif person in two:
- p_trait = PROBS["trait"][2][True]
- else:
- p_trait = PROBS["trait"][0][True]
- p.append(p_trait)
- for person in no_trait:
- if person in one:
- p_no_trait = PROBS["trait"][1][False]
- elif person in two:
- p_no_trait = PROBS["trait"][2][False]
- else:
- p_no_trait = PROBS["trait"][0][False]
- p.append(p_no_trait)
- return numpy.prod(p)
- def update(probabilities, one_gene, two_genes, have_trait, p):
- """
- Add to `probabilities` a new joint probability `p`.
- Each person should have their "gene" and "trait" distributions updated.
- Which value for each distribution is updated depends on whether
- the person is in `have_gene` and `have_trait`, respectively.
- """
- for person in probabilities.keys():
- if person in one_gene:
- probabilities[person]["gene"][1] += p
- elif person in two_genes:
- probabilities[person]["gene"][2] += p
- else: # Person has zero gene
- probabilities[person]["gene"][0] += p
- if person in have_trait:
- probabilities[person]["trait"][True] += p
- else:
- probabilities[person]["trait"][False] += p
- def normalize(probabilities):
- """
- Update `probabilities` such that each probability distribution
- is normalized (i.e., sums to 1, with relative proportions the same).
- """
- people = probabilities.keys()
- for person in people:
- s1 = sum(probabilities[person]["gene"].values())
- coef1 = 1/s1
- for (k1, value1) in probabilities[person]["gene"].items():
- new_value_1 = value1*coef1
- probabilities[person]['gene'][k1] = new_value_1
- s2 = sum(probabilities[person]["trait"].values())
- coef2 = 1/s2
- for (k2, value2) in probabilities[person]['trait'].items():
- new_value_2 = value2*coef2
- probabilities[person]['trait'][k2] = new_value_2
- def hasNoParents(people, person):
- return people[person]["father"] is None
- ################ ONE GENE ################################
- # everyone in set `one_gene` has one copy of the gene
- ################ ONE GENE ################################
- def prob_one_gene(A, B, zero, one, two):
- if (A in zero) and (B in zero):
- p_one_gene = (
- 1*1
- * PROBS["mutation"]
- * (1-PROBS["mutation"]))
- elif ((A in zero) and (B in one)) or ((A in one) and (B in zero)):
- p_zero_one = []
- # CASE 1 - Both parent pass a non mutated gene
- # ---------and one mutation.
- p_zero_one.append(
- 1*0.5
- * PROBS["mutation"]
- * (1-PROBS["mutation"]))
- # CASE 2: One parent passes the mutated gene and the other not
- # ---------and no mutation.
- p_zero_one.append(
- 1*0.5
- * (1-PROBS["mutation"])
- * (1-PROBS["mutation"]))
- p_one_gene = numpy.sum(p_zero_one)
- elif ((A in zero) and (B in two)) or ((A in two) and (B in zero)):
- # One parent passes the mutated gene and the other not
- # --no mutation
- p_one_gene = (
- 1*1
- * (1-PROBS["mutation"])
- * (1-PROBS["mutation"]))
- elif ((A in one) and (B in one)):
- p_one_one = []
- # CASE 1 - Both parent pass a non mutated gene
- # ---------and one mutation
- # CASE 2: One parent passes the mutated gene and the other not
- # ---------and no mutation
- # CASE 3: Both parents pass on the mutated gene
- # --------and one mutation
- # CASE 1
- p_one_one.append(
- 0.5*0.5
- * PROBS["mutation"]
- * (1-PROBS["mutation"]))
- # CASE 2
- p_one_one.append(
- 0.5*0.5
- * (1-PROBS["mutation"])
- * (1-PROBS["mutation"]))
- # CASE 3
- p_one_one.append(
- 0.5*0.5
- * PROBS["mutation"]
- * (1-PROBS["mutation"]))
- p_one_gene = numpy.sum(p_one_one)
- elif ((A in one) and (B in two)) or ((A in two) and (B in one)):
- p_one_two = [] # CASE 1 OR CASE 2 can happen
- # CASE 2: One parent passes the mutated gene and the other not
- # ---------and no mutation occurs
- p_one_two.append(
- 0.5*1
- * (1-PROBS["mutation"])
- * (1-PROBS["mutation"]))
- # CASE 3: Both parents pass on the mutated gene
- # --------and one mutation occurs
- p_one_two.append(
- 0.5*1
- * PROBS["mutation"]
- * (1-PROBS["mutation"]))
- p_one_gene = numpy.sum(p_one_two)
- elif ((A in two) and (B in two)):
- # Both parents pass on the mutated gene
- # --------and one mutation
- p_one_gene = (
- 1*1
- * PROBS["mutation"]
- * (1-PROBS["mutation"]))
- #################### END ONE GENE ##############################
- return p_one_gene
- ################################################################
- ################################################################
- ################ TWO GENES ###############################
- # everyone in set `two_genes` has two copies of the gene
- ################ 2 GENES ################################
- def prob_two_genes(A, B, zero, one, two):
- ##############################################################
- # CASE 0 - 0
- ##############################################################
- if (A in zero) and (B in zero):
- # Both parent pass a non-mutated gene
- # and both genes mutate
- p_two_genes = (
- 1*1
- * PROBS["mutation"]
- * PROBS["mutation"])
- ###############################################################
- # CASE 0 - 1 / 1 - 0
- ###############################################################
- elif ((A in zero) and (B in one)) or ((A in one) and (B in zero)):
- p_zero_one = [] # CASE 1 OR CASE 2 can happen
- # CASE 1: One parent passes the mutated gene the other no
- # ------- and one mutation
- p_zero_one.append(
- 1*0.5
- * PROBS["mutation"]
- * (1-PROBS["mutation"]))
- # CASE 2: Both parents pass a non mutated gene
- # --------and two mutations.
- p_zero_one.append(
- 1*0.5
- * PROBS["mutation"]
- * PROBS["mutation"])
- p_two_genes = numpy.sum(p_zero_one)
- ########################################################################
- # CASE 0 - 2 / 2 - 0
- ########################################################################
- elif ((A in zero) and (B in two)) or ((A in two) and (B in zero)):
- # One parent passes the mutated gene the other not
- # --- and one mutation.
- p_two_genes = (
- 1*1
- * PROBS["mutation"]
- * (1-PROBS["mutation"]))
- ########################################################################
- # CASE 1 - 1
- ########################################################################
- elif ((A in one) and (B in one)):
- p_one_one = [] # CASE 1 OR CASE 2 OR CASE 3 can happen
- # CASE 1 - Both parent pass a mutated gene
- # ---------and no mutation occurs.
- p_one_one.append(
- 0.5*0.5
- * (1-PROBS["mutation"])
- * (1-PROBS["mutation"]))
- # CASE 2: One parent passes the mutated gene the other not
- # ------- and one mutation.
- p_one_one.append(
- 0.5*0.5
- * PROBS["mutation"]
- * (1-PROBS["mutation"]))
- # CASE 3: Both parents pass a non mutated gene
- # --------and two mutations
- p_one_one.append(
- 0.5*0.5
- * PROBS["mutation"]
- * PROBS["mutation"])
- p_two_genes = numpy.sum(p_one_one)
- ########################################################################
- # CASE 1 - 2 / 2 - 1
- ########################################################################
- elif ((A in one) and (B in two)) or ((A in two) and (B in one)):
- p_one_two = [] # CASE 1 OR CASE 2 can happen
- # CASE 1 - Both parent pass a mutated gene
- # ---------and no mutation occurs.
- p_one_two.append(
- 0.5*1
- * (1-PROBS["mutation"])
- * (1-PROBS["mutation"]))
- # CASE 2: One parent passes the mutated gene the other not
- # ------- and one mutation occurs.
- p_one_two.append(
- 0.5*1
- * PROBS["mutation"]
- * (1-PROBS["mutation"]))
- p_two_genes = numpy.sum(p_one_two)
- ########################################################################
- # CASE 2 - 2
- ########################################################################
- elif ((A in two) and (B in two)):
- # Both parent pass a mutated gene
- # ---------and no mutation occurs.
- p_two_genes = (
- 1*1
- * (1-PROBS["mutation"])
- * (1-PROBS["mutation"]))
- #################### END TWO GENES #############################
- return p_two_genes
- ################################################################
- ################################################################
- ################ ZERO GENE ###############################
- # everyone not in `one_gene` or `two_gene` does not have the gene
- ################ ZERO GENE ################################
- def prob_zero_gene(A, B, zero, one, two):
- ########################################################################
- # CASE 0 - 0
- ########################################################################
- if (A in zero) and (B in zero):
- # Both parent pass a non-mutated gene
- # and no mutation
- p_zero_gene = (
- 1*1
- * (1-PROBS["mutation"])
- * (1-PROBS["mutation"]))
- ########################################################################
- # CASE 0 - 1 / 1 - 0
- ########################################################################
- elif ((A in zero) and (B in one)) or ((A in one) and (B in zero)):
- p_zero_one = [] # CASE 1 OR CASE 2
- # CASE 1
- # Both parents pass a non mutated gene
- # .....and no mutation
- p_zero_one.append(
- 1*0.5
- *(1-PROBS["mutation"])
- *(1-PROBS["mutation"]))
- # CASE 2
- # One parent passes the mutated gene, the other not
- # -----and one mutation
- p_zero_one.append(
- 1*0.5
- * PROBS["mutation"]
- * (1-PROBS["mutation"]))
- p_zero_gene = numpy.sum(p_zero_one)
- ########################################################################
- # CASE 0 - 2 / 2 - 0
- ########################################################################
- elif ((A in zero) and (B in two)) or ((A in two) and (B in zero)):
- p_zero_gene = (
- 1*1
- * PROBS["mutation"]
- * (1-PROBS["mutation"]))
- ########################################################################
- # CASE 1 - 1
- ########################################################################
- elif ((A in one) and (B in one)):
- p_one_one = [] # CASE 1 OR CASE 2 OR CASE 3 can happen
- # CASE 1 - Both parent pass a non-mutated gene
- # ---------and no mutation occurs after transmission.
- p_one_one.append(
- 0.5*0.5
- * (1-PROBS["mutation"])
- * (1-PROBS["mutation"]))
- # CASE 2: One parent passes the mutated gene, the other passes a non-mutated gene
- # ------- and one mutation
- p_one_one.append(
- 0.5*0.5
- * PROBS["mutation"]
- * (1-PROBS["mutation"]))
- # CASE 3: Both parents pass a mutated gene
- # --------and two mutations
- p_one_one.append(
- 0.5*0.5
- * PROBS["mutation"]
- * PROBS["mutation"])
- p_zero_gene = numpy.sum(p_one_one)
- ########################################################################
- # CASE 1 - 2 / 2 - 1
- ########################################################################
- elif ((A in one) and (B in two)) or ((A in two) and (B in one)):
- p_one_two = [] # CASE 1 OR CASE 2 OR CASE 3 can happen
- # CASE 2: One parent passes the mutated gene, the other passes a non-mutated gene
- # ------- and one mutation
- # CASE 3: Both parents pass a mutated gene
- # --------and two mutations
- # CASE 2
- p_one_two.append(
- 0.5*1
- * PROBS["mutation"]
- * (1-PROBS["mutation"]))
- # CASE 3
- p_one_two.append(
- 1*0.5
- * PROBS["mutation"]
- * PROBS["mutation"])
- p_zero_gene = numpy.sum(p_one_two)
- ########################################################################
- # CASE 2 - 2
- ########################################################################
- elif ((A in two) and (B in two)):
- # Both parents pass a mutated gene
- # --------and two mutations
- p_zero_gene = (
- 1*1
- * PROBS["mutation"]
- * PROBS["mutation"])
- #################### END ZERO GENES #############################
- return p_zero_gene
- ################################################################
- ################################################################
- if __name__ == "__main__":
- main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement