Advertisement
jules0707

heredity.py

Feb 9th, 2024 (edited)
7
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 18.91 KB | None | 0 0
  1. import csv
  2. import itertools
  3. import sys
  4. import numpy
  5.  
  6. PROBS = {
  7.  
  8.     # Unconditional probabilities for having gene
  9.     "gene": {
  10.         2: 0.01,
  11.         1: 0.03,
  12.         0: 0.96
  13.     },
  14.  
  15.     "trait": {
  16.  
  17.         # Probability of trait given two copies of gene
  18.         2: {
  19.             True: 0.65,
  20.             False: 0.35
  21.         },
  22.  
  23.         # Probability of trait given one copy of gene
  24.         1: {
  25.             True: 0.56,
  26.             False: 0.44
  27.         },
  28.  
  29.         # Probability of trait given no gene
  30.         0: {
  31.             True: 0.01,
  32.             False: 0.99
  33.         }
  34.     },
  35.  
  36.     # Mutation probability
  37.     "mutation": 0.01
  38. }
  39.  
  40.  
  41. def main():
  42.  
  43.     # Check for proper usage
  44.     if len(sys.argv) != 2:
  45.         sys.exit("Usage: python heredity.py data.csv")
  46.     people = load_data(sys.argv[1])
  47.  
  48.     # Keep track of gene and trait probabilities for each person
  49.     probabilities = {
  50.         person: {
  51.             "gene": {
  52.                 2: 0,
  53.                 1: 0,
  54.                 0: 0
  55.             },
  56.             "trait": {
  57.                 True: 0,
  58.                 False: 0
  59.             }
  60.         }
  61.         for person in people
  62.     }
  63.  
  64.     # Loop over all sets of people who might have the trait
  65.     names = set(people)
  66.     for have_trait in powerset(names):
  67.  
  68.         # Check if current set of people violates known information
  69.         fails_evidence = any(
  70.             (people[person]["trait"] is not None and
  71.              people[person]["trait"] != (person in have_trait))
  72.             for person in names
  73.         )
  74.         if fails_evidence:
  75.             continue
  76.  
  77.         # Loop over all sets of people who might have the gene
  78.         for one_gene in powerset(names):
  79.             for two_genes in powerset(names - one_gene):
  80.  
  81.                 # Update probabilities with new joint probability
  82.                 p = joint_probability(people, one_gene, two_genes, have_trait)
  83.                 update(probabilities, one_gene, two_genes, have_trait, p)
  84.  
  85.     # Ensure probabilities sum to 1
  86.     normalize(probabilities)
  87.  
  88.     # Print results
  89.     for person in people:
  90.         print(f"{person}:")
  91.         for field in probabilities[person]:
  92.             print(f"  {field.capitalize()}:")
  93.             for value in probabilities[person][field]:
  94.                 p = probabilities[person][field][value]
  95.                 print(f"    {value}: {p:.4f}")
  96.  
  97.  
  98. def load_data(filename):
  99.     """
  100.    Load gene and trait data from a file into a dictionary.
  101.    File assumed to be a CSV containing fields name, mother, father, trait.
  102.    mother, father must both be blank, or both be valid names in the CSV.
  103.    trait should be 0 or 1 if trait is known, blank otherwise.
  104.    """
  105.     data = dict()
  106.     with open(filename) as f:
  107.         reader = csv.DictReader(f)
  108.         for row in reader:
  109.             name = row["name"]
  110.             data[name] = {
  111.                 "name": name,
  112.                 "mother": row["mother"] or None,
  113.                 "father": row["father"] or None,
  114.                 "trait": (True if row["trait"] == "1" else
  115.                           False if row["trait"] == "0" else None)
  116.             }
  117.     return data
  118.  
  119.  
  120. def powerset(s):
  121.     """
  122.    Return a list of all possible subsets of set s.
  123.    """
  124.     s = list(s)
  125.     return [
  126.         set(s) for s in itertools.chain.from_iterable(
  127.             itertools.combinations(s, r) for r in range(len(s) + 1)
  128.         )
  129.     ]
  130.  
  131.  
  132. def joint_probability(people, one, two, trait):
  133.     """
  134.    Compute and return a joint probability.
  135.  
  136.    The probability returned should be the probability that
  137.        * everyone in set `one_gene` has one copy of the gene, and
  138.        * everyone in set `two_genes` has two copies of the gene, and
  139.        * everyone not in `one_gene` or `two_gene` does not have the gene, and
  140.        * everyone in set `have_trait` has the trait, and
  141.        * everyone not in set` have_trait` does not have the trait.
  142.    """
  143.  
  144.     # joint probability #
  145.     p = []
  146.  
  147.     # set of persons with no copy of the mutated gene
  148.     zero = set(people) - one - two
  149.     no_trait = set(people)-trait
  150.  
  151.     for person in one:
  152.         if hasNoParents(people, person):
  153.             p_one_gene = PROBS["gene"][1]
  154.         else:
  155.             parent_A = people[person]["father"]
  156.             parent_B = people[person]["mother"]
  157.             p_one_gene = prob_one_gene(
  158.                 parent_A, parent_B, zero, one, two)
  159.         p.append(p_one_gene)
  160.  
  161.     for person in two:
  162.         if hasNoParents(people, person):
  163.             p_two_genes = PROBS["gene"][2]
  164.         else:
  165.             parent_A = people[person]["father"]
  166.             parent_B = people[person]["mother"]
  167.             p_two_genes = prob_two_genes(
  168.                 parent_A, parent_B, zero, one, two)
  169.         p.append(p_two_genes)
  170.  
  171.     for person in zero:
  172.         if hasNoParents(people, person):
  173.             p_zero_gene = PROBS["gene"][0]
  174.         else:
  175.             parent_A = people[person]["father"]
  176.             parent_B = people[person]["mother"]
  177.             p_zero_gene = prob_zero_gene(
  178.                 parent_A, parent_B, zero, one, two)
  179.         p.append(p_zero_gene)
  180.  
  181.     for person in trait:
  182.         if person in one:
  183.             p_trait = PROBS["trait"][1][True]
  184.         elif person in two:
  185.             p_trait = PROBS["trait"][2][True]
  186.         else:
  187.             p_trait = PROBS["trait"][0][True]
  188.         p.append(p_trait)
  189.  
  190.     for person in no_trait:
  191.         if person in one:
  192.             p_no_trait = PROBS["trait"][1][False]
  193.         elif person in two:
  194.             p_no_trait = PROBS["trait"][2][False]
  195.         else:
  196.             p_no_trait = PROBS["trait"][0][False]
  197.         p.append(p_no_trait)
  198.  
  199.     return numpy.prod(p)
  200.  
  201.  
  202. def update(probabilities, one_gene, two_genes, have_trait, p):
  203.     """
  204.    Add to `probabilities` a new joint probability `p`.
  205.    Each person should have their "gene" and "trait" distributions updated.
  206.    Which value for each distribution is updated depends on whether
  207.    the person is in `have_gene` and `have_trait`, respectively.
  208.    """
  209.  
  210.     for person in probabilities.keys():
  211.         if person in one_gene:
  212.             probabilities[person]["gene"][1] += p
  213.         elif person in two_genes:
  214.             probabilities[person]["gene"][2] += p
  215.         else:  # Person has zero gene
  216.             probabilities[person]["gene"][0] += p
  217.  
  218.         if person in have_trait:
  219.             probabilities[person]["trait"][True] += p
  220.         else:
  221.             probabilities[person]["trait"][False] += p
  222.  
  223.  
  224. def normalize(probabilities):
  225.     """
  226.    Update `probabilities` such that each probability distribution
  227.    is normalized (i.e., sums to 1, with relative proportions the same).
  228.    """
  229.     people = probabilities.keys()
  230.  
  231.     for person in people:
  232.         s1 = sum(probabilities[person]["gene"].values())
  233.         coef1 = 1/s1
  234.         for (k1, value1) in probabilities[person]["gene"].items():
  235.             new_value_1 = value1*coef1
  236.             probabilities[person]['gene'][k1] = new_value_1
  237.  
  238.         s2 = sum(probabilities[person]["trait"].values())
  239.         coef2 = 1/s2
  240.         for (k2, value2) in probabilities[person]['trait'].items():
  241.             new_value_2 = value2*coef2
  242.             probabilities[person]['trait'][k2] = new_value_2
  243.  
  244.  
  245. def hasNoParents(people, person):
  246.     return people[person]["father"] is None
  247.  
  248.  
  249. ################     ONE GENE    ################################
  250. # everyone in set `one_gene` has one copy of the gene
  251. ################     ONE GENE    ################################
  252.  
  253. def prob_one_gene(A, B, zero, one, two):
  254.  
  255.     if (A in zero) and (B in zero):
  256.         p_one_gene = (
  257.             1*1
  258.             * PROBS["mutation"]
  259.             * (1-PROBS["mutation"]))
  260.  
  261.     elif ((A in zero) and (B in one)) or ((A in one) and (B in zero)):
  262.         p_zero_one = []
  263.         # CASE 1 - Both parent pass a non mutated gene
  264.         # ---------and one mutation.
  265.         p_zero_one.append(
  266.             1*0.5
  267.             * PROBS["mutation"]
  268.             * (1-PROBS["mutation"]))
  269.  
  270.         # CASE 2: One parent passes the mutated gene and the other not
  271.         # ---------and no mutation.
  272.         p_zero_one.append(
  273.             1*0.5
  274.             * (1-PROBS["mutation"])
  275.             * (1-PROBS["mutation"]))
  276.  
  277.         p_one_gene = numpy.sum(p_zero_one)
  278.  
  279.     elif ((A in zero) and (B in two)) or ((A in two) and (B in zero)):
  280.         # One parent passes the mutated gene and the other not
  281.         #  --no mutation
  282.         p_one_gene = (
  283.             1*1
  284.             * (1-PROBS["mutation"])
  285.             * (1-PROBS["mutation"]))
  286.  
  287.     elif ((A in one) and (B in one)):
  288.  
  289.         p_one_one = []
  290.         # CASE 1 - Both parent pass a non mutated gene
  291.         # ---------and one mutation
  292.  
  293.         # CASE 2: One parent passes the mutated gene and the other not
  294.         # ---------and no mutation
  295.  
  296.         # CASE 3: Both parents pass on the mutated gene
  297.         # --------and one mutation
  298.  
  299.         # CASE 1
  300.         p_one_one.append(
  301.             0.5*0.5
  302.             * PROBS["mutation"]
  303.             * (1-PROBS["mutation"]))
  304.  
  305.         # CASE 2
  306.         p_one_one.append(
  307.             0.5*0.5
  308.             * (1-PROBS["mutation"])
  309.             * (1-PROBS["mutation"]))
  310.  
  311.         # CASE 3
  312.         p_one_one.append(
  313.             0.5*0.5
  314.             * PROBS["mutation"]
  315.             * (1-PROBS["mutation"]))
  316.  
  317.         p_one_gene = numpy.sum(p_one_one)
  318.  
  319.     elif ((A in one) and (B in two)) or ((A in two) and (B in one)):
  320.  
  321.         p_one_two = []  # CASE 1 OR CASE 2 can happen
  322.  
  323.         # CASE 2: One parent passes the mutated gene and the other not
  324.         # ---------and no mutation occurs
  325.         p_one_two.append(
  326.             0.5*1
  327.             * (1-PROBS["mutation"])
  328.             * (1-PROBS["mutation"]))
  329.  
  330.         # CASE 3: Both parents pass on the mutated gene
  331.         # --------and one mutation occurs
  332.         p_one_two.append(
  333.             0.5*1
  334.             * PROBS["mutation"]
  335.             * (1-PROBS["mutation"]))
  336.  
  337.         p_one_gene = numpy.sum(p_one_two)
  338.  
  339.     elif ((A in two) and (B in two)):
  340.         # Both parents pass on the mutated gene
  341.         # --------and one mutation
  342.         p_one_gene = (
  343.             1*1
  344.             * PROBS["mutation"]
  345.             * (1-PROBS["mutation"]))
  346.  
  347.  
  348. #################### END ONE GENE ##############################
  349.     return p_one_gene
  350. ################################################################
  351. ################################################################
  352.  
  353.  
  354.  
  355. ################     TWO GENES    ###############################
  356. # everyone in set `two_genes` has two copies of the gene
  357. ################     2 GENES    ################################
  358.  
  359. def prob_two_genes(A, B, zero, one, two):
  360.  
  361.     ##############################################################
  362.     # CASE 0 - 0
  363.     ##############################################################
  364.  
  365.     if (A in zero) and (B in zero):
  366.         # Both parent pass a non-mutated gene
  367.         # and both genes mutate
  368.         p_two_genes = (
  369.             1*1
  370.             * PROBS["mutation"]
  371.             * PROBS["mutation"])
  372.  
  373.     ###############################################################
  374.     # CASE 0 - 1 / 1 - 0
  375.     ###############################################################
  376.     elif ((A in zero) and (B in one)) or ((A in one) and (B in zero)):
  377.         p_zero_one = []  # CASE 1 OR CASE 2 can happen
  378.  
  379.         # CASE 1: One parent passes the mutated gene the other no
  380.         # ------- and one mutation
  381.         p_zero_one.append(
  382.             1*0.5
  383.             * PROBS["mutation"]
  384.             * (1-PROBS["mutation"]))
  385.  
  386.         # CASE 2: Both parents pass a non mutated gene
  387.         # --------and two mutations.
  388.         p_zero_one.append(
  389.             1*0.5
  390.             * PROBS["mutation"]
  391.             * PROBS["mutation"])
  392.  
  393.         p_two_genes = numpy.sum(p_zero_one)
  394.  
  395.     ########################################################################
  396.     # CASE 0 - 2 / 2 - 0
  397.     ########################################################################
  398.     elif ((A in zero) and (B in two)) or ((A in two) and (B in zero)):
  399.         # One parent passes the mutated gene the other not
  400.         # --- and one mutation.
  401.         p_two_genes = (
  402.             1*1
  403.             * PROBS["mutation"]
  404.             * (1-PROBS["mutation"]))
  405.  
  406.     ########################################################################
  407.     # CASE 1 - 1
  408.     ########################################################################
  409.     elif ((A in one) and (B in one)):
  410.         p_one_one = []  # CASE 1 OR CASE 2 OR CASE 3 can happen
  411.  
  412.         # CASE 1 - Both parent pass a mutated gene
  413.         # ---------and no mutation occurs.
  414.         p_one_one.append(
  415.             0.5*0.5
  416.             * (1-PROBS["mutation"])
  417.             * (1-PROBS["mutation"]))
  418.  
  419.         # CASE 2: One parent passes the mutated gene the other not
  420.         # ------- and one mutation.
  421.         p_one_one.append(
  422.             0.5*0.5
  423.             * PROBS["mutation"]
  424.             * (1-PROBS["mutation"]))
  425.  
  426.         # CASE 3: Both parents pass a non mutated gene
  427.         # --------and two mutations
  428.         p_one_one.append(
  429.             0.5*0.5
  430.             * PROBS["mutation"]
  431.             * PROBS["mutation"])
  432.  
  433.         p_two_genes = numpy.sum(p_one_one)
  434.  
  435.     ########################################################################
  436.     # CASE 1 - 2 / 2 - 1
  437.     ########################################################################
  438.     elif ((A in one) and (B in two)) or ((A in two) and (B in one)):
  439.  
  440.         p_one_two = []  # CASE 1 OR CASE 2 can happen
  441.  
  442.         # CASE 1 - Both parent pass a mutated gene
  443.         # ---------and no mutation occurs.
  444.         p_one_two.append(
  445.             0.5*1
  446.             * (1-PROBS["mutation"])
  447.             * (1-PROBS["mutation"]))
  448.  
  449.         # CASE 2: One parent passes the mutated gene the other not
  450.         # ------- and one mutation occurs.
  451.         p_one_two.append(
  452.             0.5*1
  453.             * PROBS["mutation"]
  454.             * (1-PROBS["mutation"]))
  455.  
  456.         p_two_genes = numpy.sum(p_one_two)
  457.  
  458.     ########################################################################
  459.     # CASE 2 - 2
  460.     ########################################################################
  461.  
  462.     elif ((A in two) and (B in two)):
  463.  
  464.         # Both parent pass a mutated gene
  465.         # ---------and no mutation occurs.
  466.         p_two_genes = (
  467.             1*1
  468.             * (1-PROBS["mutation"])
  469.             * (1-PROBS["mutation"]))
  470.        
  471. #################### END TWO GENES #############################
  472.     return p_two_genes
  473. ################################################################
  474. ################################################################
  475.  
  476.  
  477.  
  478. ################     ZERO GENE    ###############################
  479. #  everyone not in `one_gene` or `two_gene` does not have the gene
  480. ################    ZERO GENE    ################################
  481.  
  482. def prob_zero_gene(A, B, zero, one, two):
  483.  
  484.     ########################################################################
  485.     # CASE 0 - 0
  486.     ########################################################################
  487.     if (A in zero) and (B in zero):
  488.         # Both parent pass a non-mutated gene
  489.         # and no mutation
  490.         p_zero_gene = (
  491.             1*1
  492.             * (1-PROBS["mutation"])
  493.             * (1-PROBS["mutation"]))
  494.  
  495.  
  496.     ########################################################################
  497.     # CASE 0 - 1 / 1 - 0
  498.     ########################################################################
  499.     elif ((A in zero) and (B in one)) or ((A in one) and (B in zero)):
  500.  
  501.         p_zero_one = [] # CASE 1 OR CASE 2
  502.  
  503.         # CASE 1
  504.         # Both parents pass a non mutated gene
  505.         # .....and no mutation
  506.         p_zero_one.append(
  507.             1*0.5
  508.             *(1-PROBS["mutation"])
  509.             *(1-PROBS["mutation"]))
  510.  
  511.         # CASE 2
  512.         # One parent passes the mutated gene, the other not
  513.         # -----and one mutation
  514.         p_zero_one.append(
  515.             1*0.5
  516.             * PROBS["mutation"]
  517.             * (1-PROBS["mutation"]))
  518.        
  519.         p_zero_gene = numpy.sum(p_zero_one)
  520.  
  521.     ########################################################################
  522.     # CASE 0 - 2 / 2 - 0
  523.     ########################################################################
  524.     elif ((A in zero) and (B in two)) or ((A in two) and (B in zero)):
  525.         p_zero_gene = (
  526.             1*1
  527.             * PROBS["mutation"]
  528.             * (1-PROBS["mutation"]))
  529.  
  530.  
  531.     ########################################################################
  532.     # CASE 1 - 1
  533.     ########################################################################
  534.     elif ((A in one) and (B in one)):
  535.  
  536.         p_one_one = []  # CASE 1 OR CASE 2 OR CASE 3 can happen
  537.  
  538.         # CASE 1 - Both parent pass a non-mutated gene
  539.         # ---------and no mutation occurs after transmission.
  540.         p_one_one.append(
  541.             0.5*0.5
  542.             * (1-PROBS["mutation"])
  543.             * (1-PROBS["mutation"]))
  544.  
  545.         # CASE 2: One parent passes the mutated gene, the other passes a non-mutated gene
  546.         # ------- and one mutation
  547.         p_one_one.append(
  548.             0.5*0.5
  549.             * PROBS["mutation"]
  550.             * (1-PROBS["mutation"]))
  551.  
  552.         # CASE 3: Both parents pass a mutated gene
  553.         # --------and two mutations
  554.         p_one_one.append(
  555.             0.5*0.5
  556.             * PROBS["mutation"]
  557.             * PROBS["mutation"])
  558.  
  559.         p_zero_gene = numpy.sum(p_one_one)
  560.  
  561.     ########################################################################
  562.     # CASE 1 - 2 / 2 - 1
  563.     ########################################################################
  564.     elif ((A in one) and (B in two)) or ((A in two) and (B in one)):
  565.         p_one_two = []  # CASE 1 OR CASE 2 OR CASE 3 can happen
  566.  
  567.         # CASE 2: One parent passes the mutated gene, the other passes a non-mutated gene
  568.         # ------- and one mutation
  569.  
  570.         # CASE 3: Both parents pass a mutated gene
  571.         # --------and two mutations
  572.  
  573.         # CASE 2
  574.         p_one_two.append(
  575.             0.5*1
  576.             * PROBS["mutation"]
  577.             * (1-PROBS["mutation"]))
  578.  
  579.         # CASE 3
  580.         p_one_two.append(
  581.             1*0.5
  582.             * PROBS["mutation"]
  583.             * PROBS["mutation"])
  584.  
  585.         p_zero_gene = numpy.sum(p_one_two)
  586.  
  587.     ########################################################################
  588.     # CASE 2 - 2
  589.     ########################################################################
  590.  
  591.     elif ((A in two) and (B in two)):
  592.         # Both parents pass a mutated gene
  593.         # --------and two mutations
  594.         p_zero_gene = (
  595.             1*1
  596.             * PROBS["mutation"]
  597.             * PROBS["mutation"])
  598.  
  599. #################### END ZERO GENES #############################
  600.     return p_zero_gene
  601. ################################################################
  602. ################################################################
  603.  
  604.  
  605.  
  606. if __name__ == "__main__":
  607.     main()
  608.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement