Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Load the survey data (the file name indicates the NHANES 2015-2016 extract).
da = pd.read_csv("C:/Users/eli/Desktop/YtPruboBEemdqA7UJJ_tgg_63e179e3722f4ef783f58ff6e395feb7_nhanes_2015_2016.csv")

# Create labelled ("...x") copies of the integer-coded columns so the tables
# below are readable.
# NOTE(review): RIAGENDRx is required by the groupby further down but was
# never created, which raised a KeyError. The 1 -> Male / 2 -> Female coding
# is taken from the public NHANES codebook -- confirm against the data file.
da["RIAGENDRx"] = da.RIAGENDR.replace({1: "Male", 2: "Female"})
da["DMDEDUC2x"] = da.DMDEDUC2.replace({
    1: "<9",
    2: "9-11",
    3: "HS/GED",
    4: "Some college/AA",
    5: "College",
    7: "Refused",
    9: "Don't know",
})
da["DMDMARTLx"] = da.DMDMARTL.replace({
    1: "Married",
    2: "Widowed",
    3: "Divorced",
    4: "Separated",
    5: "Never married",
    6: "Living w/partner",
    77: "Refused",
})

# Drop respondents with an uninformative education or marital status answer.
db = da.loc[(da.DMDEDUC2x != "Don't know") & (da.DMDMARTLx != "Refused"), :]

# Now we can create a contingency table, counting the number of people in each
# cell defined by a combination of education and marital status.
# Both variables must come from the filtered frame `db`; the original mixed
# `db` (rows) with the unfiltered `da` (columns).
x = pd.crosstab(db.DMDEDUC2x, db.DMDMARTLx)

# Normalize data
# A contingency table can be normalized in three ways -- we can make the rows
# sum to 1, the columns sum to 1, or the whole table sum to 1. Below we
# normalize within rows. This gives us the proportion of people in each
# educational attainment category who fall into each group of the marital
# status variable.
# `apply` returns a new DataFrame and leaves `x` untouched, so the result has
# to be captured; otherwise the raw counts get printed instead.
row_props = x.apply(lambda z: z / z.sum(), axis=1)
print(row_props)

# We can also normalize within the columns.
col_props = x.apply(lambda z: z / z.sum(), axis=0)
print(col_props)

# The following statement does these steps, reading the code from left to right:
# 1 Group the data by every combination of gender, education, and marital status
# 2 Count the number of people in each cell using the 'size' method
# 3 Pivot the marital status results into the columns (using unstack)
# 4 Fill any empty cells with 0
# 5 Normalize the data by row
# The filtered frame `db` is used so the excluded "Don't know" / "Refused"
# answers do not reappear as categories or inflate the row totals.
b = (
    db.groupby(["RIAGENDRx", "DMDEDUC2x", "DMDMARTLx"])
    .size()
    .unstack()
    .fillna(0)
    .apply(lambda row: row / row.sum(), axis=1)
)

# Proportion married for each education level, with gender pivoted into columns.
print(b.loc[:, ["Married"]].unstack())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement