Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Names of the style sheets known to the installed matplotlib. As a bare
# expression this only displays something in an interactive session.
plt.style.available
# How to choose the style. The original line was missing its closing quote
# (a SyntaxError). matplotlib 3.6 renamed the bundled "seaborn" sheet to
# "seaborn-v0_8", so pick whichever name this installation provides.
plt.style.use(
    "seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8"
)
#############################################################
# Scatter of `age` against `fare` from the `titanic` frame; the `survived`
# column drives the marker colour through the viridis colormap.
titanic.plot.scatter(
    x="age",
    y="fare",
    c="survived",
    s=20,
    marker="x",
    colormap="viridis",
    figsize=(15, 8),
)
plt.show()
#################################
# Scatter of `horsepower` against `mpg` from the `cars` frame, coloured by
# the `cylinders` column, followed by a title and axis labels.
cars.plot.scatter(
    x="horsepower",
    y="mpg",
    c="cylinders",
    marker="x",
    colormap="viridis",
    figsize=(12, 8),
)
plt.title("Horsepower vs MPG", fontsize=18)
plt.xlabel("horsepower", fontsize=15)
plt.ylabel("mpg", fontsize=15)
plt.show()
# Plain matplotlib scatter of the `Length` and `Height` columns of `df`,
# drawn with diamond ("D") markers.
plt.scatter(x=df["Length"], y=df["Height"], marker="D")
plt.title("Relationship between Length and Height")
plt.show()
#################################
# Read the NHANES 2015-2016 CSV into `da`. The location is an absolute path
# on the author's machine; adjust it before running elsewhere.
NHANES_CSV = "C:/Users/eli/Desktop/YtPruboBEemdqA7UJJ_tgg_63e179e3722f4ef783f58ff6e395feb7_nhanes_2015_2016.csv"
da = pd.read_csv(NHANES_CSV)
"""
Question 1
Make a scatterplot showing the relationship between the first and second measurements of diastolic blood pressure (BPXDI1 and BPXDI2).
Also obtain the 4x4 matrix of correlation coefficients among the first two systolic and the first two diastolic blood pressure measures.
"""
sns.scatterplot(data=da, x="BPXDI1", y="BPXDI2", alpha=0.3)
# Most of the data is concentrated between 40 and 100 BPXDI1 and between 40 and 100 BPXDI2

# Correlate all four blood-pressure columns over every row. The previous
# version sliced `da.loc[:1, ...]`, which keeps only the rows labelled 0 and
# 1; any two points are perfectly collinear, so it reported 1.0 everywhere
# and never included the systolic measurements the question asks for.
# NOTE(review): BPXSY1/BPXSY2 are the NHANES systolic column names; confirm
# they are present in this CSV.
bp_columns = ["BPXSY1", "BPXSY2", "BPXDI1", "BPXDI2"]
df = da.loc[:, bp_columns]
# `DataFrame.corr` returns the 4x4 Pearson correlation matrix.
df.corr()
"""
Question 2
Construct a grid of scatterplots between the first systolic and the first diastolic blood pressure measurement.
Stratify the plots by gender (rows) and by race/ethnicity groups (columns).
"""
# Text labels for the gender codes so the facet rows are readable.
da["RIAGENDRx"] = da.RIAGENDR.replace({1: "Male", 2: "Female"})
# One panel per (gender, ethnicity) pair. The panels plot first systolic
# (BPXSY1) against first diastolic (BPXDI1), as the question asks; the
# earlier version plotted the two diastolic readings and faceted on the
# unlabelled RIAGENDR codes, leaving RIAGENDRx unused.
(
    sns.FacetGrid(da, row="RIAGENDRx", col="RIDRETH1")
    .map(plt.scatter, "BPXSY1", "BPXDI1", alpha=0.4)
    .add_legend()
)
"""
Question 3
Use "violin plots" to compare the distributions of ages within groups defined by gender and educational attainment.
"""
# Build the grid first (rows: RIAGENDR, columns: DMDEDUC2), then draw one
# violin of RIDAGEYR in each panel.
age_grid = sns.FacetGrid(da, row="RIAGENDR", col="DMDEDUC2")
age_grid = age_grid.map(sns.violinplot, "RIDAGEYR", alpha=0.4)
age_grid.add_legend()
"""
Question 4
Use violin plots to compare the distributions of BMI within a series of 10-year age bands. Also stratify these plots by gender.
"""
# Bin age into right-closed 10-year bands: (10, 20], (20, 30], ... (90, 100].
# Ages of 10 or below fall outside every band and become NaN.
age_edges = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
da["agegroup"] = pd.cut(da.RIDAGEYR, age_edges)
# `pd.cut` keeps every declared band as a category even when no row falls in
# it, and FacetGrid draws one column per category. Dropping the unused bands
# avoids blank panels in the grid.
# NOTE(review): the upper bands are presumably empty for this dataset; confirm.
da["agegroup"] = da["agegroup"].cat.remove_unused_categories()
# Rows: gender code; columns: age band; each panel: one violin of BMXBMI.
(
    sns.FacetGrid(da, row="RIAGENDR", col="agegroup")
    .map(sns.violinplot, "BMXBMI", alpha=0.4)
    .add_legend()
)
"""
Question 5
Construct a frequency table for the joint distribution of ethnicity groups (RIDRETH1) and health-insurance status (HIQ210).
Normalize the results so that the values within each ethnic group are proportions that sum to 1.
"""
# Raw counts: one row per RIDRETH1 value, one column per HIQ210 value.
x = pd.crosstab(index=da.RIDRETH1, columns=da.HIQ210)
# Divide each row by its own total so every row becomes proportions.
x.div(x.sum(axis=1), axis=0)
#########################################################
# FacetGrid
""" Create a histogram of the ages grouped by cholesterol levels.
The plots show that the most people are with normal cholesterol levels. """
# Swap the numeric cholesterol codes for readable labels.
cholesterol_labels = {
    1: "normal",
    2: "above normal",
    3: "well above normal",
}
df["cholesterol"] = df["cholesterol"].replace(cholesterol_labels)
# Keep only the two columns the grid needs.
df_age_chol = df[["age", "cholesterol"]]
# One wide row per cholesterol label, each holding a histogram of `age`.
g = sns.FacetGrid(df_age_chol, row="cholesterol", height=5, aspect=3)
g = g.map(plt.hist, "age")
plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement