Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats  # ensures sp.stats is loaded; `import scipy as sp` alone does not guarantee the submodule
import seaborn as sns
da = pd.read_csv(
    "C:/Users/eli/Desktop/YtPruboBEemdqA7UJJ_tgg_63e179e3722f4ef783f58ff6e395feb7_nhanes_2015_2016.csv")
# First solution:
# jointplot returns a JointGrid, which we need to assign to a variable in order to add an annotation.
# This line is almost like the original, but it seems that fill is needed explicitly now.
# And most importantly, ".annotate" is not just deprecated. It's gone.
jg = sns.jointplot(x='BMXLEG', y='BMXARML', data=da, kind='kde', fill=True)
# To get the correlation, we need to consider only the records WITHOUT NA values
# for either measurement (pearsonr cannot handle NaNs).
da_no_nulls = da[['BMXLEG', 'BMXARML']].dropna()
pearsonr, p = sp.stats.pearsonr(da_no_nulls.BMXLEG, da_no_nulls.BMXARML)
pearson_str = f'pearsonr = {pearsonr:.2f}; p = {p}'
# Placing the annotation somewhere readable requires that we find the max of the axes.
# FIX: jg.ax_joint already IS a matplotlib Axes, so use its public API directly
# instead of reaching into the private `_axes` attribute (fragile across
# matplotlib versions; `ax.xaxis` / `ax.yaxis` are the supported accessors).
jg.ax_joint.text(
    jg.ax_joint.xaxis.get_data_interval()[1],
    jg.ax_joint.yaxis.get_data_interval()[1],
    pearson_str,
    horizontalalignment='right')
plt.show()
# Second solution:
# Draw the KDE joint plot first, then overlay the correlation coefficient.
g = sns.jointplot(x="BMXLEG", y="BMXARML", kind='kde', data=da)
# Boolean mask that is True only where BOTH measurements are present,
# so NaN records are excluded from the Pearson calculation.
nonan = ~(np.isnan(da.BMXLEG) | np.isnan(da.BMXARML))
# Pearson correlation coefficient and its p-value on the complete cases.
r, p = sp.stats.pearsonr(da.BMXLEG[nonan], da.BMXARML[nonan])
# Axes-fraction coordinates keep the label at the same relative position
# regardless of the data limits.
g.ax_joint.annotate(f'$\\rho = {r:.3f}$', xy=(0.1, 0.9), xycoords='axes fraction')
plt.show()
# How to create a correlation coefficient for many plots
''' Question 2
Construct a grid of scatterplots between the first systolic and the first diastolic blood pressure measurement.
Stratify the plots by gender (rows) and by race/ethnicity groups (columns).'''
# NOTE(review): the question above asks for systolic vs diastolic, but the code
# below plots the first vs the SECOND diastolic reading (BPXDI1 vs BPXDI2) —
# confirm which pairing was intended.
# Recode the numeric gender codes into readable labels.
da["RIAGENDR"] = da.RIAGENDR.replace({1: "Male", 2: "Female"})
# drop all nulls — pearsonr below cannot handle NaNs
da_no_nulls = da[["BPXDI1", "BPXDI2", "RIAGENDR", "RIDRETH1"]].dropna()
# create the facet grid of regression scatterplots:
# one row per gender, one column per race/ethnicity group
g = sns.lmplot(x = "BPXDI1", y = "BPXDI2", data = da_no_nulls, row = "RIAGENDR", col = "RIDRETH1",height=3, aspect=1)
# Helper run once per facet: computes Pearson's r and p-value for that facet's
# subset and writes them near the top-left corner (axes-fraction coordinates).
def annotate(data, **kws):
    r, p = sp.stats.pearsonr(data["BPXDI1"], data["BPXDI2"])
    ax = plt.gca()
    ax.text(.05, .8, 'r={:.2f}, p={:.2g}'.format(r, p),
            transform=ax.transAxes)
# map the annotation helper onto every facet of the grid
g.map_dataframe(annotate)
##############################################
# Correlation Analysis
# NOTE: `df_gasoline` and `df` are DataFrames loaded elsewhere — not defined here.
df_no_nulls_gasoline = df_gasoline[["Years Automobile", "Price BGN", "Kilometers", "Horsepower"]].dropna()
# Regression scatterplot of price vs horsepower for gasoline engines.
g = sns.lmplot(x = "Price BGN", y = "Horsepower", data = df_no_nulls_gasoline, height=15, aspect=1)
g.fig.suptitle('Gasoline engines: Price BGN vs Horsepower')
plt.show()
# [1][0] picks the off-diagonal element of the 2x2 matrix — the correlation
# between the two columns — rounded to 2 decimals.
round(np.corrcoef(df['Gross'], df['Rating'])[1][0], 2)
# Full pairwise correlation matrix, rounded.
# NOTE(review): on pandas >= 2.0, df.corr() raises for non-numeric columns —
# may need df.corr(numeric_only=True); confirm against the dtypes of `df`.
np.round(df.corr(), 2)
# Annotated heatmaps of the correlation matrix (second one without rounding).
sns.heatmap(np.round(df.corr(), 2), annot = True)
sns.heatmap(df.corr(), annot = True)
plt.show()
###################################################################################################################################
# Cleaning Data
import pandas as pd
import numpy as np
# NOTE: `cars` and `titanic` are DataFrames loaded elsewhere — not defined here.
# Plot every column of the data set in its own subplot.
cars.plot(subplots = True, figsize = (15,12))
plt.show()
# Detecting missing values
titanic.info()                            # dtypes + non-null counts per column
titanic.isna().sum(axis = 0)              # number of NaNs per column
titanic.isna().any(axis = 1)              # per-row flag: does the row contain any NaN?
titanic[titanic.isna().any(axis = 1)]     # only the rows containing at least one NaN
titanic.notna().sum(axis = 0)             # number of valid (non-NaN) values per column
titanic.notna().all(axis = 0)             # per-column flag: is the column completely filled?
titanic.age.value_counts(dropna = False)  # value frequencies, counting NaN as its own bucket
######################################################################################################################
# Visualize missing values: heatmap of notna() renders missing cells in the "off" color
sns.heatmap(titanic.notna())
plt.show()
def percent_missing(df):
    """Return the percentage of missing values per column, sorted ascending.

    Columns that have no missing values are excluded from the result.
    """
    nan_pct = 100 * df.isnull().sum() / len(df)
    # Keep only columns that actually have missing data, smallest share first.
    nan_pct = nan_pct[nan_pct > 0]
    return nan_pct.sort_values()
percent_nan = percent_missing(df)
# Bar chart of the missing-value percentage per column.
sns.barplot(x=percent_nan.index,y=percent_nan)
plt.xticks(rotation=90);
# Same chart again, zoomed to 0-1% so near-complete columns become visible.
sns.barplot(x=percent_nan.index,y=percent_nan)
plt.xticks(rotation=90);
plt.ylim(0,1) # Set 1% Threshold
# Columns whose missing share is below the 1% threshold.
percent_nan[percent_nan < 1]
#######################################################################################################################
# NOTE(review): '\s' in the path below is not a recognized escape so it survives
# literally, but a raw string (r'D:\...') would be safer for Windows paths.
my_data = pd.read_excel('D:\student-dropna_1.xlsx', index_col='id')
# 'NA' and 'Missing' are this data set's custom missing-value markers.
my_data = my_data.replace('NA', np.nan)
my_data = my_data.replace('Missing', np.nan)
# BUG FIX: was `my_date.replace(...)` — `my_date` is undefined at this point
# (NameError); the intent is clearly to keep transforming `my_data`.
my_data = my_data.replace(r'^\s*$', np.nan, regex=True)  # replace all blank / whitespace-only cells with np.nan
df_auto = df_auto[df_auto['Year Manifacture'].notna()]  # keep only rows where the year is present (removes np.nan rows)
# And after that:
# 1a. Drop all rows where we have at least one missing value:
titanic.dropna()
# 1b. Drop all columns where we have at least one missing value:
titanic.dropna(axis = 1)
# 1c. Drop only the rows in which EVERY value is missing:
titanic.dropna(axis = 0, how = 'all')
# Drop all columns with less than 3000 non-null values
titanic.dropna(axis = 1, thresh = 3000, inplace = True)
# Filter df for all rows with NA-values in the Last_Price column
mask1 = titanic['Last_Price'].isna()
no_price = titanic.loc[mask1]
# 2. Keep only the rows having 2 or more valid (non-NA) values.
# BUG FIX: `how` and `thresh` are mutually exclusive — pandas >= 2.0 raises a
# TypeError when both are passed — so `how='any'` is removed here and below
# (thresh already took precedence in older pandas, so behavior is unchanged).
my_data = my_data.dropna(axis=0, thresh=2)
# 2a. Keep only rows where ALL 4 of these columns are present
# (thresh=4 over a 4-column subset).
# BUG FIX: `tresh` was a typo for `thresh` (unknown keyword -> TypeError).
titanic.dropna(axis = 0, subset = ['Survived', 'Class', 'Gender', 'Age'], thresh = 4)
# 3. Keep only the rows having 3 or more valid values
my_data = my_data.dropna(axis=0, thresh=3)
# 4. Keep only columns where 11 or more valid values are available
my_data = my_data.dropna(axis=1, thresh=11)
# 5. Keep only rows where 70% or more valid data is available
# NOTE(review): thresh is documented as int — consider int(...); confirm.
my_data = my_data.dropna(axis=0, thresh=my_data.shape[1]*0.7)
# 6. Keep only columns where 80% or more valid data is available
my_data = my_data.dropna(axis=1, thresh=my_data.shape[0]*0.8)
# 7. How to take the rows where a specific column is not NaN
df = df[df['EPS'].notna()]
df = df.dropna(axis = 'index', how = 'all', subset = ['email', 'lastName'])  # a row is dropped only if BOTH columns are missing
# 8. How to fill in missing data in a column:
df['column'] = df['column'].fillna(0)
df.fillna(df.mean(), inplace = True )  # only for numerical columns (Step 1); may need df.mean(numeric_only=True) on mixed frames
df.fillna(df.mode().iloc[0], inplace = True)  # only for categorical columns (Step 2); iloc[0], because only the first row holds the mode
df.groupby('Neighborhood')['Lot Frontage'].transform(lambda val: val.fillna(val.mean()))  # fill values after groupby
# 9. How to drop all rows that have missing values in ALL their columns:
my_data = my_data.dropna(axis='index', how='all')
# 10. How to drop all rows with at least one missing value in these specific columns:
# BUG FIX: `subset['Age','Color']` was a syntax error — `subset` must be passed
# as a keyword argument (same fix in #11 below).
my_data = my_data.dropna(axis=0, subset=['Age', 'Color'], how='any')
# 11. Keep rows having at least 2 valid values among these specific columns:
my_data = my_data.dropna(axis=0, subset=['Age', 'Color'], thresh=2)
# 12. How to drop all rows that have at least 1 missing value:
# BUG FIX: the result was assigned to `my_date` (typo) — keep it in `my_data`.
my_data = my_data.dropna()
##############################################################################
# 13. Find duplicate rows:
titanic[titanic.duplicated(keep = False)]  # HOW TO SEE THE DUPLICATES (keep=False marks every copy)
# BUG FIX: `subset = ['Age', 'Color')` mixed a list's `[` with `)` — SyntaxError.
df.duplicated(keep = 'first', subset = ['Age', 'Color']).sum()  # how to count the duplicates in specific columns
df.duplicated()
df.duplicated(keep = 'first').sum()  # how to count the duplicates
df[df.duplicated(keep = False)]
# 14. Drop duplicates (be careful!):
df.drop_duplicates(keep = 'first', ignore_index = True, inplace = True )  # ignore_index=True rebuilds a clean 0..n-1 index
df.drop(index = [495,324,543], inplace = True )
df.drop_duplicates(keep = 'first', subset = ['Name'], inplace = True)  # drop duplicates in the specific column
# 15. Find duplicates in the column:
df.duplicated(keep = 'first', subset=['brand'])
# 16. How to see the share of missing values per column:
df.isna().sum() / len(df)
# 17. How to visualize the missing values:
sns.heatmap(df.isnull(), cbar = False)
# 18. How to fill a NaN value with its previous value:
# FIX: fillna(method='ffill') is deprecated (removed in pandas 3) — use ffill().
df1 = df.ffill()
# 19. How to fill a NaN value with its next value (same deprecation fix):
df1 = df.bfill()
# 20. How to drop all rows holding the string 'None'
df_jobs = df_jobs.replace(to_replace='None', value=np.nan).dropna()
##################################################################################################################################
# Recode marital status: currently married (1) vs not currently married (0)
df['DMDMARTL'] = df['DMDMARTL'].map({1:1, 2:0, 3:0, 4:0, 5:0, 6:0})
# Recode education: completed college (1) vs did not complete college (0)
df['DMDEDUC2'] = df['DMDEDUC2'].map({1:0, 2:0, 3:0, 4:1, 5:1})
dz = df[['DMDMARTL', 'DMDEDUC2']].dropna()  # unmapped values from above become NA and are dropped here
##################################################
# How to plot missing data
plt.figure(figsize = (12,8))
sns.heatmap(titanic.notna())
plt.show()
##################################################################################################################################
# How to replace NA values by group-specific values
mean_age = titanic.age.mean()
titanic.age.fillna(mean_age)  # NOTE(review): result is discarded — assign it back if the fill should persist
# Per-(sex, pclass) mean age, broadcast back to every row of its group.
titanic['group_mean_age'] = titanic.groupby(['sex', 'pclass']).age.transform('mean')
# BUG FIX: `titanic.age.fillna(..., inplace=True)` mutates a column reference
# in place — a chained-assignment pattern that warns today and is a no-op
# under pandas copy-on-write. Assign the result back explicitly instead.
titanic['age'] = titanic.age.fillna(titanic.group_mean_age)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement