Advertisement
elena1234

Cleaning data and CORRELATION analysis in Python

Apr 19th, 2022 (edited)
964
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 9.30 KB | None | 0 0
  1. import numpy as np
  2. import pandas as pd
  3. import matplotlib.pyplot as plt
  4. import seaborn as sns
  5. import scipy as sp
  6.  
  7. da = pd.read_csv(
  8.     "C:/Users/eli/Desktop/YtPruboBEemdqA7UJJ_tgg_63e179e3722f4ef783f58ff6e395feb7_nhanes_2015_2016.csv")
  9.  
  10. # First solution:
  11. # jointplot returns a jointgrid, which we need to assign to a variable in order to add an annotation
  12. # This line is almost like the original, but it seems that fill is needed explicitly now.
  13. # And most importantly, ".annotate" is not just deprecated. It's gone.
  14. jg = sns.jointplot(x='BMXLEG', y='BMXARML', data=da, kind='kde', fill=True)
  15.  
  16. # To get the correlation, we need to consider only the records with NA values for either measurement.
  17. da_no_nulls = da[['BMXLEG', 'BMXARML']].dropna()
  18. pearsonr, p = sp.stats.pearsonr(da_no_nulls.BMXLEG, da_no_nulls.BMXARML)
  19. pearson_str = f'pearsonr = {pearsonr:.2f}; p = {p}'
  20.  
  21. # Placing the annotation somewhere readable requires that we find the max of the axes
  22. jg.ax_joint.text(
  23.     jg.ax_joint._axes.xaxis.get_data_interval()[1],
  24.     jg.ax_joint._axes.yaxis.get_data_interval()[1],
  25.     pearson_str,
  26.     horizontalalignment='right')
  27. plt.show()
  28.  
  29.  
  30. # Second solution:
  31. # create plot
  32. g = sns.jointplot(x="BMXLEG", y="BMXARML", kind='kde', data=da)
  33.  
  34. # create mask to omit NaN values from pearson calc
  35. nonan = ~np.logical_or(np.isnan(da.BMXLEG), np.isnan(da.BMXARML))
  36.  
  37. # calc pearson value and pvalue
  38. r, p = sp.stats.pearsonr(da.BMXLEG[nonan], da.BMXARML[nonan])
  39.  
  40. # place pearson value
  41. g.ax_joint.annotate(f'$\\rho = {r:.3f}$', xy=(
  42.     0.1, 0.9), xycoords='axes fraction')
  43.  
  44. plt.show()
  45.  
  46.  
  47. # How to create correlation coefficiant for many plots
  48. ''' Question 2
  49. Construct a grid of scatterplots between the first systolic and the first diastolic blood pressure measurement.
  50. Stratify the plots by gender (rows) and by race/ethnicity groups (columns).'''
  51.  
  52. da["RIAGENDR"] = da.RIAGENDR.replace({1: "Male", 2: "Female"})
  53.  
  54. # drop all nulls
  55. da_no_nulls = da[["BPXDI1", "BPXDI2", "RIAGENDR", "RIDRETH1"]].dropna()
  56.  
  57. # create plot
  58. g = sns.lmplot(x = "BPXDI1", y = "BPXDI2", data = da_no_nulls, row = "RIAGENDR", col = "RIDRETH1",height=3, aspect=1)
  59.  
  60. # create function about how to calculate correlation
  61. def annotate(data, **kws):
  62.     r, p = sp.stats.pearsonr(data["BPXDI1"], data["BPXDI2"])
  63.     ax = plt.gca()
  64.     ax.text(.05, .8, 'r={:.2f}, p={:.2g}'.format(r, p),
  65.             transform=ax.transAxes)
  66.    
  67. # map correlation to every plot    
  68. g.map_dataframe(annotate)
  69.  
  70.  
  71. ##############################################
  72. # Correlation Analysis
  73. df_no_nulls_gasoline = df_gasoline[["Years Automobile", "Price BGN", "Kilometers", "Horsepower"]].dropna()
  74. g = sns.lmplot(x = "Price BGN", y = "Horsepower", data = df_no_nulls_gasoline, height=15, aspect=1)
  75. g.fig.suptitle('Gasoline engines: Price BGN vs Horsepower')
  76. plt.show()
  77.  
  78.  
  79. round(np.corrcoef(df['Gross'], df['Rating'])[1][0], 2)
  80.  
  81. np.round(df.corr(), 2)
  82. sns.heatmap(np.round(df.corr(), 2), annot = True)
  83.  
  84. sns.heatmap(df.corr(), annot = True)
  85. plt.show()
  86.  
  87.  
  88. ###################################################################################################################################
  89. # Cleaning Data
  90. import pandas as pd
  91. import numpy as np
  92.  
  93. # Plot the data set
  94. cars.plot(subplots = True, figsize = (15,12))
  95. plt.show()
  96.  
  97. # Detecting missing values
  98. titanic.info()
  99. titanic.isna().sum(axis = 0)
  100. titanic.isna().any(axis = 1)
  101. titanic[titanic.isna().any(axis = 1)]
  102. titanic.notna().sum(axis = 0)
  103. titanic.notna().all(axis = 0)
  104. titanic.age.value_counts(dropna = False)
  105.  
  106. ######################################################################################################################
  107. # Visualizate missing values
  108. sns.heatmap(titanic.notna())
  109. plt.show()
  110.  
  111. def percent_missing(df):
  112.     percent_nan = 100* df.isnull().sum() / len(df)
  113.     percent_nan = percent_nan[percent_nan>0].sort_values()
  114.     return percent_nan
  115. percent_nan = percent_missing(df)
  116. sns.barplot(x=percent_nan.index,y=percent_nan)
  117. plt.xticks(rotation=90);
  118.  
  119. sns.barplot(x=percent_nan.index,y=percent_nan)
  120. plt.xticks(rotation=90);
  121. plt.ylim(0,1) # Set 1% Threshold
  122.  
  123. percent_nan[percent_nan < 1]
  124.  
  125. #######################################################################################################################
  126. my_data = pd.read_excel('D:\student-dropna_1.xlsx', index_col='id')
  127.  
  128. # NA and Missing are customize missing values
  129. my_data = my_data.replace('NA', np.nan)
  130. my_data = my_data.replace('Missing', np.nan)
  131. my_data = my_date.replace(r'^\s*$', np.nan, regex=True) # replace all blank spaces with np.nan
  132. df_auto = df_auto[df_auto['Year Manifacture'].notna()] # !!!! remove np.nan
  133.  
  134.  
  135. # And after that:
  136. # 1a.Drop all rows where we have at least one missing values:
  137. titanic.dropna()
  138.  
  139. # 1b.Drop all columns where we have at lest one missing values:
  140. titanic.dropna(axis = 1)
  141.  
  142. # 1c.Drop all rows with missing values where the whole row is with missing value:
  143. titanic.dropna(axis = 0, how = 'all')
  144.  
  145. #Drop all columns with less than 3000 non-null values
  146. titanic.dropna(axis = 1, thresh = 3000, inplace = True)
  147.  
  148. # Filter df for all rows with NA-values in the Last_Price column
  149. mask1 = titanic['Last_Price'].isna()
  150. no_price = titanic.loc[mask1]
  151.  
  152. # 2.Keep only the rows having 2 or more valid data
  153. my_data = my_data.dropna(how='any', axis=0, thresh=2)
  154.  
  155. # 2a.Drop all rows where we have less than 4 missing values in these four columns:
  156. titanic.dropna(axis = 0, subset = ['Survived', 'Class', 'Gender', 'Age'], tresh = 4)
  157.  
  158. # 3.Keep only the rows having 3 or more valid data
  159. my_data = my_data.dropna(how='any', axis=0, thresh=3)
  160.  
  161. # 4.Keep only columns where 11 or more valid data is available
  162. my_data = my_data.dropna(how='any', axis=1, thresh=11)
  163.  
  164. # 5.Keep only rows where 70% or more valid data is available
  165. my_data = my_data.dropna(how='any', axis=0, thresh=my_data.shape[1]*0.7)
  166.  
  167. # 6.Keep only columns where 80% or more valid data is available
  168. my_data = my_data.dropna(how='any', axis=1, thresh=my_data.shape[0]*0.8)
  169.  
  170. # 7.How to take the rows where specific column is not NaN
  171. df = df[df['EPS'].notna()]
  172. df = df.dropna(axis = 'index', how = 'all', subset = ['email', 'lastName']) # the row to be droped all of two have to be missing
  173.  
  174. # 8.How to fill in missing data in column:
  175. df['column'] = df['column'].fillna(0)
  176. df.fillna(df.mean(), inplace = True ) # only for numerical columns (Step 1)
  177. df.fillna(df.mode().iloc[0], inplace = True) # only for categorical columns (Step 2); iloc[0], because only first row has data and mode
  178.  
  179. df.groupby('Neighborhood')['Lot Frontage'].transform(lambda val: val.fillna(val.mean())) # fill values after groupby
  180.  
  181. # 9.How to drop all rows that have missing value in all their columns:
  182. my_data = my_data.dropna(axis='index', how='all')
  183.  
  184. # 10.How to drop all rows that have at least one missing values in these specific columns:
  185. my_data = my_data.dropna(axis= 0, subset['Age','Color'], how='any')
  186.  
  187. # 11.How to drop all rows that have less than 2 missing values in these specific columns:
  188. my_data = my_data.dropna(axis= 0, subset['Age','Color'], thresh = 2)
  189.  
  190. # 12.How to drop all rows that have at least 1 missing values:
  191. my_date = my_data.dropna()
  192.  
  193.  
  194. ##############################################################################
  195. # 13.Find duplicates rows:
  196. titanic[titanic.duplicated(keep = False)] # HOW TO SEE THE DUPLICATES
  197. df.duplicated(keep = 'first', subset = ['Age', 'Color').sum() # how to count the duplicates in specific columns
  198. df.duplicated()
  199. df.duplicated(keep = 'first').sum() # how to count the duplicates
  200. df[df.duplicated(keep = False)]
  201.  
  202. # 14.Drop duplicates (be careful!):
  203. df.drop_duplicates(keep = 'first', ignore_index = True, inplace = True ) # how to restore the index
  204. df.drop(index = [495,324,543], inplace = True )
  205. df.drop_duplicates(keep = 'first', subset = ['Name'], inplace = True) # drop duplicates in the specific column
  206.  
  207. # 15.Find duplicates in the column:
  208. df.duplicated(keep = 'first', subset=['brand'])
  209.  
  210. # 16.How to see the missing values in the columns:
  211. df.isna().sum() / len(df)
  212.  
  213. # 17.How to visualize the missing values:
  214. sns.heatmap(df.isnull(), cbar = False)
  215.  
  216. # 18.How to fill Nan value with its previous value:
  217. df1 = df.fillna(method = 'ffill')
  218.  
  219. # 19.How to fill Nan value with its next value:
  220. df1 = df.fillna(method = 'bfill')
  221.  
  222. # 20.How do drop all rows with 'None" value
  223. df_jobs = df_jobs.replace(to_replace='None', value=np.nan).dropna()
  224.  
  225.  
  226. ##################################################################################################################################
  227.  # currently married vs who are not currently married
  228. df['DMDMARTL'] = df['DMDMARTL'].map({1:1, 2:0, 3:0, 4:0, 5:0, 6:0})
  229. # have completed college vs don't have completed college
  230. df['DMDEDUC2'] = df['DMDEDUC2'].map({1:0, 2:0, 3:0, 4:1, 5:1})
  231.  
  232. dz = df[['DMDMARTL', 'DMDEDUC2']].dropna() # Unmapped values from above become NA
  233.  
  234.  
  235. ##################################################
  236. # How to plot missing data
  237. plt.figure(figsize = (12,8))
  238. sns.heatmap(titanic.notna())
  239. plt.show()
  240.  
  241.  
  242. ##################################################################################################################################
  243. # How to replace NA values by group-specific Values
  244. mean_age = titanic.age.mean()
  245. titanic.age.fillna(mean_age)
  246.  
  247. titanic['group_mean_age'] = titanic.groupby(['sex', 'pclass']).age.transform('mean')
  248. titanic.age.fillna(titanic.group_mean_age, inplace = True)
  249.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement