Cleaning data and CORRELATION analysis in Python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp

da = pd.read_csv(
    "C:/Users/eli/Desktop/YtPruboBEemdqA7UJJ_tgg_63e179e3722f4ef783f58ff6e395feb7_nhanes_2015_2016.csv")

# First solution:
# jointplot returns a jointgrid, which we need to assign to a variable in order to add an annotation
# This line is almost like the original, but it seems that fill is needed explicitly now.
# And most importantly, ".annotate" is not just deprecated. It's gone.
jg = sns.jointplot(x='BMXLEG', y='BMXARML', data=da, kind='kde', fill=True)

# To get the correlation, we need to consider only the records with NA values for either measurement.
da_no_nulls = da[['BMXLEG', 'BMXARML']].dropna()
pearsonr, p = sp.stats.pearsonr(da_no_nulls.BMXLEG, da_no_nulls.BMXARML)
pearson_str = f'pearsonr = {pearsonr:.2f}; p = {p}'

# Placing the annotation somewhere readable requires that we find the max of the axes
jg.ax_joint.text(
    jg.ax_joint._axes.xaxis.get_data_interval()[1],
    jg.ax_joint._axes.yaxis.get_data_interval()[1],
    pearson_str,
    horizontalalignment='right')
plt.show()


# Second solution:
# create plot
g = sns.jointplot(x="BMXLEG", y="BMXARML", kind='kde', data=da)

# create mask to omit NaN values from pearson calc
nonan = ~np.logical_or(np.isnan(da.BMXLEG), np.isnan(da.BMXARML))

# calc pearson value and pvalue
r, p = sp.stats.pearsonr(da.BMXLEG[nonan], da.BMXARML[nonan])

# place pearson value
g.ax_joint.annotate(f'$\\rho = {r:.3f}$', xy=(
    0.1, 0.9), xycoords='axes fraction')

plt.show()


# How to create correlation coefficiant for many plots
''' Question 2
Construct a grid of scatterplots between the first systolic and the first diastolic blood pressure measurement.
Stratify the plots by gender (rows) and by race/ethnicity groups (columns).'''

da["RIAGENDR"] = da.RIAGENDR.replace({1: "Male", 2: "Female"})

# drop all nulls
da_no_nulls = da[["BPXDI1", "BPXDI2", "RIAGENDR", "RIDRETH1"]].dropna()

# create plot
g = sns.lmplot(x = "BPXDI1", y = "BPXDI2", data = da_no_nulls, row = "RIAGENDR", col = "RIDRETH1",height=3, aspect=1)

# create function about how to calculate correlation
def annotate(data, **kws):
    r, p = sp.stats.pearsonr(data["BPXDI1"], data["BPXDI2"])
    ax = plt.gca()
    ax.text(.05, .8, 'r={:.2f}, p={:.2g}'.format(r, p),
            transform=ax.transAxes)

# map correlation to every plot
g.map_dataframe(annotate)


##############################################
# Correlation Analysis
df_no_nulls_gasoline = df_gasoline[["Years Automobile", "Price BGN", "Kilometers", "Horsepower"]].dropna()
g = sns.lmplot(x = "Price BGN", y = "Horsepower", data = df_no_nulls_gasoline, height=15, aspect=1)
g.fig.suptitle('Gasoline engines: Price BGN vs Horsepower')
plt.show()


round(np.corrcoef(df['Gross'], df['Rating'])[1][0], 2)

np.round(df.corr(), 2)
sns.heatmap(np.round(df.corr(), 2), annot = True)

sns.heatmap(df.corr(), annot = True)
plt.show()


###################################################################################################################################
# Cleaning Data
import pandas as pd
import numpy as np

# Plot the data set
cars.plot(subplots = True, figsize = (15,12))
plt.show()

# Detecting missing values
titanic.info()
titanic.isna().sum(axis = 0)
titanic.isna().any(axis = 1)
titanic[titanic.isna().any(axis = 1)]
titanic.notna().sum(axis = 0)
titanic.notna().all(axis = 0)
titanic.age.value_counts(dropna = False)

######################################################################################################################
# Visualizate missing values
sns.heatmap(titanic.notna())
plt.show()

def percent_missing(df):
    percent_nan = 100* df.isnull().sum() / len(df)
    percent_nan = percent_nan[percent_nan>0].sort_values()
    return percent_nan
percent_nan = percent_missing(df)
sns.barplot(x=percent_nan.index,y=percent_nan)
plt.xticks(rotation=90);

sns.barplot(x=percent_nan.index,y=percent_nan)
plt.xticks(rotation=90);
plt.ylim(0,1) # Set 1% Threshold

percent_nan[percent_nan < 1]

#######################################################################################################################
my_data = pd.read_excel('D:\student-dropna_1.xlsx', index_col='id')

# NA and Missing are customize missing values
my_data = my_data.replace('NA', np.nan)
my_data = my_data.replace('Missing', np.nan)
my_data = my_date.replace(r'^\s*$', np.nan, regex=True) # replace all blank spaces with np.nan
df_auto = df_auto[df_auto['Year Manifacture'].notna()] # !!!! remove np.nan


# And after that:
# 1a.Drop all rows where we have at least one missing values:
titanic.dropna()

# 1b.Drop all columns where we have at lest one missing values:
titanic.dropna(axis = 1)

# 1c.Drop all rows with missing values where the whole row is with missing value:
titanic.dropna(axis = 0, how = 'all')

#Drop all columns with less than 3000 non-null values
titanic.dropna(axis = 1, thresh = 3000, inplace = True)

# Filter df for all rows with NA-values in the Last_Price column
mask1 = titanic['Last_Price'].isna()
no_price = titanic.loc[mask1]

# 2.Keep only the rows having 2 or more valid data
my_data = my_data.dropna(how='any', axis=0, thresh=2)

# 2a.Drop all rows where we have less than 4 missing values in these four columns:
titanic.dropna(axis = 0, subset = ['Survived', 'Class', 'Gender', 'Age'], tresh = 4)

# 3.Keep only the rows having 3 or more valid data
my_data = my_data.dropna(how='any', axis=0, thresh=3)

# 4.Keep only columns where 11 or more valid data is available
my_data = my_data.dropna(how='any', axis=1, thresh=11)

# 5.Keep only rows where 70% or more valid data is available
my_data = my_data.dropna(how='any', axis=0, thresh=my_data.shape[1]*0.7)

# 6.Keep only columns where 80% or more valid data is available
my_data = my_data.dropna(how='any', axis=1, thresh=my_data.shape[0]*0.8)

# 7.How to take the rows where specific column is not NaN
df = df[df['EPS'].notna()]
df = df.dropna(axis = 'index', how = 'all', subset = ['email', 'lastName']) # the row to be droped all of two have to be missing

# 8.How to fill in missing data in column:
df['column'] = df['column'].fillna(0)
df.fillna(df.mean(), inplace = True ) # only for numerical columns (Step 1)
df.fillna(df.mode().iloc[0], inplace = True) # only for categorical columns (Step 2); iloc[0], because only first row has data and mode

df.groupby('Neighborhood')['Lot Frontage'].transform(lambda val: val.fillna(val.mean())) # fill values after groupby

# 9.How to drop all rows that have missing value in all their columns:
my_data = my_data.dropna(axis='index', how='all')

# 10.How to drop all rows that have at least one missing values in these specific columns:
my_data = my_data.dropna(axis= 0, subset['Age','Color'], how='any')

# 11.How to drop all rows that have less than 2 missing values in these specific columns:
my_data = my_data.dropna(axis= 0, subset['Age','Color'], thresh = 2)

# 12.How to drop all rows that have at least 1 missing values:
my_date = my_data.dropna()


##############################################################################
# 13.Find duplicates rows:
titanic[titanic.duplicated(keep = False)] # HOW TO SEE THE DUPLICATES
df.duplicated(keep = 'first', subset = ['Age', 'Color').sum() # how to count the duplicates in specific columns
df.duplicated()
df.duplicated(keep = 'first').sum() # how to count the duplicates
df[df.duplicated(keep = False)]

# 14.Drop duplicates (be careful!):
df.drop_duplicates(keep = 'first', ignore_index = True, inplace = True ) # how to restore the index
df.drop(index = [495,324,543], inplace = True )
df.drop_duplicates(keep = 'first', subset = ['Name'], inplace = True) # drop duplicates in the specific column

# 15.Find duplicates in the column:
df.duplicated(keep = 'first', subset=['brand'])

# 16.How to see the missing values in the columns:
df.isna().sum() / len(df)

# 17.How to visualize the missing values:
sns.heatmap(df.isnull(), cbar = False)

# 18.How to fill Nan value with its previous value:
df1 = df.fillna(method = 'ffill')

# 19.How to fill Nan value with its next value:
df1 = df.fillna(method = 'bfill')

# 20.How do drop all rows with 'None" value
df_jobs = df_jobs.replace(to_replace='None', value=np.nan).dropna()


##################################################################################################################################
 # currently married vs who are not currently married
df['DMDMARTL'] = df['DMDMARTL'].map({1:1, 2:0, 3:0, 4:0, 5:0, 6:0})
# have completed college vs don't have completed college
df['DMDEDUC2'] = df['DMDEDUC2'].map({1:0, 2:0, 3:0, 4:1, 5:1})

dz = df[['DMDMARTL', 'DMDEDUC2']].dropna() # Unmapped values from above become NA


##################################################
# How to plot missing data
plt.figure(figsize = (12,8))
sns.heatmap(titanic.notna())
plt.show()


##################################################################################################################################
# How to replace NA values by group-specific Values
mean_age = titanic.age.mean()
titanic.age.fillna(mean_age)

titanic['group_mean_age'] = titanic.groupby(['sex', 'pclass']).age.transform('mean')
titanic.age.fillna(titanic.group_mean_age, inplace = True)