# Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
import pandas as pd
def test1() -> None:
    """Print a before/after report around casting ``col1`` to float.

    Builds a three-row frame in which every ``colX`` cell holds a
    ``pd.Series``, prints a report, casts only ``col1`` to ``float``,
    prints the same report again, and finally prints the per-column
    percentage of null cells computed in two ways (via ``isnull`` and
    via ``count``).

    Returns:
        None. All results are written to standard output.
    """
    df = pd.DataFrame({
        'col1': [1, 2, 3],
        'colX': [
            pd.Series([1, 2, 3, None, 0, '']),
            pd.Series([4, 5, 6, None, 0, '']),
            pd.Series([7, 8, 9, None, 0, '']),
        ],
    })

    def report(frame):
        """Print info, head and the dtype of the first Series in ``colX``."""
        print('\n--- info ---\n')
        frame.info()
        print('\n--- head ---\n')
        print(frame.head())
        print('\n--- series dtype ---\n')
        # Positional access: the first cell of colX is itself a Series.
        print(frame['colX'].iloc[0].dtype)
        print('---')

    report(df)

    # Only col1 is cast; colX (whose cells hold Series objects) is left as-is.
    df = df.astype({"col1": float})

    report(df)

    # Percentage of null cells per column, computed two equivalent ways.
    print((df.isnull().sum(axis=0) / len(df) * 100).round(2))
    print((100 - df.count() / len(df) * 100).round(2))
def test2() -> None:
    """Split the Series stored in ``colX`` into one ``X_<i>`` column each.

    Builds a three-row frame in which every ``colX`` cell holds a
    ``pd.Series``, copies element ``i`` of every nested Series into a new
    column ``X_<i>``, drops ``colX``, casts ``col1`` to ``float`` and
    prints the per-column percentage of null cells computed in two ways
    (via ``isnull`` and via ``count``).

    Returns:
        None. All results are written to standard output.
    """
    df = pd.DataFrame({
        'col1': [1, 2, 3],
        'colX': [
            pd.Series([1, 2, 3, None, 0, '']),
            pd.Series([4, 5, 6, None, 0, '']),
            pd.Series([7, 8, 9, None, 0, '']),
        ],
    })

    # One output column per element position. The number of positions is
    # taken from the first nested Series (all rows here have equal length).
    n_items = len(df['colX'].iloc[0])
    for pos in range(n_items):
        # `pos=pos` binds the current position into the lambda explicitly.
        df['X_{}'.format(pos)] = df['colX'].apply(
            lambda series, pos=pos: series.iloc[pos]
        )

    # Remove the column that held the nested Series.
    df = df.drop(columns='colX')

    # Only col1 is cast; the X_ columns keep the dtype they were given above.
    df = df.astype({"col1": float})

    # Percentage of null cells per column, computed two equivalent ways.
    print((df.isnull().sum(axis=0) / len(df) * 100).round(2))
    print((100 - df.count() / len(df) * 100).round(2))
def test3() -> None:
    """Expand the Series stored in ``colX`` into ``X_``-prefixed columns.

    Builds a three-row frame in which every ``colX`` cell holds a
    ``pd.Series``, expands those cells into separate columns with
    ``apply(pd.Series)``, prefixes the new columns with ``X_``, drops
    ``colX``, casts ``col1`` to ``float`` and prints the per-column
    percentage of null cells computed in two ways (via ``isnull`` and via
    ``count``).

    Returns:
        None. All results are written to standard output.
    """
    df = pd.DataFrame({
        'col1': [1, 2, 3],
        'colX': [
            pd.Series([1, 2, 3, None, 0, '']),
            pd.Series([4, 5, 6, None, 0, '']),
            pd.Series([7, 8, 9, None, 0, '']),
        ],
    })

    # apply(pd.Series) is needed even though the cells already are Series:
    # it is what turns each cell into a row of a new frame.
    expanded = df['colX'].apply(pd.Series).add_prefix('X_')

    # Replace the nested column with its expanded columns (aligned on index).
    df = df.drop(columns='colX').join(expanded)

    # Only col1 is cast; the X_ columns keep the dtype they were given above.
    df = df.astype({"col1": float})

    # Percentage of null cells per column, computed two equivalent ways.
    print((df.isnull().sum(axis=0) / len(df) * 100).round(2))
    print((100 - df.count() / len(df) * 100).round(2))
# Run the three demonstrations in order; each prints its results to stdout.
test1()
test2()
test3()
# Add Comment
# Please sign in to add a comment.