# Grouping, Aggregating and Transforming in Python

# Yearly totals: group on the year extracted from TransactionDate and sum
# Quantity. The grouping key is a Series named 'TransactionDate', so after
# reset_index() the year values sit in a column with that name.
df_year_quantity = (
    df_category_date_quantity
    .groupby(df_category_date_quantity['TransactionDate'].dt.year)
    .agg({'Quantity': 'sum'})
    .reset_index()
)
df_year_quantity.head()

# Year + month totals: assign() adds the helper columns 'yr' and 'mnth' to a
# new frame (the source frame is left untouched) before grouping on both.
# The result is only evaluated here, not stored.
(
    df_category_date_quantity
    .assign(
        yr=lambda frame: frame['TransactionDate'].dt.year,
        mnth=lambda frame: frame['TransactionDate'].dt.month,
    )
    .groupby(['yr', 'mnth'])
    .agg({'Quantity': 'sum'})
    .reset_index()
)

# Category + year + month totals: same approach, with the long source column
# 'ACM Application L2 Ver2' exposed under the shorter name 'category'.
df_category_year_month_quantity = (
    df_category_date_quantity
    .assign(
        category=lambda frame: frame['ACM Application L2 Ver2'],
        yr=lambda frame: frame['TransactionDate'].dt.year,
        mnth=lambda frame: frame['TransactionDate'].dt.month,
    )
    .groupby(['category', 'yr', 'mnth'])
    .agg({'Quantity': 'sum'})
    .reset_index()
)
df_category_year_month_quantity


####################################################################
# Per-director summary. Because 'revenue_musd' is given a list of functions,
# the result has two-level columns of (source column, function) pairs.
directors = df.groupby('director').agg(
    {
        'title': 'count',
        'revenue_musd': ['sum', 'mean'],
        'vote_count': 'mean',
    }
)
directors

# Top 20 directors under each metric; a two-level column is addressed by tuple.
directors.nlargest(n=20, columns=('title', 'count'))
directors.nlargest(n=20, columns=('revenue_musd', 'sum'))
directors.nlargest(n=20, columns=('revenue_musd', 'mean'))
directors.nlargest(n=20, columns=('vote_count', 'mean'))

# Named aggregation: every keyword becomes one output column built from an
# (input column, function) pair, so the result has flat, separate columns
# instead of a MultiIndex. The survival flag lives in the 'survived' column,
# the same column the other titanic snippets in this file read.
titanic.groupby('sex').agg(
    survived_total=('survived', 'sum'),
    survival_rate=('survived', 'mean'),
    mean_age=('age', 'mean'),
)

# Mean, minimum and maximum mpg for every (model_year, origin) pair, using
# named aggregation so each statistic gets its own flat column name.
cars.groupby(['model_year', 'origin']).agg(
    mean_mpg=('mpg', 'mean'),
    min_mpg=('mpg', 'min'),
    max_mpg=('mpg', 'max'),
)

##############################################################
# Single-column summaries per 'Franchise' group.
df.groupby('Franchise')['budget_musd'].mean()
df.groupby('Franchise')['revenue_musd'].mean().nlargest(20)
df.groupby('Franchise')['ROI'].median()
df.groupby('Franchise')['popularity'].mean().sort_values(ascending=False)

# Sum two columns per group, then plot the grouped result directly:
# subplots=True draws a separate bar chart for each column.
new_df = df.groupby('Franchise')[['budget_musd', 'ROI']].sum()
new_df.plot(
    kind='bar',
    subplots=True,
    figsize=(8, 15),
    fontsize=13,
)
plt.show()

################################################################
# transform() returns one value per original row (the mean of 'survived'
# within that row's sex/pclass group), so the result can be stored straight
# back into the frame as a new column.
titanic['group_survived_rate'] = (
    titanic
    .groupby(['sex', 'pclass'])['survived']
    .transform('mean')
)
titanic.head()

####################################################################
# Use apply on groups
def five_oldest_survived(group):
    """Return up to five rows of *group* with the largest 'age' among rows where 'survived' equals 1."""
    survivors = group.loc[group['survived'] == 1]
    return survivors.nlargest(5, 'age')

# Run the helper once per 'sex' group; apply() combines the per-group frames.
(
    titanic
    .groupby('sex')
    .apply(five_oldest_survived)
)

def two_most_fuel_efficient_cars(group):
    """Return the 'name' and 'mpg' columns of the (up to) two rows of *group* with the largest 'mpg'."""
    top_two = group.nlargest(2, 'mpg')
    return top_two[["name", "mpg"]]

# Apply the helper to every (model_year, origin) group and keep the combined result.
df_the_most_fuel_eff_cars = (
    cars
    .groupby(['model_year', 'origin'])
    .apply(two_most_fuel_efficient_cars)
)

#####################################################################
# Widening a grouped result for plotting: unstack() moves the inner group
# level ('origin') out of the row index into columns, leaving one row per
# 'model_year'. Values are rounded to two decimals.
df_year_origin = (
    cars
    .groupby(['model_year', 'origin'])['mpg']
    .mean()
    .unstack()
    .round(2)
)

# Same table with rows and columns swapped (evaluated only, not stored).
cars.groupby(['model_year', 'origin'])['mpg'].mean().unstack().transpose()

# One bar chart per column, each with its own x axis.
df_year_origin.plot(
    kind='bar',
    subplots=True,
    sharex=False,
    figsize=(8, 17),
    fontsize=12,
)
# NOTE(review): plt.ylabel acts on the current axes only; with subplots=True
# this presumably labels a single panel rather than all of them - confirm.
plt.ylabel('avg_mpg per year')
plt.show()