Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import numpy as np
- import matplotlib.pyplot as plt
- import openpyxl as pxl
- import seaborn as sns
- import statistics
- from collections import OrderedDict
- from dateutil.relativedelta import relativedelta
- from datetime import datetime
- import textwrap
def get_val(df):
    """Return the last item yielded by iterating ``df``, or 0 if it is empty.

    The script uses this to pull a single number out of a per-column
    statistics Series (median / mean / std).
    """
    last = 0
    # Let the loop variable itself carry the most recent item.
    for last in df:
        pass
    return last
def graph_id_rating(df):
    """Count rows per ('рейтинг', 'флаг') pair and spread the flags into columns.

    The input frame is not modified. The result is indexed by 'рейтинг';
    its columns are hierarchical: (remaining column name, 'флаг' value).
    The 'report_date' count column is discarded before pivoting.
    """
    counts = df.groupby(['рейтинг', 'флаг']).count()
    by_rating = pd.DataFrame(counts).reset_index().set_index('рейтинг')
    by_rating = by_rating.drop(columns='report_date')
    return by_rating.pivot(columns='флаг')
# Build the "rows per report date, labelled by year" frame.
def graph_year_id(df):
    """Count rows per distinct 'report_date' and index the result by year.

    NOTE: the 'флаг' and 'рейтинг' columns are dropped from the caller's
    frame in place, so ``df`` is modified by this call.

    The result has one row per distinct report date (several dates that
    fall in the same year yield repeated index values) and one count
    column for every remaining column of ``df``.
    """
    for column in ('флаг', 'рейтинг'):
        df.drop(column, inplace=True, axis=1)
    per_date = df.groupby(['report_date']).count().reset_index()
    per_date['report_date'] = pd.DatetimeIndex(per_date['report_date']).year
    return per_date.set_index('report_date')
# Supported step descriptions (examples):
#   {'scale': 'month',   'scale_factor': 4}   -> every 4 months
#   {'scale': 'year',    'scale_factor': 2}   -> every 2 years
#   {'scale': 'quartal', 'scale_factor': 1}   -> every quarter (3 months)
# TODO: consider a vectorised (groupby/apply) solution instead of Python loops.
_MONTHS_PER_SCALE = {'month': 1, 'quartal': 3, 'year': 12}


def _step_in_months(step):
    """Translate a step description into a positive whole number of months."""
    scale = step['scale']
    factor = step['scale_factor']
    if scale not in _MONTHS_PER_SCALE:
        raise ValueError(
            f"unknown step scale {scale!r}; expected one of {sorted(_MONTHS_PER_SCALE)}"
        )
    # A zero, negative or fractional factor would never reach the last date.
    if factor != int(factor) or factor <= 0:
        raise ValueError(f"scale_factor must be a positive integer, got {factor!r}")
    return int(factor) * _MONTHS_PER_SCALE[scale]


def step_graph(df, step):
    """Histogram of how often each object reports again exactly on the step grid.

    For every distinct value of the second column (presumably the object
    id), the function starts at the month of that object's first row and
    walks forward ``step`` at a time until it reaches or passes the month
    of its last row. Each visited month in which the object has a row
    counts as one hit; the starting month itself is never counted.

    Args:
        df: frame whose first column holds datetime-like report dates and
            whose second column holds the grouping key. It must contain a
            column named 'report_date', which is left out of the result.
        step: dict with 'scale' ('month', 'year' or 'quartal') and a
            positive integer 'scale_factor'.

    Returns:
        DataFrame indexed by hit count (sorted ascending). Every remaining
        column of ``df`` holds the same value: the number of objects with
        that many hits.

    Raises:
        ValueError: if the scale is unknown or the factor is not a positive
            integer (previously such input made the loop spin forever).
        KeyError: if ``df`` has no 'report_date' column.
    """
    months_per_step = _step_in_months(step)

    # Months are stored as ordinals (year * 12 + month) so that stepping is
    # plain integer addition. Columns are addressed by position explicitly
    # because integer keys on a label-indexed row are deprecated in pandas.
    months_by_id = {}
    for report_date, object_id in zip(df.iloc[:, 0], df.iloc[:, 1]):
        ordinal = report_date.year * 12 + report_date.month
        months_by_id.setdefault(object_id, []).append(ordinal)

    hits_per_id = []
    for months in months_by_id.values():
        reported = set(months)
        # The range runs from the first to the last row of the object, in
        # row order. TODO: confirm rows are always sorted chronologically.
        current, last = months[0], months[-1]
        hits = 0
        while current < last:
            current += months_per_step
            if current in reported:
                hits += 1
        hits_per_id.append(hits)

    # hit count -> number of objects with that hit count
    histogram = {}
    for hits in hits_per_id:
        histogram[hits] = histogram.get(hits, 0) + 1
    for hits, n_objects in histogram.items():
        print(hits, n_objects)

    value_columns = df.columns.drop('report_date')
    counts = pd.Series(histogram, dtype='int64')
    result = pd.DataFrame({column: counts for column in value_columns}).sort_index()
    print(result)
    return result
# Read the Excel file.
step = {'scale': 'year', 'scale_factor': 2}
df = pd.read_excel("cases_sample_rebuilt.xlsx", sheet_name="Лист3")

year_dict = {}  # second-column value (presumably object id) -> report years, one per row
temp = []  # report year of every row; only its distinct values are used
# Columns are addressed by position explicitly: the first holds the report
# date, the second the grouping key. Integer keys on a label-indexed row
# are deprecated in pandas.
for report_date, object_id in zip(df.iloc[:, 0], df.iloc[:, 1]):
    year = report_date.strftime('%Y')
    temp.append(year)
    year_dict.setdefault(object_id, []).append(year)

total_years = len(set(temp))
# Per object: number of distinct years in the data minus the object's row count.
val = [total_years - len(years) for years in year_dict.values()]

# One row per object; every column of the source sheet carries the same value.
df_1 = pd.DataFrame({column: val for column in df.columns}, index=list(year_dict))

print(year_dict)
med = get_val(df_1.median())  # median
mean = get_val(df_1.mean())  # mean
mod = statistics.mode(val)  # mode
sted = get_val(df_1.std())  # standard deviation

# TODO: add a step checker.
types = ['density', 'count', 'percent', 'frequency', 'dependence id-year', 'dependence year-id', 'step']
bins = list(range(0, total_years, 1))
for graph_type in types:
    # Draw on one explicit figure/axes pair. DataFrame.plot opens a new
    # figure when no ``ax`` is given, which left the sized figure empty
    # and the plotted one unclosed.
    fig, ax = plt.subplots(figsize=(10, 10))
    if graph_type == 'dependence id-year':
        graph_id_rating(df).plot.bar(stacked=True, ax=ax)
        ax.set_xlabel('Dates')
        ax.set_ylabel('number of unique objects')
    elif graph_type == 'dependence year-id':
        # graph_year_id drops the 'флаг' and 'рейтинг' columns from df in
        # place, so it has to run after graph_id_rating.
        graph_year_id(df).plot.bar(ax=ax)
        ax.set_ylabel('number of unique objects at date')
        ax.set_xlabel('Number of dates')
    elif graph_type == 'step':
        step_graph(df, step).plot.bar(ax=ax)
        ax.set_ylabel('number of unique objects')
    else:
        sns.histplot(data=df_1, kde=True, bins=bins, stat=graph_type, legend=False, ax=ax)
        ax.set_xlabel(
            f'median:{med}, mod:{mod}, mean:{round(mean, 2)}, sted:{round(sted, 2)}',
            labelpad=20, fontsize=5, loc='center')
    fig.savefig(f'picture{graph_type}', dpi=350, bbox_inches='tight')
    plt.close(fig)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement