Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Packages ----
# plyr is attached BEFORE dplyr on purpose: attaching it afterwards masks
# dplyr verbs such as summarise(), mutate(), arrange() and count().
library(plyr)
library(tidyr)
library(ggplot2)
library(dplyr)
library(tidyverse)
# Import the data ----
# NOTE(review): absolute, machine-specific path; adjust before running on
# another machine.
employee <- read.csv(
  file = "c:\\Users\\Asus\\OneDrive\\Desktop\\employee_attrition.csv",
  header = TRUE,
  sep = ","
)
# Different ways of previewing the data ----
head(employee)      # first 6 rows (default)
head(employee, 10)  # first 10 rows
tail(employee)      # last 6 rows (default)
tail(employee, 10)  # last 10 rows
str(employee)       # column types and a sample of values
summary(employee)   # per-column summary
View(employee)      # open the dataset in the table viewer
# Rename the columns ----
# The names are applied by position, so the file must have exactly as many
# columns as there are names here; fail loudly instead of mis-labelling.
column_names <- c(
  "Employee_ID(PK)", "Record_Date", "Birthday_Date", "Orighire_Date",
  "Termination_Date", "Age", "Length_of_Service", "City", "Department",
  "Job_Title", "Store_Number", "Gender", "Gender_Full", "Term_desc",
  "Type_term", "Status_Year", "Status", "Business_unit"
)
stopifnot(ncol(employee) == length(column_names))
names(employee) <- column_names
names(employee)
# Data cleaning ----
# Drop the columns that are not used in the rest of the analysis.
employee$Gender_Full <- NULL
employee$Store_Number <- NULL
employee$Birthday_Date <- NULL
# The column is named "Type_term" (lower-case t); the previous spelling
# "Type_Term" matched nothing, so the column was never removed.
employee$Type_term <- NULL
employee$Orighire_Date <- NULL
View(employee)
# How the data is stored ----
class(employee)
length(employee)  # number of columns
ncol(employee)    # number of columns
nrow(employee)    # number of rows
summary(employee)

# Value ranges ----
# Trailing values are the results recorded in the original script.
# Highest age in the company
max(employee$Age)  # 65
# Lowest age in the company
min(employee$Age)  # 19
# Longest length of service in the company
max(employee$Length_of_Service)  # 26
# Shortest length of service in the company
min(employee$Length_of_Service)  # 0
# Row subsets by condition ----
# Condition on categorical data. The other filters in this section compare
# Gender to "M"/"F", so the same coding is used here.
employee[employee$Gender == "M", ]

# Male employees older than 60
male <- employee[(employee$Age > 60) & (employee$Gender == "M"), ]
nrow(male)
View(male)

# How many employees are terminated
Number_Terminated <- employee[employee$Status == "TERMINATED", ]
nrow(Number_Terminated)
View(Number_Terminated)

# How many employees served more than 14 years and are terminated
Number_Terminated_1 <- employee[
  (employee$Length_of_Service > 14) & (employee$Status == "TERMINATED"),
]
nrow(Number_Terminated_1)
View(Number_Terminated_1)

# Female employees older than 60
Female <- employee[(employee$Age > 60) & (employee$Gender == "F"), ]
nrow(Female)
View(Female)
# Categorise the length of service ----
service_levels <- factor(employee$Length_of_Service)
levels(service_levels)   # list the categories
nlevels(service_levels)  # how many categories there are
View(employee)
# Termination rate per year ----
# Cross-tabulate Status_Year (rows) against Status (columns), then turn the
# contingency table into a data frame with one column per status value.
statusCount <- as.data.frame.matrix(
  employee %>%
    select(Status_Year, Status) %>%
    table()
)
statusCount$Total <- statusCount$ACTIVE + statusCount$TERMINATED
statusCount$PercentTerminate <-
  statusCount$TERMINATED / (statusCount$Total) * 100
statusCount
# Average yearly termination percentage
mean(statusCount$PercentTerminate)
# Analysis part ----
# Terminated employees only
TerminateData <- employee %>%
  filter(Status == "TERMINATED")

# Terminations per year, split by termination description
ggplot(TerminateData) +
  geom_bar(aes(x = Status_Year, fill = Term_desc))

# Terminations by length of service
ggplot(TerminateData) +
  geom_bar(aes(x = Length_of_Service, fill = Status))

# Terminations per department, split by termination description;
# x labels are rotated so the department names do not overlap.
ggplot(TerminateData) +
  geom_bar(aes(x = as.factor(Department), fill = as.factor(Term_desc))) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

# Terminated employees younger than 40, by age and termination description
AgeData <- employee %>%
  filter(Age < 40 & Status == "TERMINATED")
ggplot(AgeData) +
  geom_bar(aes(x = Age, fill = Term_desc))
# Age against length of service ----
# Line plot
ggplot(data = employee, aes(x = Age, y = Length_of_Service)) +
  geom_line()

# Scatter plot: how long employees of each age have served
# NOTE(review): xlim()/ylim() remove points outside the limits (with a
# warning). The ranges recorded earlier in this script (age up to 65, service
# up to 26) exceed them; confirm the clipping is intended.
ggplot(employee, aes(x = Age, y = Length_of_Service, color = Age)) +
  geom_point() +
  xlim(10, 60) +
  ylim(0, 25)
# Density of Age and Length_of_Service by employment status ----
library(caret)
# Predictors are selected by name so the plot does not depend on column
# positions; the outcome is passed as a factor so each status gets its own
# density curve.
featurePlot(
  x = employee[, c("Age", "Length_of_Service")],
  y = as.factor(employee$Status),
  plot = "density",
  auto.key = list(columns = 2)
)
View(employee)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement