Advertisement
backlight0815

Untitled

Jul 29th, 2022
568
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 4.22 KB | None | 0 0
  1. ## Importing packages
  2.  
  3. # This R environment comes with all of CRAN and many other helpful packages preinstalled.
  4. # You can see which packages are installed by checking out the kaggle/rstats docker image:
  5. # https://github.com/kaggle/docker-rstats
  6.  
  7. library(tidyverse) # metapackage with lots of helpful functions
  8.  
  9. ## Running code
  10.  
  11. # In a notebook, you can run a single code cell by clicking in the cell and then hitting
  12. # the blue arrow to the left, or by clicking in the cell and pressing Shift+Enter. In a script,
  13. # you can run code by highlighting the code you want to run and then clicking the blue arrow
  14. # at the bottom of this window.
  15.  
  16. ## Reading in files
  17.  
  18. # You can access files from datasets you've added to this kernel in the "../input/" directory.
  19. # The code below loads one of those files (the termination dataset) into a data frame.
  20.  
  # Load the termination data set from the kernel's input directory.
  # stringsAsFactors is set explicitly: since R 4.0.0 read.csv() leaves text
  # columns as character by default, while the modelling steps further down
  # (featurePlot, randomForest) need factor columns for the class label and
  # the categorical predictors.
  empset <- read.csv("../input/MFG10YearTerminationData.csv",
                     stringsAsFactors = TRUE)
  ## Saving data

  # If you save any files or images, these will be put in the "output" directory. You
  # can see the output directory by committing and running your kernel (using the
  # Commit & Run button) and then checking out the compiled version of your kernel.

  # Local (non-Kaggle) alternative kept for reference:
  #setwd("D:/Kriti/practice/employee-attrition-data")
  #empset<-read.csv("D:/Kriti/practice/employee-attrition-data/MFG10YearTerminationData.csv")

  # Column types and a few leading values per column.
  str(empset)
  #View(empset)

  # Per-column summary statistics / level counts.
  summary(empset)
  35.  
  36. library(tidyr)
  37. library(ggplot2)
  38. library(dplyr)
  39. library(plyr)
  40.  
  # Year-by-status head count: one row per STATUS_YEAR, one column per STATUS
  # value. The cross-tab is built explicitly from the two columns instead of
  # relying on select() re-adding the grouping column of a grouped tibble
  # (which only works implicitly and prints an "Adding missing grouping
  # variables" message).
  statusCount <- as.data.frame.matrix(
    table(empset$STATUS_YEAR, empset$STATUS)
  )

  # Employees on record in each year (active plus terminated).
  statusCount$Total <- statusCount$ACTIVE + statusCount$TERMINATED

  # Share of each year's employees that were terminated, in percent.
  statusCount$PercentTerminate <-
    statusCount$TERMINATED / (statusCount$Total) * 100

  statusCount

  # Average yearly termination rate over all years in the data.
  mean(statusCount$PercentTerminate)
  54.  
  55.  
  # Head count per business unit, split by employment status.
  ggplot(empset, aes(x = BUSINESS_UNIT, fill = STATUS)) +
    geom_bar()

  # Keep only the records of terminated employees for the plots below.
  TerminateData <- dplyr::filter(empset, STATUS == "TERMINATED")

  # Terminations per year, coloured by termination type.
  ggplot(TerminateData, aes(x = STATUS_YEAR, fill = termtype_desc)) +
    geom_bar()

  # Terminations per year, coloured by termination reason.
  ggplot(TerminateData, aes(x = STATUS_YEAR, fill = termreason_desc)) +
    geom_bar()

  # Terminations per department, coloured by termination reason; the x-axis
  # labels are rotated so the department names do not overlap.
  ggplot(
    TerminateData,
    aes(x = as.factor(department_name), fill = as.factor(termreason_desc))
  ) +
    geom_bar() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
  66.  
  67.  
  68. library(caret)
  69.  
  # Class-conditional density and box plots of two numeric predictors.
  # The outcome is coerced with as.factor() because featurePlot() expects a
  # factor for these classification plot types, and STATUS is only a factor
  # if the CSV was read with stringsAsFactors = TRUE (not the default since
  # R 4.0.0). as.factor() is a no-op when STATUS already is a factor.
  # NOTE(review): columns 6:7 are selected by position -- presumably age and
  # length_of_service; confirm against str(empset).
  featurePlot(x = empset[, 6:7],
              y = as.factor(empset$STATUS),
              plot = "density",
              auto.key = list(columns = 2))

  featurePlot(x = empset[, 6:7],
              y = as.factor(empset$STATUS),
              plot = "box",
              auto.key = list(columns = 2))
  73.  
  74. library(rattle)
  75. library(magrittr)
  76. library(randomForest)
  # Fix the random seed so the model fits below are reproducible.
  # NOTE(review): crv is not created in this script -- presumably the settings
  # environment that comes with rattle (loaded just above); confirm.
  crv$seed <- 42
  set.seed(crv$seed)

  # Number of records in the full data set.
  empNum <- nrow(empset)

  # Time-based split: every year before 2015 trains the models, 2015 is the
  # hold-out year.
  empTrain <- subset(empset, STATUS_YEAR < 2015)
  empTest <- subset(empset, STATUS_YEAR == 2015)

  # Column roles used by the models.
  MYinput <- c(
    "age", "length_of_service", "gender_full", "STATUS_YEAR", "BUSINESS_UNIT"
  )
  MYnumeric <- c("age", "length_of_service", "STATUS_YEAR")
  MYcategoric <- c("gender_full", "BUSINESS_UNIT")
  MYtarget <- "STATUS"
  MYident <- "EmployeeID"

  # Training / test frames restricted to the predictors plus the target.
  MYTrainingData <- empTrain[, c(MYinput, MYtarget)]
  MYTestData <- empTest[, c(MYinput, MYtarget)]
  94.  
  95.  
  96. library(rpart)
  97.  
  # Classification tree for STATUS on the pre-2015 records, splitting on
  # information gain and with surrogate splits switched off.
  myrpart <- rpart(STATUS ~ .,
                   data = empTrain[, c(MYinput, MYtarget)],
                   method = "class",
                   parms = list(split = "information"),
                   control = rpart.control(usesurrogate = 0, maxsurrogate = 0))

  # Tree structure and complexity-parameter table. The tree is printed once,
  # explicitly, so the output also appears when the script is source()d.
  print(myrpart)
  printcp(myrpart)
  fancyRpartPlot(myrpart, main="Decision Tree MFG10YearTerminationData $ STATUS")

  # Predict the class of every 2015 (hold-out) record.
  # The argument must be `newdata`: predict.rpart() has no `data` parameter,
  # so `data = ...` was swallowed by `...` and the call returned the fitted
  # classes of the TRAINING rows instead of test-set predictions.
  mydt <- predict(myrpart,
                  newdata = empTest[c(MYinput, MYtarget)],
                  type = "class")

  # Actual vs. predicted status on the hold-out year. With predictions now
  # made on empTest the two vectors have the same length, so the table that
  # was previously commented out can be produced.
  table(empTest$STATUS, mydt, dnn = c("Actual", "Predicted"))

  # The caret variant below was disabled by the original author with the note
  # "error". NOTE(review): positive = "Yes" does not look like one of the
  # STATUS values used elsewhere in this script (ACTIVE / TERMINATED); fix
  # that before re-enabling.
  #confusionMatrix(data=mydt, reference = empTest$STATUS,positive = "Yes", mode = "prec_recall")
  118.  
  # Re-seed so the forest is reproducible independently of the steps above.
  set.seed(crv$seed)

  # Random forest for STATUS on the pre-2015 records: 500 trees, 2 candidate
  # variables per split, sampling without replacement, permutation importance
  # recorded, and missing values imputed by na.roughfix.
  myrf <- randomForest(STATUS ~ .,
                       data = empTrain[, c(MYinput, MYtarget)],
                       ntree = 500,
                       mtry = 2,
                       importance = TRUE,
                       na.action = randomForest::na.roughfix,
                       replace = FALSE)
  myrf

  # ROC curve and AUC confidence interval from the out-of-bag results.
  # The predictor is the OOB vote fraction for the TERMINATED class rather
  # than the hard OOB class label: a two-valued predictor collapses the ROC
  # curve to a single operating point and misstates the AUC.
  # as.numeric() strips the attributes left over from the votes matrix.
  pROC::roc(myrf$y, as.numeric(myrf$votes[, "TERMINATED"]))

  pROC::ci.auc(myrf$y, as.numeric(myrf$votes[, "TERMINATED"]))

  # Variable importance, rounded and sorted by mean decrease in accuracy.
  # The column is addressed by name instead of position 3, which was only
  # correct because STATUS has exactly two classes.
  rn <- round(randomForest::importance(myrf), 2)
  rn[order(rn[, "MeanDecreaseAccuracy"], decreasing = TRUE), ]

  # Plot the permutation (type 1) importance.
  varImpPlot(myrf, type = 1, main="Variable importance", sub="random forest model")
  141.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement