Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Step 1: compare passenger age with ticket fare using a two-sample t-test.
- # Read the CSV into `titanic`; the Age and Fare columns are used below.
- titanic <- read.csv("D:/arik.csv", header = TRUE)
- # Keep only the rows that have no missing value in any column.
- x <- titanic[complete.cases(titanic), ]
- # Put the two columns under test side by side in a small data frame.
- df <- data.frame(x$Age, x$Fare)
- # p holds the passenger ages, q holds the ticket fares.
- p <- x$Age
- q <- x$Fare
- # Open the reference page for t.test (interactive aid only).
- help("t.test")
- # Draw both samples as side-by-side boxplots.
- boxplot(p, q)
- # Two-sided t-test without assuming equal variances, 90% confidence interval.
- # `alternative` is spelled out in full instead of relying on partial matching of `alt`.
- t.test(p, q, mu = 0, alternative = "two.sided", conf.level = 0.90, var.equal = FALSE)
- # Step 2: count the survivors and the share of children among them.
- # Load the same CSV a second time under a separate name.
- titanic1 <- read.csv("D:/arik.csv", header = TRUE)
- # Drop every row that contains a missing value.
- x1 <- titanic1[complete.cases(titanic1), ]
- # Copy the cleaned rows into the data frame used by the later steps.
- df1 <- data.frame(x1)
- # Survivors are the rows whose Survived flag equals 1.
- total_survival <- subset(x1, subset = Survived == 1)
- # k = number of survivors; printing it shows the total on the console.
- k <- nrow(total_survival)
- k
- # Inspect the Sex column of the survivors in the data viewer.
- View(total_survival$Sex)
- # Survivors as a percentage of all passengers with complete records.
- total_passengers <- nrow(x1)
- percentage_of_total_survival <- (k / total_passengers) * 100
- # Everyone younger than 18 is treated as a child.
- child <- subset(total_survival, subset = Age < 18)
- # m = number of surviving children.
- m <- nrow(child)
- # Children as a percentage of all survivors.
- total_childsurvival_percentage <- (m / k) * 100
- # Step 3: multiple linear regression on the cleaned data in df1.
- # Survived is the response; Age, Fare and Pclass are the predictors.
- # Open the reference page for lm (interactive aid only).
- help(lm)
- model <- lm(Survived ~ Age + Fare + Pclass, data = df1)
- # Coefficient table and fit statistics.
- summary(model)
- # Diagnostic plots for the fitted model.
- plot(model)
- # Analysis-of-variance table for the same formula.
- anova <- aov(Survived ~ Age + Fare + Pclass, data = df1)
- summary(anova)
- # 95% confidence intervals for the regression coefficients.
- confint(model, level = 0.95)
- # Pearson correlation between two predictors as a quick multicollinearity check.
- # c1 is the Age column, d1 is the Fare column.
- c1 <- df1[["Age"]]
- d1 <- df1[["Fare"]]
- cor(c1, d1, method = "pearson")
- # Step 4: test whether survival is independent of sex.
- # A chi-square test of independence checks for a significant relationship
- # between two nominal variables; here they are sex (male / female) and
- # survival outcome (Survive / Dead).
- # Turn the 0/1 Survived flag into a label, comparing against 1 explicitly
- # instead of relying on numeric-to-logical coercion.
- survival_binary <- ifelse(df1$Survived == 1, "Survive", "Dead")
- new <- cbind(survival_binary)
- original <- cbind(df1, new)
- # df2 is df1 plus the new survival_binary column.
- df2 <- data.frame(original)
- View(df2)
- # From here on the analysis works with df2.
- gender <- df2$Sex
- survival <- df2$survival_binary
- # Open the reference page for chisq.test (interactive aid only).
- help("chisq.test")
- # Build the sex-by-outcome contingency table once and print it.
- TAB <- table(gender, survival)
- TAB
- # Grouped bar plot of the table; legend.text = TRUE labels the bars with the table's row names.
- barplot(TAB, beside = TRUE, legend.text = TRUE)
- # Chi-square test of independence with continuity correction.
- CHI <- chisq.test(TAB, correct = TRUE)
- CHI
- # Expected cell counts under independence.
- CHI$expected
- # Fisher's exact test as an alternative to the chi-square test.
- fisher.test(TAB, conf.int = TRUE, conf.level = 0.95)
- # Step 5: head-counts behind the survival breakdown, taken from df2.
- # n1 = all survivors
- # n2 = surviving children (Age < 18)
- # n3 = surviving females of any age
- # n4 = surviving adult females (Age >= 18)
- # n5 = surviving adult males (Age >= 18)
- total_survivals <- subset(df2, Survived == 1)
- # n1 is computed first because the male count below is derived from it.
- n1 <- nrow(total_survivals)
- child <- subset(total_survivals, Age < 18)
- # Number of child survivors.
- n2 <- nrow(child)
- # The comparison must use `==`: with a single `=` the condition becomes a named argument that subset() ignores, so every row is kept.
- female <- subset(total_survivals, Sex == "female")
- # Number of female survivors of any age.
- n3 <- nrow(female)
- # Count the adult women directly; n3 - n2 would also subtract the surviving boys.
- n4 <- sum(female$Age >= 18)
- # Whatever is neither a child nor an adult woman is an adult man.
- male <- n1 - n2 - n4
- # Number of adult male survivors.
- n5 <- male
- number_of_total <- nrow(df2)
- # Total number of deaths.
- dead1 <- number_of_total - n1
- # Survivors as a percentage of all passengers.
- total_surv. <- (n1 / number_of_total) * 100
- # Deaths as a percentage of all passengers.
- dead <- 100 - total_surv.
- # Step 6: percentage table with rows Male / Female / Child and one column each for survivors and deaths.
- # Adults are Age >= 18 and children are Age < 18, so each passenger is counted in exactly one row.
- # Survival percentages are relative to all passengers (number_of_total).
- # Death percentages are relative to the number of deaths (dead1), as in the original male figure.
- # NOTE(review): the two columns use different denominators - confirm this is intended.
- # 1 adult male survival percentage
- total_survivals1 <- subset(df2, Survived == 1 & Sex == "male" & Age >= 18)
- number_of_male_survive_percentage <- (nrow(total_survivals1) / number_of_total) * 100
- # 2 adult female survival percentage (Age >= 18 so that 18-year-olds are not dropped)
- total_survivals2 <- subset(df2, Survived == 1 & Sex == "female" & Age >= 18)
- number_of_female_survive_percentage <- (nrow(total_survivals2) / number_of_total) * 100
- # 3 child survival percentage
- total_survivals3 <- subset(df2, survival_binary == "Survive" & Age < 18)
- number_of_child_survive_percentage <- (nrow(total_survivals3) / number_of_total) * 100
- # 4 adult male death percentage
- total_dead1 <- subset(df2, survival_binary == "Dead" & Sex == "male" & Age >= 18)
- no_of_male_dead_percentage <- (nrow(total_dead1) / dead1) * 100
- # 5 adult female death percentage (previously a hard-coded 0)
- total_dead2 <- subset(df2, survival_binary == "Dead" & Sex == "female" & Age >= 18)
- no_of_female_dead <- (nrow(total_dead2) / dead1) * 100
- # 6 child death percentage (previously a hard-coded 0)
- total_dead3 <- subset(df2, survival_binary == "Dead" & Age < 18)
- no_of_child_dead <- (nrow(total_dead3) / dead1) * 100
- # Fill the matrix column by column: survival percentages first, death percentages second.
- TAB2 <- matrix(c(number_of_male_survive_percentage, number_of_female_survive_percentage, number_of_child_survive_percentage, no_of_male_dead_percentage, no_of_female_dead, no_of_child_dead), ncol = 2)
- colnames(TAB2) <- c("Total Survival Percentage", "Total Dead Percentage")
- row.names(TAB2) <- c("Male", "Female", "Child")
- # Convert the matrix into a table object.
- TAB2 <- as.table(TAB2)
- # Step 7: pie chart of the survival-percentage column (column 1) of TAB2.
- # library() stops with an error if RColorBrewer is not installed; require() would only return FALSE and let the script fail later.
- library(RColorBrewer)
- # Open the reference page for brewer.pal (interactive aid only).
- ?brewer.pal
- # Set the plot margins (bottom, left, top, right) for the chart.
- par(mar = c(1, 4, 4, 1))
- # One colour per row of TAB2, computed once so the slices and the legend always match.
- slice_colours <- brewer.pal(nrow(TAB2), "Spectral")
- pie(TAB2[, 1], labels = row.names(TAB2), col = slice_colours, main = "Survival Rate by Gender")
- legend("topleft", legend = row.names(TAB2), fill = slice_colours)
- # Step 8: express the child, female and male survivor counts (n2, n4, n5,
- # computed earlier in the script) as percentages of all survivors (n1).
- child_percentage <- (n2 / n1) * 100
- female_percentage <- (n4 / n1) * 100
- male_percentage <- (n5 / n1) * 100
- # Collect the three percentages in a one-row data frame with matching column names.
- new_dataframe <- data.frame(child_percentage = child_percentage, female_percentage = female_percentage, male_percentage = male_percentage)
- # Show the result in the data viewer.
- View(new_dataframe)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement