# Final R statistical analysis for the Titanic dataset

# Comparison of passenger age and ticket fare ----
# Read the passenger data; the first line of the file holds the column names.
titanic <- read.csv('D:/arik.csv', header = TRUE)
# Keep only the rows that have no missing value in any column.
x <- titanic[complete.cases(titanic), ]
# Age and fare of the complete cases side by side in one data frame.
df <- data.frame(x$Age, x$Fare)
# p holds the ages and q the ticket fares of the complete cases.
p <- x$Age
q <- x$Fare
# Open the documentation of t.test() (`?t.test` would open the same page).
help("t.test")
# Box plots of age and fare next to each other.
boxplot(p, q)
# Two-sided two-sample t test without the equal-variance assumption and
# with a 90% confidence interval. `alternative` is spelled out in full
# instead of relying on partial matching of the abbreviation `alt`.
t.test(p, q, mu = 0, alternative = "two.sided", conf.level = 0.90,
       var.equal = FALSE)

# Survivors among the complete cases ----
# Second read of the same file, kept in its own set of variables.
titanic1 <- read.csv(file = 'D:/arik.csv', header = TRUE)
# Drop every row that contains at least one NA.
x1 <- titanic1[complete.cases(titanic1), , drop = FALSE]
# Data-frame copy of the cleaned rows; the later analyses work on df1.
df1 <- data.frame(x1)
# Rows of the passengers whose Survived value equals 1.
total_survival <- x1[which(x1[["Survived"]] == 1), , drop = FALSE]
# Print the number of survivors.
nrow(total_survival)
# Open the Sex column of the survivors in the data viewer.
View(total_survival$Sex)

# Survivors as a percentage of all complete-case passengers.
total_passengers <- nrow(x1)
percentage_of_total_survival <- nrow(total_survival) / total_passengers * 100

# Share of children among the survivors ----
# Every survivor younger than 18 counts as a child.
child <- total_survival[which(total_survival[["Age"]] < 18), , drop = FALSE]
# k: number of survivors
# m: number of survivors who are children
k <- nrow(total_survival)
m <- nrow(child)
# Children as a percentage of all survivors.
total_childsurvival_percentage <- m / k * 100


# Multiple linear regression ----
# Response (dependent) variable: Survived.
# Explanatory (independent) variables: Age, Fare and Pclass.

# Open the documentation of lm().
help(topic = lm)
model <- lm(formula = Survived ~ Age + Fare + Pclass, data = df1)
# Coefficient table and fit statistics of the model.
summary(object = model)
# Diagnostic plots of the fitted model.
plot(x = model)

# Analysis-of-variance table for the same formula and data.
anova <- aov(formula = Survived ~ Age + Fare + Pclass, data = df1)
summary(object = anova)

# 95% confidence intervals of the regression coefficients.
confint(object = model, level = 0.95)


# Correlation between two of the predictors ----
# A strong correlation between age and fare would point to
# multicollinearity in the regression above.
# c1: the Age column of df1
# d1: the Fare column of df1
c1 <- df1[["Age"]]
d1 <- df1[["Fare"]]
cor(x = c1, y = d1, method = "pearson")

# Preparation for the chi-square test of independence ----
# The test checks whether two nominal variables are related. The two
# variables used here are survival status (survived / dead) and sex
# (male / female), so the 0/1 Survived dummy is first turned into a
# categorical label.
survival_binary <- ifelse(test = df1$Survived, yes = 'Survive', no = 'Dead')
# One-column matrix holding the labels, appended to the data as a new column.
new <- cbind(survival_binary)
original <- cbind(df1, new)

# Data frame that carries the additional survival_binary column.
df2 <- data.frame(original)
# Open df2 in the data viewer.
View(df2)


# Independence of sex and survival status ----
# From here on the analysis uses df2.
gender <- df2$Sex
survival <- df2$survival_binary
# Print the contingency table of sex against survival status.
table(gender, survival)
# Open the documentation of chisq.test().
help("chisq.test")

# Keep the contingency table in TAB for the plot and the tests below.
TAB <- table(gender, survival)
TAB

# Grouped bar plot of the table with a legend. `legend.text` is written out
# in full (the abbreviation `legend` only worked through partial matching),
# and TRUE replaces the reassignable shorthand T.
barplot(TAB, beside = TRUE, legend.text = TRUE)

# Chi-square test of independence; `correct = TRUE` requests the continuity
# correction that chisq.test() applies to 2 x 2 tables.
CHI <- chisq.test(TAB, correct = TRUE)
CHI
# Expected cell counts under independence.
CHI$expected

# Fisher's exact test as an alternative to the chi-square test, with a 95%
# confidence interval.
fisher.test(TAB, conf.int = TRUE, conf.level = 0.95)


# Survivor counts per group ----
total_survivals <- subset(df2, Survived == 1)
# n1: number of survivors
# n2: number of surviving children (Age < 18)
# n3: number of surviving females of any age
# n4: number of surviving adult females (Age >= 18)
# n5: number of surviving adult males

# n1 must exist before the male count further down is derived from it.
n1 <- nrow(total_survivals)

child <- subset(total_survivals, Age < 18)
# number of child survivors
n2 <- nrow(child)
# The condition needs `==`. Written as `Sex = 'female'` it is passed as a
# named argument, no filtering happens and every survivor is kept.
female <- subset(total_survivals, Sex == 'female')
# number of female survivors
n3 <- nrow(female)
# Adult women are counted directly: n3 - n2 would also subtract the
# surviving boys, who were never part of n3.
n4 <- nrow(subset(female, Age >= 18))
# Survivors who are neither children nor adult women, i.e. the adult men
# as long as Sex only takes the values 'male' and 'female'.
male <- n1 - n2 - n4
# number of male survivors
n5 <- male

number_of_total <- nrow(df2)

# total number of dead passengers
dead1 <- number_of_total - n1

# survivors as a percentage of all passengers
total_surv. <- (n1 / number_of_total) * 100
# dead passengers as a percentage of all passengers
dead <- 100 - total_surv.


# Group shares for the summary table ----
# The three groups are disjoint: children are Age < 18, men and women are
# Age >= 18. The previous `Age > 18` left 18-year-olds out of every group,
# and the male groups also contained the boys already counted as children.
# NOTE(review): the survival shares are relative to all passengers
# (number_of_total) while the death shares are relative to all deaths
# (dead1) - confirm that this mix of denominators is intended.

# 1 adult male survival percentage
total_survivals1 <- subset(df2, Survived == 1 & Sex == 'male' & Age >= 18)
number_of_male_survive_percentage <-
  (nrow(total_survivals1) / number_of_total) * 100

# 2 adult female survival percentage
total_survivals2 <- subset(df2, Survived == 1 & Sex == 'female' & Age >= 18)
number_of_female_survive_percentage <-
  (nrow(total_survivals2) / number_of_total) * 100

# 3 child survival percentage
total_survivals3 <- subset(df2, survival_binary == 'Survive' & Age < 18)
number_of_child_survive_percentage <-
  (nrow(total_survivals3) / number_of_total) * 100


# 4 adult male death percentage
total_dead1 <- subset(df2, survival_binary == 'Dead' & Sex == 'male' & Age >= 18)
no_of_male_dead_percentage <- (nrow(total_dead1) / dead1) * 100

# 5 adult female death percentage, computed from the data instead of the
# former placeholder value 0
total_dead2 <- subset(df2, survival_binary == 'Dead' & Sex == 'female' & Age >= 18)
no_of_female_dead <- (nrow(total_dead2) / dead1) * 100

# 6 child death percentage, computed from the data instead of the former
# placeholder value 0
total_dead3 <- subset(df2, survival_binary == 'Dead' & Age < 18)
no_of_child_dead <- (nrow(total_dead3) / dead1) * 100


# Summary table of the survival and death percentages ----
# The values fill the matrix column by column: the three survival
# percentages form the first column, the three death values the second.
TAB2 <- matrix(
  c(
    number_of_male_survive_percentage,
    number_of_female_survive_percentage,
    number_of_child_survive_percentage,
    no_of_male_dead_percentage,
    no_of_female_dead,
    no_of_child_dead
  ),
  ncol = 2,
  dimnames = list(
    c("Male", "Female", "Child"),
    c("Total Survival Percentage", "Total Dead Percentage")
  )
)

# Turn the labelled matrix into a table object.
TAB2 <- as.table(TAB2)


# Pie chart of the survival percentages ----

# library() stops with an error when the package is missing; require()
# would only return FALSE and let the script fail later at brewer.pal().
library(RColorBrewer)
# Open the documentation of brewer.pal().
help("brewer.pal")

# First column of TAB2: one survival percentage per group.
survival_share <- TAB2[, 1]
# Groups with a non-zero share; only these are drawn and listed.
indices <- survival_share != 0
# One colour per group, built once so that the pie and the legend agree.
# brewer.pal() needs n >= 3, so at least three colours are requested and
# the palette is cut down to the number of groups.
slice_colours <- brewer.pal(max(3, length(survival_share)),
                            'Spectral')[seq_along(survival_share)]
par(mar = c(1, 4, 4, 1))
pie(survival_share[indices],
    labels = row.names(TAB2)[indices],
    col = slice_colours[indices],
    main = 'Survival Rate by Gender')
legend("topleft",
       legend = row.names(TAB2)[indices],
       fill = slice_colours[indices])


# Children, women and men as percentages of the survivors ----
# Each group count (n2, n4, n5) is divided by the survivor count n1.
child_percentage <- n2 / n1 * 100
female_percentage <- n4 / n1 * 100
male_percentage <- n5 / n1 * 100
# Collect the three percentages in a one-row data frame.
new_dataframe <- data.frame(
  child_percentage,
  female_percentage,
  male_percentage
)
# Open the data frame in the data viewer.
View(new_dataframe)