Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ##Faktoranalízis R-ben
- ##Daniel Kuknyo
- library(ggplot2)
- library(ggfortify)
- library(psych)
- library(nFactors)
- library(FactoMineR)
- ##Read into DataFrame
- mydata <- read.csv('R\\2017.csv', header=TRUE, sep=',')
- ##Reshape DataFrame
- rows <- mydata$Country
- removecol <- c("Whisker.high","Whisker.low","Country","Happiness.Rank","Happiness.Score")
- pcdata <- mydata[ , !names(mydata) %in% removecol]
- rownames(pcdata) <- rows
- ##Principal Component Analysis
- pca <- prcomp(pcdata, scale=TRUE)
- plot(pca$x[,1], pca$x[,2])
- ##Plot with eigenvectors
- autoplot(pca, loadings=TRUE, loadings.colour='blue', loadings.label=TRUE,
- loadings.label.size=5)
- ##Calculate how much the Variations Account for
- pca.var <- pca$sdev^2
- pca.var.per <- round(pca.var / sum(pca.var)*100, 1)
- barplot(pca.var.per, main="Scree Plot", xlab = "Principal Component",
- ylab = "Percent Variation",)
- ##GGplot Graph from the Data
- pca.data <- data.frame(Sample = rownames(pca$x),
- X = pca$x[,1],
- Y = pca$x[,2])
- ggplot(data=pca.data, aes(x=X, y=Y, label=Sample)) +
- geom_text() + ##plot labels instead of circles
- xlab(paste("PC1 - ", pca.var.per[1], "%", sep="")) +
- ylab(paste("PC2 - ", pca.var.per[2], "%", sep="")) +
- theme_bw() +
- ggtitle("Principle Component Diagram")
- ##Determine the loading scores to determine PC1 and PC2
- loading_scores <- pca$rotation[,1]
- country_scores <- abs(loading_scores)
- country_score_ranked <- sort(country_scores, decreasing = TRUE)
- top_countries <- names(country_score_ranked)
- pca$rotation[top_countries,1] ##show scores and sign
- ##MDS and PCoA----------------------------------------------------------------------------
- ##Create a distance matrix with scaled distances and euclidean distance
- distance.matrix <- dist(scale(pcdata, center=TRUE, scale=TRUE),
- method="euclidean")
- ##Perform multi-dimensional scaling on the distance matrix
- mds.stuff <- cmdscale(distance.matrix, eig=TRUE, x.ret=TRUE)
- ##Calculate how much variation each axis accounts for in the MDS plot
- mds.var.per <- round(mds.stuff$eig/sum(mds.stuff$eig)*100, 1)
- ##Format the data for ggplot
- mds.values <- mds.stuff$points
- mds.data <- data.frame(Sample=rownames(mds.values),
- X=mds.values[,1],
- Y=mds.values[,2])
- ##Plot MDS using any metric specified as parameter
- plotter <- function(metric){
- ggplot(data=mds.data, aes(x=X, y=Y, label=Sample)) +
- geom_text() +
- theme_bw() +
- xlab(paste("MDS1 - ", mds.var.per[1], "%", sep="")) +
- ylab(paste("MDS2 - ", mds.var.per[2], "%", sep="")) +
- ggtitle(paste("MDS plot using ", metric, " distance", sep=""))
- }
- plotter("euclidean")
- ##Calculate distance matrix using the absolute value of the log fold change---------------
- ##Calculate log2 values of measurement of samples
- mdsdata <- pcdata
- ##Remove columns containing zeros for matrix creation
- mdsdata <- pcdata
- col_sub <- apply(mdsdata, 1, function(col) all(col!=0))
- mdsdata <- mdsdata[col_sub,]
- mdlogdata <- t(mdsdata)
- log2.data.matrix <- log2(mdlogdata)
- ##Create empty matrix
- log2.distance.matrix <- matrix(0,
- nrow = ncol(log2.data.matrix),
- ncol = ncol(log2.data.matrix),
- dimnames = list(colnames(log2.data.matrix),
- colnames(log2.data.matrix)))
- ##Fill matrix with the average values of log-fold changes
- for(i in 1:ncol(log2.distance.matrix)) {
- for(j in 1:i) {
- log2.distance.matrix[i, j] <-
- mean(abs(log2.data.matrix[,i] - log2.data.matrix[,j]))
- }
- }
- ##Multi-dimensional scaling on the distance matrix
- mds.stuff <- cmdscale(as.dist(log2.distance.matrix), eig=TRUE, x.ret=TRUE)
- ##Calculate amount of variation each axis accounts for
- mds.var.per <- round(mds.stuff$eig/sum(mds.stuff$eig)*100, 1)
- ##Format for ggplot
- mds.values <- mds.stuff$points
- mds.data <- data.frame(Sample=rownames(mds.values),
- X=mds.values[,1],
- Y=mds.values[,2])
- ##Create graph using ggplot
- ggplot(data=mds.data, aes(x=X, y=Y, label=Sample)) +
- geom_text() +
- theme_bw() +
- xlab(paste("MDS1 - ", mds.var.per[1], "%", sep="")) +
- ylab(paste("MDS2 - ", mds.var.per[2], "%", sep="")) +
- ggtitle("MDS plot using avg(logFC) as the distance")
- ##Factor analysis-------------------------------------------------------------------------
- ##Determining number of factors to Extract
- ev <- eigen(cor(pcdata)) ##get eigenvalues
- ap <- parallel(subject=nrow(pcdata), var=ncol(pcdata), rep=100, cent=.05)
- nS <- nScree(x=ev$values, aparallel=ap$eigen$qevpea)
- plotnScree(nS)
- ##Principal factor analysis
- fit <- fa(pcdata, nfactors=3, rotate="varimax")
- print(fit)
- ##Maximum likelihood 2 factor analysis with varimax rotation
- fit <- factanal(pcdata, 2, rotation="varimax")
- print(fit, digits=2, cutoff=.3, sort=TRUE)
- ##Plot by factor1 and factor2
- load <- fit$loadings[,1:2]
- plot(load, type="n", col='black')
- text(load, labels=names(pcdata), cex=.7, col='black')
- result <- PCA(pcdata)
- ##Plotting factor analysis
- d.factanal <- factanal(pcdata, factors = 3, scores = 'regression')
- autoplot(d.factanal, label=TRUE, label.size=3,
- loadings=TRUE, loadings.label=TRUE, loadings.label.size=3)
- autoplot(d.factanal, data=pcdata, label=TRUE, colour='Economy..GDP.per.Capita.')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement