Advertisement
BasicTask

Faktoranalízis R

Feb 10th, 2020
581
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 5.22 KB | None | 0 0
  1. ##Faktoranalízis R-ben
  2. ##Daniel Kuknyo
  3.  
  4. library(ggplot2)
  5. library(ggfortify)
  6. library(psych)
  7. library(nFactors)
  8. library(FactoMineR)
  9.  
  10.  
  11. ##Read into DataFrame
  12. mydata <- read.csv('R\\2017.csv', header=TRUE, sep=',')
  13.  
  14. ##Reshape DataFrame
  15. rows <- mydata$Country
  16. removecol <- c("Whisker.high","Whisker.low","Country","Happiness.Rank","Happiness.Score")
  17. pcdata <- mydata[ , !names(mydata) %in% removecol]
  18. rownames(pcdata) <- rows
  19.  
  20.  
  21. ##Principal Component Analysis
  22. pca <- prcomp(pcdata, scale=TRUE)
  23. plot(pca$x[,1], pca$x[,2])
  24.  
  25.  
  26. ##Plot with eigenvectors
  27. autoplot(pca, loadings=TRUE, loadings.colour='blue', loadings.label=TRUE,
  28.          loadings.label.size=5)
  29.  
  30.  
  31. ##Calculate how much the Variations Account for
  32. pca.var <- pca$sdev^2
  33. pca.var.per <- round(pca.var / sum(pca.var)*100, 1)
  34. barplot(pca.var.per, main="Scree Plot", xlab = "Principal Component",
  35.         ylab = "Percent Variation",)
  36.  
  37.  
  38. ##GGplot Graph from the Data
  39. pca.data <- data.frame(Sample = rownames(pca$x),
  40.                        X = pca$x[,1],
  41.                        Y = pca$x[,2])
  42.  
  43. ggplot(data=pca.data, aes(x=X, y=Y, label=Sample)) +
  44.   geom_text() + ##plot labels instead of circles
  45.   xlab(paste("PC1 - ", pca.var.per[1], "%", sep="")) +
  46.   ylab(paste("PC2 - ", pca.var.per[2], "%", sep="")) +
  47.   theme_bw() +
  48.   ggtitle("Principle Component Diagram")
  49.  
  50.  
  51. ##Determine the loading scores to determine PC1 and PC2
  52. loading_scores <- pca$rotation[,1]
  53. country_scores <- abs(loading_scores)
  54. country_score_ranked <- sort(country_scores, decreasing = TRUE)
  55. top_countries <- names(country_score_ranked)
  56.  
  57. pca$rotation[top_countries,1] ##show scores and sign
  58.  
  59. ##MDS and PCoA----------------------------------------------------------------------------
  60. ##Create a distance matrix with scaled distances and euclidean distance
  61. distance.matrix <- dist(scale(pcdata, center=TRUE, scale=TRUE),
  62.   method="euclidean")
  63.  
  64. ##Perform multi-dimensional scaling on the distance matrix
  65. mds.stuff <- cmdscale(distance.matrix, eig=TRUE, x.ret=TRUE)
  66.  
  67. ##Calculate how much variation each axis accounts for in the MDS plot
  68. mds.var.per <- round(mds.stuff$eig/sum(mds.stuff$eig)*100, 1)
  69.  
  70. ##Format the data for ggplot
  71. mds.values <- mds.stuff$points
  72. mds.data <- data.frame(Sample=rownames(mds.values),
  73.                        X=mds.values[,1],
  74.                        Y=mds.values[,2])
  75.  
  76. ##Plot MDS using any metric specified as parameter  
  77. plotter <- function(metric){
  78.   ggplot(data=mds.data, aes(x=X, y=Y, label=Sample)) +
  79.     geom_text() +
  80.     theme_bw() +
  81.     xlab(paste("MDS1 - ", mds.var.per[1], "%", sep="")) +
  82.     ylab(paste("MDS2 - ", mds.var.per[2], "%", sep="")) +
  83.     ggtitle(paste("MDS plot using ", metric, " distance", sep=""))
  84. }
  85. plotter("euclidean")
  86.  
  87. ##Calculate distance matrix using the absolute value of the log fold change---------------
  88. ##Calculate log2 values of measurement of samples
  89. mdsdata <- pcdata
  90.  
  91. ##Remove columns containing zeros for matrix creation
  92. mdsdata <- pcdata
  93. col_sub <- apply(mdsdata, 1, function(col) all(col!=0))
  94. mdsdata <- mdsdata[col_sub,]
  95.  
  96. mdlogdata <- t(mdsdata)
  97. log2.data.matrix <- log2(mdlogdata)
  98.  
  99. ##Create empty matrix
  100. log2.distance.matrix <- matrix(0,
  101.   nrow = ncol(log2.data.matrix),
  102.   ncol = ncol(log2.data.matrix),
  103.   dimnames = list(colnames(log2.data.matrix),
  104.     colnames(log2.data.matrix)))
  105.  
  106.  
  107. ##Fill matrix with the average values of log-fold changes
  108. for(i in 1:ncol(log2.distance.matrix)) {
  109.   for(j in 1:i) {
  110.     log2.distance.matrix[i, j] <-
  111.       mean(abs(log2.data.matrix[,i] - log2.data.matrix[,j]))
  112.   }
  113. }
  114.  
  115. ##Multi-dimensional scaling on the distance matrix
  116. mds.stuff <- cmdscale(as.dist(log2.distance.matrix), eig=TRUE, x.ret=TRUE)
  117.  
  118. ##Calculate amount of variation each axis accounts for
  119. mds.var.per <- round(mds.stuff$eig/sum(mds.stuff$eig)*100, 1)
  120.  
  121. ##Format for ggplot
  122. mds.values <- mds.stuff$points
  123. mds.data <- data.frame(Sample=rownames(mds.values),
  124.   X=mds.values[,1],
  125.   Y=mds.values[,2])
  126.  
  127. ##Create graph using ggplot
  128. ggplot(data=mds.data, aes(x=X, y=Y, label=Sample)) +
  129.   geom_text() +
  130.   theme_bw() +
  131.   xlab(paste("MDS1 - ", mds.var.per[1], "%", sep="")) +
  132.   ylab(paste("MDS2 - ", mds.var.per[2], "%", sep="")) +
  133.   ggtitle("MDS plot using avg(logFC) as the distance")
  134.  
  135.  
  136. ##Factor analysis-------------------------------------------------------------------------
  137. ##Determining number of factors to Extract
  138. ev <- eigen(cor(pcdata)) ##get eigenvalues
  139. ap <- parallel(subject=nrow(pcdata), var=ncol(pcdata), rep=100, cent=.05)
  140. nS <- nScree(x=ev$values, aparallel=ap$eigen$qevpea)
  141. plotnScree(nS)
  142.  
  143. ##Principal factor analysis
  144. fit <- fa(pcdata, nfactors=3, rotate="varimax")
  145. print(fit)
  146.  
  147. ##Maximum likelihood 2 factor analysis with varimax rotation
  148. fit <- factanal(pcdata, 2, rotation="varimax")
  149. print(fit, digits=2, cutoff=.3, sort=TRUE)
  150.  
  151. ##Plot by factor1 and factor2
  152. load <- fit$loadings[,1:2]
  153. plot(load, type="n", col='black')
  154. text(load, labels=names(pcdata), cex=.7, col='black')  
  155.  
  156. result <- PCA(pcdata)
  157.  
  158. ##Plotting factor analysis
  159. d.factanal <- factanal(pcdata, factors = 3, scores = 'regression')
  160.  
  161. autoplot(d.factanal, label=TRUE, label.size=3,
  162.          loadings=TRUE, loadings.label=TRUE, loadings.label.size=3)
  163.  
  164. autoplot(d.factanal, data=pcdata, label=TRUE, colour='Economy..GDP.per.Capita.')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement