Advertisement
HerrBear

CARE

Jan 22nd, 2024 (edited)
1,539
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 7.55 KB | Software | 0 0
  1. F_M <- c(0.0, 0.0, 0.8, 6.6, 24.1, 54.2, 78.7, 92.6, 97.9, 100)
  2. F_F <- c(3.0, 14.1, 36.2, 65.7, 88.9, 97.1, 99.9, 100, 100, 100)
  3. P_M <- c(F_M[1], diff(F_M))
  4. P_F <- c(F_F[1], diff(F_F))
  5. Height_range <- c(150, 155, 160, 165, 170, 175, 180, 185, 190, 195)
  6. H_df = data.frame(Height_range, F_M, F_F, P_M/100, P_F/100)
  7.  
  8. library(dplyr)
  9. H_df <- H_df |> mutate(H=Height_range-5, F_X=cumsum((P_M + P_F)*0.5), P_X = (P_M.100 + P_F.100)*0.5, H_X = H*P_X, H_M = H*P_M.100, H_F = H*P_F.100)
  10.  
  11. esp <- function(xs, pxs) {
  12.   return (sum(xs*pxs))
  13. }
  14.  
  15. mean_F <- esp(H_df[, "H"], H_df[, "P_F.100"])
  16. mean_M <- esp(H_df[, "H"], H_df[, "P_M.100"])
  17. mean_X <- esp(H_df[, "H"], H_df[, "P_X"])
  18.  
  19. my_sd <- function(xs, pxs) {
  20.   return (sqrt(esp(xs^2, pxs) - esp(xs, pxs)^2))
  21. }
  22.  
  23. sd_F <- my_sd(H_df[, "H"], H_df[, "P_F.100"])
  24. sd_M <- my_sd(H_df[, "H"], H_df[, "P_M.100"])
  25. sd_X <- my_sd(H_df[, "H"], H_df[, "P_X"])
  26.  
  27. my_sd_v2 <- function(xs, pxs) {
  28.   mu = esp(xs, pxs)
  29.   return (sqrt(sum(pxs*(xs - mu)^2)))
  30. }
  31.  
  32. sd_v2_F <- my_sd_v2(H_df[, "H"], H_df[, "P_F.100"])
  33. sd_v2_M <- my_sd_v2(H_df[, "H"], H_df[, "P_M.100"])
  34. sd_v2_X <- my_sd_v2(H_df[, "H"], H_df[, "P_X"])
  35.  
  36.  
  37.  
  38. # Check normalcy
  39. # Simple test-case
  40. set.seed(2)
  41. var.test(rnorm(300,0,1),rnorm(300,0,1.2))
  42. 2*(1-pf(.8148,299,299,lower.tail=FALSE))
  43.  
  44. values <- c()
  45. for (i in seq(1, 1000)) {
  46.   values <- c(values, var.test(rnorm(300,0,1), rnorm(300,0,1))$p.value)
  47. }
  48. cat("Out of 1000 runs: \n\t  min:",min(values),"\n\t  max:", max(values), "\n\t  mean:", mean(values))
  49.  
  50. # Our current experiment
  51. FvNorm <- var.test(H_df[,"H_F"], rnorm(9, mean_F, sd_F), alternative = "two.sided")
  52. cat("p-value: ", 2*(1 - pf(FvNorm$statistic, 9, 9, lower.tail=FALSE)))
  53. MvNorm <- var.test(H_df[,"H_M"], rnorm(9, mean_F, sd_F), alternative = "two.sided")
  54. cat("p-value: ", 2*(1 - pf(MvNorm$statistic, 9, 9, lower.tail=FALSE)))
  55. XvNorm <- var.test(H_df[,"H_X"], rnorm(9, mean_F, sd_F), alternative = "two.sided")
  56. cat("p-value: ", 2*(1 - pf(XvNorm$statistic, 9, 9, lower.tail=FALSE)))
  57.  
  58. FvM <- var.test(H_df[,"H_M"], H_df[,"H_F"], alternative = "two.sided")
  59. cat("p-value: ", 2*(1 - pf(FvM$statistic, 9, 9, lower.tail=FALSE)))
  60.  
  61. FvM_Z <- sd_F / sd_M *(9/9)
  62. cat("Z-stat:",FvM_Z,".\nIs Z stat lower than:",2*(1-pf(FvM_Z, 9, 9, lower.tail=FALSE)))
  63.  
  64.  
  65. ###################################################
  66. ########### REGRESSION & KOLMOGOROV ###############
  67. ###################################################
  68.  
  69. # Random data, that's correlated :
  70. a <- 1.5
  71. b <- 11
  72. x0 <- 1
  73. x1 <- 10
  74. N <- 1000
  75.  
  76. x <- seq(x0, x1, (x1-x0)/N)
  77. y <- a*x+b
  78. set.seed(2); y_LOW <- a*x+b + rnorm(N+1,0,b/10)
  79. set.seed(2); y_MID <- a*x+b + rnorm(N+1,0,b)
  80. set.seed(2); y_HIGH <- a*x+b + rnorm(N+1,0,b*10)
  81. set.seed(3); y_HIGH2 <- a*x+b + rnorm(N+1,0,b*10)
  82. par(mfrow=c(2,2))
  83. plot(x, y)
  84. plot(x, y_LOW)
  85. plot(x, y_MID)
  86. plot(x, y_HIGH)
  87.  
  88.  
  89. # Compute mean and standard deviation (two ways)
  90. sum(y)/(N+1); sqrt(sum((y-mean(y))^2)/N)
  91. mu <- mean(y); sigma <- sd(y)
  92.  
  93. # Generate random noise
  94. rdata <- rnorm(N+1, mu, sigma)
  95. # Plot both
  96. par(mfrow=c(2, 2))
  97. plot(x, y)
  98. plot(x, rdata)
  99. plot(ecdf(y))
  100. plot(ecdf(rdata))
  101.  
  102. # Kolmogorov-Smirnov two-sample test.
  103. stats <- ks.test(rdata, y)
  104. # Rejecting H0, means both distribution do not follow the same distribution.
  105. cat("Statistic: ", stats$statistic, ">", sqrt(-log(0.05/2)/N), "?\n p-value: ", stats$p.value, "<0.05?")
  106. # We reject H0. This is expected, since one is correlated (see plot) and the other is random noise.
  107. # If you need to compute p-value on your own, see : https://stats.stackexchange.com/questions/389034/kolmogorov-smirnov-test-calculating-the-p-value-manually
  108. # It is rather difficult to get on your own ;). Here is how to retrieve that value:
  109.  
  110. # Compare y_MID with y_HIGH
  111. stats <- ks.test(y_MID, y_HIGH)
  112. stats$statistic > 1.63  # Approximate value for large samples and same size samples
  113. cat(stats$statistic, stats$p.value)
  114. stats <- ks.test(y_HIGH, y_HIGH2)
  115. stats$statistic > 1.63
  116. cat(stats$statistic, stats$p.value)
  117.  
  118. #####################################################
  119. #### Compute the Kolmogorov D statistic manually ####
  120. #####################################################
  121. # The kolmogorov-smirnov compares the distribution (CDF!) from two samples.
  122. # Either one is a given known distribution (like the normal distribution), and the other
  123. # has to be tested against it
  124. # Or both are unknown samples, and we want to see if they both follow the same CDF
  125.  
  126. # Kolmogorov-Smirnov will make use of the "D-statistic" which is simply the
  127. # Maximum difference between the two samples' CDF (or ECDF, Estimated CDF).
  128. # Say we have y1 and y2 two random samples variables
  129. # Then D = max(abs(ECDF(y1) - ECDF(y2))). If low enough, then both should be close to one another!
  130.  
  131. ks_stat <- function(y1, y2=NULL){
  132.   N = length(y1)
  133.   f1 = ecdf(y1)(seq(1, N))
  134.   if (is.null(y2)) {
  135.     f2 = pnorm(seq(1, N), mean(y1), sd(y1))
  136.   }
  137.   else {
  138.     f2 = ecdf(y2)(seq(1, N))
  139.   }
  140.   D = max(abs(f1-f2))
  141.   return (D)
  142. }
  143. ks_stat(y_MID, y_HIGH)
  144. ks_stat(y, y_LOW)
  145. ks_stat(y, rdata)
  146.  
  147. # Have a look at the estimated CDFs!
  148. par(mfrow=c(2,2))
  149. plot(ecdf(y))
  150. plot(ecdf(y_LOW))
  151. plot(ecdf(y_MID))
  152. plot(ecdf(y_HIGH))
  153.  
  154. ##########################
  155. ###### CORRELATION #######
  156. ##########################
  157.  
  158. # Now that we've seen how to compare two different distributions. Let's see if we can find a correlation between variables
  159. # Here, we're gonna use the mildly randomized data y_LOW
  160. par(mfrow=c(1,1))
  161. plot(x, y_LOW)
  162. # We see a clear correlation between x and y_LOW. The equation should follow :
  163. # y_LOW = cx + d with c and d coefficients.
  164.  
  165. # Let's start with arbitrary values for c and d
  166. c = 0
  167. d = 10
  168.  
  169. # Let's assume we find such a function. We'll call it
  170. y_tilde = c*x+d
  171.  
  172. # What we'd expect, is to find the right c and d, that minimize the distance between all the points
  173. # in the graph to that function y_tilde
  174. plot(x, y_LOW)
  175. lines(x, a*x+b, col="red", lwd=3)
  176. lines(x, y_tilde, col="green", lwd=3)
  177. # We want to decrease the distance between the green line and the points, the best solution
  178. # is the red line
  179.  
  180. # To do so, we need to look at the Root Mean Squared Error, which is the square root of
  181. # the squared difference between y_tilde and the random data points y_LOW
  182.  
  183. RMSE <- function(y_tilde, y_ref) {
  184.   return (sqrt(sum((y_tilde-y_ref)^2)/length(y_ref)))
  185. }
  186. MSE <- function(y_tilde, y_ref) {
  187.   return (sqrt(sum((y_tilde-y_ref)^2)/length(y_ref)))
  188. }
  189. RMSE(y_tilde, y_LOW)
  190. RMSE(a*x+b, y_LOW)
  191. # We see that the red line is one order of magnitude lower than the green one
  192.  
  193. # Now, to minimize that RMSE, we need to derive it, and find when its zero.
  194. # RMSE depends on both "c" and "d", thus, the derivation is two-fold.
  195. # Partial derivation against "c" and against "d"
  196. #
  197. # After some computation, one gets:
  198. #
  199. # d/db (RMSE) = -2 sum(y - y_tilde)
  200. # d/da (RMSE) = sum(2a(x-mean(x))^2 - 2(x-mean(x))*(y-mean(y)))
  201. #
  202.  
  203. # Let's try once :
  204. -2 * sum(y_LOW - y_tilde)
  205. -2 * sum(2*c*(x-mean(x))^2 - 2*(x-mean(x))*(y_LOW-mean(y_LOW)))
  206. # With the given values of c and d, we are clearly far from 0
  207. # With a and b:
  208. -2 * sum(y_LOW - (a*x+b))
  209. -2 * sum(2*a*(x-mean(x))^2 - 2*(x-mean(x))*(y_LOW-mean(y_LOW)))
  210. # We're closer to 0. It never can reach zero, due to bias in the model!
  211.  
  212. # Imposing d/db (RMSE) = 0 and d/da (RMSE) = 0 gives:
  213. # a = sum((x-mean(x))*(y-mean(y)))/sum((x-mean(x))^2)
  214. # b = mean(y) - a*mean(x)
  215. # Let's give it a try:
  216. comp_a <- sum((x-mean(x))*(y_LOW-mean(y_LOW)))/sum((x-mean(x))^2)
  217. comp_b <- mean(y_LOW) - a*mean(x)        
  218. cat("Compare a : ", a, comp_a, "\nCompare b: ", b, comp_b)
  219. # Close enough!
  220.  
  221. # Using R built-in methods :
  222. #summary(lm.fit(as.matrix(x), y_LOW))
  223.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement