# Cumulative height distributions in % for males (F_M) and females (F_F), per 5 cm class
F_M <- c(0.0, 0.0, 0.8, 6.6, 24.1, 54.2, 78.7, 92.6, 97.9, 100)
F_F <- c(3.0, 14.1, 36.2, 65.7, 88.9, 97.1, 99.9, 100, 100, 100)
# Class probabilities (in %) obtained by differencing the cumulative values
P_M <- c(F_M[1], diff(F_M))
P_F <- c(F_F[1], diff(F_F))
Height_range <- c(150, 155, 160, 165, 170, 175, 180, 185, 190, 195)
# Note: data.frame() names the P_M/100 and P_F/100 columns P_M.100 and P_F.100 (0-1 scale)
H_df <- data.frame(Height_range, F_M, F_F, P_M/100, P_F/100)
library(dplyr)
H_df <- H_df |> mutate(H = Height_range - 5,                  # representative height for each class
                       F_X = cumsum((P_M + P_F) * 0.5),       # 50/50 mixed cumulative distribution (%)
                       P_X = (P_M.100 + P_F.100) * 0.5,       # mixed class probabilities (0-1)
                       H_X = H * P_X, H_M = H * P_M.100, H_F = H * P_F.100)
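# Quick sanity check (an added sketch, not in the original): the mixed class
# probabilities should sum to 1 and the mixed cumulative distribution should end at 100.
all.equal(sum(H_df$P_X), 1)
all.equal(tail(H_df$F_X, 1), 100)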
# esp ("espérance") = expected value of a discrete distribution: E[X] = sum(x * p(x))
esp <- function(xs, pxs) {
  return(sum(xs * pxs))
}
mean_F <- esp(H_df[, "H"], H_df[, "P_F.100"])
mean_M <- esp(H_df[, "H"], H_df[, "P_M.100"])
mean_X <- esp(H_df[, "H"], H_df[, "P_X"])
# Standard deviation via the shortcut formula sqrt(E[X^2] - E[X]^2)
my_sd <- function(xs, pxs) {
  return(sqrt(esp(xs^2, pxs) - esp(xs, pxs)^2))
}
sd_F <- my_sd(H_df[, "H"], H_df[, "P_F.100"])
sd_M <- my_sd(H_df[, "H"], H_df[, "P_M.100"])
sd_X <- my_sd(H_df[, "H"], H_df[, "P_X"])
# Same quantity via the definition sqrt(E[(X - mu)^2]); both versions should agree
my_sd_v2 <- function(xs, pxs) {
  mu <- esp(xs, pxs)
  return(sqrt(sum(pxs * (xs - mu)^2)))
}
sd_v2_F <- my_sd_v2(H_df[, "H"], H_df[, "P_F.100"])
sd_v2_M <- my_sd_v2(H_df[, "H"], H_df[, "P_M.100"])
sd_v2_X <- my_sd_v2(H_df[, "H"], H_df[, "P_X"])
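# Quick check of esp() and my_sd() on a distribution with a known answer (illustrative
# example, not part of the height data): a fair six-sided die has mean 3.5 and
# standard deviation sqrt(35/12), roughly 1.708.
die_x <- 1:6
die_p <- rep(1/6, 6)
esp(die_x, die_p)        # expected: 3.5
my_sd(die_x, die_p)      # expected: ~1.708
my_sd_v2(die_x, die_p)   # should match my_sd()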
# Check normality (here: by comparing variances against normal samples with var.test)
# Simple test case: two normal samples with different standard deviations
set.seed(2)
var.test(rnorm(300, 0, 1), rnorm(300, 0, 1.2))
# Recompute the two-sided p-value by hand from the printed F statistic
# (1 - pf(f, ..., lower.tail = FALSE) is just pf(f, ...), so this is 2*P(F <= f), valid since f < 1)
2 * (1 - pf(.8148, 299, 299, lower.tail = FALSE))
# Under H0 (equal variances), the p-value should be uniformly distributed on [0, 1]
values <- c()
for (i in seq(1, 1000)) {
  values <- c(values, var.test(rnorm(300, 0, 1), rnorm(300, 0, 1))$p.value)
}
cat("Out of 1000 runs: \n\t min:", min(values), "\n\t max:", max(values), "\n\t mean:", mean(values))
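# Added sketch: under H0 the p-values should be roughly uniform on [0, 1], so about
# 5% of them should fall below 0.05.
hist(values, breaks = 20, main = "var.test p-values under H0")
mean(values < 0.05)   # should be close to 0.05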
# Our current experiment: compare each weighted-height column against a normal sample
# drawn with the corresponding mean and standard deviation (10 draws, one per height class)
FvNorm <- var.test(H_df[, "H_F"], rnorm(10, mean_F, sd_F), alternative = "two.sided")
cat("p-value: ", 2 * min(pf(FvNorm$statistic, 9, 9), pf(FvNorm$statistic, 9, 9, lower.tail = FALSE)))
MvNorm <- var.test(H_df[, "H_M"], rnorm(10, mean_M, sd_M), alternative = "two.sided")
cat("p-value: ", 2 * min(pf(MvNorm$statistic, 9, 9), pf(MvNorm$statistic, 9, 9, lower.tail = FALSE)))
XvNorm <- var.test(H_df[, "H_X"], rnorm(10, mean_X, sd_X), alternative = "two.sided")
cat("p-value: ", 2 * min(pf(XvNorm$statistic, 9, 9), pf(XvNorm$statistic, 9, 9, lower.tail = FALSE)))
# Compare the male and female columns directly
FvM <- var.test(H_df[, "H_M"], H_df[, "H_F"], alternative = "two.sided")
cat("p-value: ", 2 * min(pf(FvM$statistic, 9, 9), pf(FvM$statistic, 9, 9, lower.tail = FALSE)))
# By-hand statistic: the ratio of the two variances is an F statistic (not a Z statistic)
FvM_F <- (sd_F / sd_M)^2
cat("F-stat:", FvM_F, "\nTwo-sided p-value:", 2 * min(pf(FvM_F, 9, 9), pf(FvM_F, 9, 9, lower.tail = FALSE)))
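# Added sketch (f_by_hand is an illustrative variable, not in the original): var.test's
# statistic is just the ratio of the two sample variances, which we can reproduce by hand
# along with its two-sided p-value.
f_by_hand <- var(H_df[, "H_M"]) / var(H_df[, "H_F"])
all.equal(unname(FvM$statistic), f_by_hand)
all.equal(FvM$p.value,
          2 * min(pf(f_by_hand, 9, 9), pf(f_by_hand, 9, 9, lower.tail = FALSE)))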
###################################################
########### REGRESSION & KOLMOGOROV ###############
###################################################
# Generate random data that is linearly correlated: y = a*x + b, plus noise of increasing strength
a <- 1.5
b <- 11
x0 <- 1
x1 <- 10
N <- 1000
x <- seq(x0, x1, (x1 - x0) / N)   # N + 1 points
y <- a*x + b                      # noise-free line
set.seed(2); y_LOW   <- a*x + b + rnorm(N + 1, 0, b / 10)
set.seed(2); y_MID   <- a*x + b + rnorm(N + 1, 0, b)
set.seed(2); y_HIGH  <- a*x + b + rnorm(N + 1, 0, b * 10)
set.seed(3); y_HIGH2 <- a*x + b + rnorm(N + 1, 0, b * 10)
par(mfrow = c(2, 2))
plot(x, y)
plot(x, y_LOW)
plot(x, y_MID)
plot(x, y_HIGH)
# Compute mean and standard deviation two ways: by hand, then with the built-ins
# (the by-hand sd divides by N = length(y) - 1, so it matches sd())
sum(y)/(N+1); sqrt(sum((y - mean(y))^2)/N)
mu <- mean(y); sigma <- sd(y)
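# Added check: the by-hand values should agree with the built-ins
all.equal(sum(y)/(N + 1), mean(y))
all.equal(sqrt(sum((y - mean(y))^2)/N), sd(y))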
# Generate pure random noise with the same mean and sd as y
rdata <- rnorm(N + 1, mu, sigma)
# Plot both, together with their empirical CDFs
par(mfrow = c(2, 2))
plot(x, y)
plot(x, rdata)
plot(ecdf(y))
plot(ecdf(rdata))
# Kolmogorov-Smirnov two-sample test.
stats <- ks.test(rdata, y)
# Rejecting H0 means the two samples do not come from the same distribution.
# sqrt(-log(0.05/2)/N) is the approximate 5% critical value of D for two samples of this size.
cat("Statistic: ", stats$statistic, ">", sqrt(-log(0.05/2)/N), "?\n p-value: ", stats$p.value, "<0.05?")
# We reject H0. This is expected, since one is correlated (see plot) and the other is random noise.
# If you need to compute the p-value on your own, see: https://stats.stackexchange.com/questions/389034/kolmogorov-smirnov-test-calculating-the-p-value-manually
# It is rather tricky to compute by hand ;). A sketch of the asymptotic formula follows:
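# Sketch of the asymptotic two-sample p-value (assumes large samples and no ties;
# ks_pvalue_approx is an illustrative helper, not part of ks.test). With D the statistic
# and n_eff = n*m/(n+m), p is approximately 2 * sum over k >= 1 of (-1)^(k-1) * exp(-2 * k^2 * n_eff * D^2).
ks_pvalue_approx <- function(D, n, m, k_max = 100) {
  n_eff <- n * m / (n + m)
  k <- seq_len(k_max)
  min(1, max(0, 2 * sum((-1)^(k - 1) * exp(-2 * k^2 * n_eff * D^2))))
}
ks_pvalue_approx(stats$statistic, N + 1, N + 1)   # should be of the same (tiny) order as stats$p.value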
# Compare y_MID with y_HIGH
stats <- ks.test(y_MID, y_HIGH)
# 1.63 is the large-sample coefficient c(alpha) for alpha = 0.01; it has to be scaled
# by sqrt((n+m)/(n*m)) before being compared with the D statistic
stats$statistic > 1.63 * sqrt(2 / (N + 1))
cat(stats$statistic, stats$p.value)
stats <- ks.test(y_HIGH, y_HIGH2)
stats$statistic > 1.63 * sqrt(2 / (N + 1))
cat(stats$statistic, stats$p.value)
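# Added sketch: the scaled critical value can be wrapped in a small helper
# (ks_crit is illustrative, not a base R function), and the decision should agree
# with simply checking the p-value at the same level.
ks_crit <- function(n, m, alpha = 0.01) sqrt(-log(alpha / 2) / 2) * sqrt((n + m) / (n * m))
ks_crit(N + 1, N + 1)                     # ~1.63 * sqrt(2/(N+1))
stats$statistic > ks_crit(N + 1, N + 1)
stats$p.value < 0.01                      # should lead to the same conclusion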
#####################################################
#### Compute the Kolmogorov D statistic manually ####
#####################################################
# The Kolmogorov-Smirnov test compares the distributions (CDFs!) of two samples.
# Either one side is a known distribution (like the normal distribution) that the sample
# is tested against, or both sides are samples and we check whether they follow the same CDF.
# Kolmogorov-Smirnov uses the "D statistic", which is simply the maximum difference
# between the two CDFs (or ECDFs, empirical CDFs).
# Say y1 and y2 are two random samples.
# Then D = max(abs(ECDF(y1) - ECDF(y2))), taken over all evaluation points.
# If D is low enough, the two distributions should be close to one another!
ks_stat <- function(y1, y2 = NULL) {
  # Evaluate both CDFs on the pooled set of observed values, where the maximum gap is attained
  if (is.null(y2)) {
    grid <- sort(y1)
    f2 <- pnorm(grid, mean(y1), sd(y1))   # compare against a fitted normal CDF
  } else {
    grid <- sort(c(y1, y2))
    f2 <- ecdf(y2)(grid)
  }
  f1 <- ecdf(y1)(grid)
  D <- max(abs(f1 - f2))
  return(D)
}
ks_stat(y_MID, y_HIGH)
ks_stat(y, y_LOW)
ks_stat(y, rdata)
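# Added check: the manual D statistic should match the one reported by ks.test
all.equal(ks_stat(y_MID, y_HIGH), unname(ks.test(y_MID, y_HIGH)$statistic))
all.equal(ks_stat(y, rdata), unname(ks.test(y, rdata)$statistic))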
# Have a look at the empirical CDFs!
par(mfrow = c(2, 2))
plot(ecdf(y))
plot(ecdf(y_LOW))
plot(ecdf(y_MID))
plot(ecdf(y_HIGH))
##########################
###### CORRELATION #######
##########################
# Now that we've seen how to compare two distributions, let's see if we can find a
# correlation between variables.
# Here, we're going to use the mildly randomized data y_LOW.
par(mfrow = c(1, 1))
plot(x, y_LOW)
# We see a clear correlation between x and y_LOW. The relationship should follow:
# y_LOW = c*x + d, with c and d coefficients.
# Let's start with arbitrary values for c and d
# (note: naming a variable "c" masks base::c for variable lookup, but c(...) calls still work).
c = 0
d = 10
# Let's assume we find such a function; we'll call it y_tilde:
y_tilde = c*x + d
# What we'd expect is to find the right c and d, the ones that minimize the distance
# between all the points in the graph and the function y_tilde.
plot(x, y_LOW)
lines(x, a*x + b, col = "red", lwd = 3)
lines(x, y_tilde, col = "green", lwd = 3)
# We want to decrease the distance between the green line and the points; the best
# possible solution is the red line.
# To do so, we look at the Root Mean Squared Error (RMSE): the square root of the mean
# squared difference between y_tilde and the data points y_LOW.
RMSE <- function(y_tilde, y_ref) {
  return(sqrt(sum((y_tilde - y_ref)^2) / length(y_ref)))
}
MSE <- function(y_tilde, y_ref) {
  return(sum((y_tilde - y_ref)^2) / length(y_ref))   # mean squared error (no square root)
}
RMSE(y_tilde, y_LOW)
RMSE(a*x + b, y_LOW)
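# Added check: by definition, RMSE is the square root of MSE
all.equal(RMSE(y_tilde, y_LOW), sqrt(MSE(y_tilde, y_LOW)))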
# We see that the red line's RMSE is about one order of magnitude lower than the green one's.
# Now, to minimize that RMSE, we differentiate it and find where the derivative is zero.
# Minimizing the RMSE is the same as minimizing the sum of squared errors (SSE), and the
# error depends on both the slope and the intercept, so we take two partial derivatives:
# one with respect to the slope and one with respect to the intercept.
#
# After some computation (and substituting the optimal intercept b = mean(y) - a*mean(x)
# into the slope equation), one gets:
#
# d/db (SSE) = -2 * sum(y - y_tilde)
# d/da (SSE) = sum(2*a*(x - mean(x))^2 - 2*(x - mean(x))*(y - mean(y)))
#
# Let's evaluate these once, with the arbitrary guess (c, d):
-2 * sum(y_LOW - y_tilde)
sum(2*c*(x - mean(x))^2 - 2*(x - mean(x))*(y_LOW - mean(y_LOW)))
# With the given values of c and d, we are clearly far from 0.
# With the true a and b:
-2 * sum(y_LOW - (a*x + b))
sum(2*a*(x - mean(x))^2 - 2*(x - mean(x))*(y_LOW - mean(y_LOW)))
# We're much closer to 0. It will not be exactly zero: the random noise means the
# least-squares optimum for this sample differs slightly from the true a and b.
# Imposing d/db (SSE) = 0 and d/da (SSE) = 0 gives the closed-form solution:
# a = sum((x - mean(x))*(y - mean(y))) / sum((x - mean(x))^2)
# b = mean(y) - a*mean(x)
# Let's give it a try:
comp_a <- sum((x - mean(x))*(y_LOW - mean(y_LOW))) / sum((x - mean(x))^2)
comp_b <- mean(y_LOW) - comp_a*mean(x)
cat("Compare a: ", a, comp_a, "\nCompare b: ", b, comp_b)
# Close enough!
# Using R's built-in linear model (lm adds the intercept automatically):
summary(lm(y_LOW ~ x))
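# Added check (fit is an illustrative variable): the hand-computed coefficients should
# match lm()'s estimates (intercept first, then slope).
fit <- lm(y_LOW ~ x)
all.equal(unname(coef(fit)), c(comp_b, comp_a))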