# Cumulative height distributions in % for males (F_M) and females (F_F), per 5 cm class
F_M <- c(0.0, 0.0, 0.8, 6.6, 24.1, 54.2, 78.7, 92.6, 97.9, 100)
F_F <- c(3.0, 14.1, 36.2, 65.7, 88.9, 97.1, 99.9, 100, 100, 100)
# Class probabilities (in %) obtained by differencing the cumulative values
P_M <- c(F_M[1], diff(F_M))
P_F <- c(F_F[1], diff(F_F))
Height_range <- c(150, 155, 160, 165, 170, 175, 180, 185, 190, 195)
# Note: data.frame() names the P_M/100 and P_F/100 columns P_M.100 and P_F.100 (0-1 scale)
H_df <- data.frame(Height_range, F_M, F_F, P_M/100, P_F/100)
library(dplyr)
H_df <- H_df |> mutate(H = Height_range - 5,                  # representative height for each class
                       F_X = cumsum((P_M + P_F) * 0.5),       # 50/50 mixed cumulative distribution (%)
                       P_X = (P_M.100 + P_F.100) * 0.5,       # mixed class probabilities (0-1)
                       H_X = H * P_X, H_M = H * P_M.100, H_F = H * P_F.100)
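# Quick sanity check (an added sketch, not in the original): the mixed class
# probabilities should sum to 1 and the mixed cumulative distribution should end at 100.
all.equal(sum(H_df$P_X), 1)
all.equal(tail(H_df$F_X, 1), 100)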
# esp ("espérance") = expected value of a discrete distribution: E[X] = sum(x * p(x))
esp <- function(xs, pxs) {
  return(sum(xs * pxs))
}
mean_F <- esp(H_df[, "H"], H_df[, "P_F.100"])
mean_M <- esp(H_df[, "H"], H_df[, "P_M.100"])
mean_X <- esp(H_df[, "H"], H_df[, "P_X"])
# Standard deviation via the shortcut formula sqrt(E[X^2] - E[X]^2)
my_sd <- function(xs, pxs) {
  return(sqrt(esp(xs^2, pxs) - esp(xs, pxs)^2))
}
sd_F <- my_sd(H_df[, "H"], H_df[, "P_F.100"])
sd_M <- my_sd(H_df[, "H"], H_df[, "P_M.100"])
sd_X <- my_sd(H_df[, "H"], H_df[, "P_X"])
# Same quantity via the definition sqrt(E[(X - mu)^2]); both versions should agree
my_sd_v2 <- function(xs, pxs) {
  mu <- esp(xs, pxs)
  return(sqrt(sum(pxs * (xs - mu)^2)))
}
sd_v2_F <- my_sd_v2(H_df[, "H"], H_df[, "P_F.100"])
sd_v2_M <- my_sd_v2(H_df[, "H"], H_df[, "P_M.100"])
sd_v2_X <- my_sd_v2(H_df[, "H"], H_df[, "P_X"])
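# Quick check of esp() and my_sd() on a distribution with a known answer (illustrative
# example, not part of the height data): a fair six-sided die has mean 3.5 and
# standard deviation sqrt(35/12), roughly 1.708.
die_x <- 1:6
die_p <- rep(1/6, 6)
esp(die_x, die_p)        # expected: 3.5
my_sd(die_x, die_p)      # expected: ~1.708
my_sd_v2(die_x, die_p)   # should match my_sd()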
# Check normality (here: by comparing variances against normal samples with var.test)
# Simple test case: two normal samples with different standard deviations
set.seed(2)
var.test(rnorm(300, 0, 1), rnorm(300, 0, 1.2))
# Recompute the two-sided p-value by hand from the printed F statistic
# (1 - pf(f, ..., lower.tail = FALSE) is just pf(f, ...), so this is 2*P(F <= f), valid since f < 1)
2 * (1 - pf(.8148, 299, 299, lower.tail = FALSE))
# Under H0 (equal variances), the p-value should be uniformly distributed on [0, 1]
values <- c()
for (i in seq(1, 1000)) {
  values <- c(values, var.test(rnorm(300, 0, 1), rnorm(300, 0, 1))$p.value)
}
cat("Out of 1000 runs: \n\t min:", min(values), "\n\t max:", max(values), "\n\t mean:", mean(values))
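# Added sketch: under H0 the p-values should be roughly uniform on [0, 1], so about
# 5% of them should fall below 0.05.
hist(values, breaks = 20, main = "var.test p-values under H0")
mean(values < 0.05)   # should be close to 0.05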
# Our current experiment: compare each weighted-height column against a normal sample
# drawn with the corresponding mean and standard deviation (10 draws, one per height class)
FvNorm <- var.test(H_df[, "H_F"], rnorm(10, mean_F, sd_F), alternative = "two.sided")
cat("p-value: ", 2 * min(pf(FvNorm$statistic, 9, 9), pf(FvNorm$statistic, 9, 9, lower.tail = FALSE)))
MvNorm <- var.test(H_df[, "H_M"], rnorm(10, mean_M, sd_M), alternative = "two.sided")
cat("p-value: ", 2 * min(pf(MvNorm$statistic, 9, 9), pf(MvNorm$statistic, 9, 9, lower.tail = FALSE)))
XvNorm <- var.test(H_df[, "H_X"], rnorm(10, mean_X, sd_X), alternative = "two.sided")
cat("p-value: ", 2 * min(pf(XvNorm$statistic, 9, 9), pf(XvNorm$statistic, 9, 9, lower.tail = FALSE)))
# Compare the male and female columns directly
FvM <- var.test(H_df[, "H_M"], H_df[, "H_F"], alternative = "two.sided")
cat("p-value: ", 2 * min(pf(FvM$statistic, 9, 9), pf(FvM$statistic, 9, 9, lower.tail = FALSE)))
# By-hand statistic: the ratio of the two variances is an F statistic (not a Z statistic)
FvM_F <- (sd_F / sd_M)^2
cat("F-stat:", FvM_F, "\nTwo-sided p-value:", 2 * min(pf(FvM_F, 9, 9), pf(FvM_F, 9, 9, lower.tail = FALSE)))
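# Added sketch (f_by_hand is an illustrative variable, not in the original): var.test's
# statistic is just the ratio of the two sample variances, which we can reproduce by hand
# along with its two-sided p-value.
f_by_hand <- var(H_df[, "H_M"]) / var(H_df[, "H_F"])
all.equal(unname(FvM$statistic), f_by_hand)
all.equal(FvM$p.value,
          2 * min(pf(f_by_hand, 9, 9), pf(f_by_hand, 9, 9, lower.tail = FALSE)))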
###################################################
########### REGRESSION & KOLMOGOROV ###############
###################################################
# Generate random data that is linearly correlated: y = a*x + b, plus noise of increasing strength
a <- 1.5
b <- 11
x0 <- 1
x1 <- 10
N <- 1000
x <- seq(x0, x1, (x1 - x0) / N)   # N + 1 points
y <- a*x + b                      # noise-free line
set.seed(2); y_LOW   <- a*x + b + rnorm(N + 1, 0, b / 10)
set.seed(2); y_MID   <- a*x + b + rnorm(N + 1, 0, b)
set.seed(2); y_HIGH  <- a*x + b + rnorm(N + 1, 0, b * 10)
set.seed(3); y_HIGH2 <- a*x + b + rnorm(N + 1, 0, b * 10)
par(mfrow = c(2, 2))
plot(x, y)
plot(x, y_LOW)
plot(x, y_MID)
plot(x, y_HIGH)
# Compute mean and standard deviation two ways: by hand, then with the built-ins
# (the by-hand sd divides by N = length(y) - 1, so it matches sd())
sum(y)/(N+1); sqrt(sum((y - mean(y))^2)/N)
mu <- mean(y); sigma <- sd(y)
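# Added check: the by-hand values should agree with the built-ins
all.equal(sum(y)/(N + 1), mean(y))
all.equal(sqrt(sum((y - mean(y))^2)/N), sd(y))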
# Generate pure random noise with the same mean and sd as y
rdata <- rnorm(N + 1, mu, sigma)
# Plot both, together with their empirical CDFs
par(mfrow = c(2, 2))
plot(x, y)
plot(x, rdata)
plot(ecdf(y))
plot(ecdf(rdata))
# Kolmogorov-Smirnov two-sample test.
stats <- ks.test(rdata, y)
# Rejecting H0 means the two samples do not come from the same distribution.
# sqrt(-log(0.05/2)/N) is the approximate 5% critical value of D for two samples of this size.
cat("Statistic: ", stats$statistic, ">", sqrt(-log(0.05/2)/N), "?\n p-value: ", stats$p.value, "<0.05?")
# We reject H0. This is expected, since one is correlated (see plot) and the other is random noise.
# If you need to compute the p-value on your own, see: https://stats.stackexchange.com/questions/389034/kolmogorov-smirnov-test-calculating-the-p-value-manually
# It is rather tricky to compute by hand ;). A sketch of the asymptotic formula follows:
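# Sketch of the asymptotic two-sample p-value (assumes large samples and no ties;
# ks_pvalue_approx is an illustrative helper, not part of ks.test). With D the statistic
# and n_eff = n*m/(n+m), p is approximately 2 * sum over k >= 1 of (-1)^(k-1) * exp(-2 * k^2 * n_eff * D^2).
ks_pvalue_approx <- function(D, n, m, k_max = 100) {
  n_eff <- n * m / (n + m)
  k <- seq_len(k_max)
  min(1, max(0, 2 * sum((-1)^(k - 1) * exp(-2 * k^2 * n_eff * D^2))))
}
ks_pvalue_approx(stats$statistic, N + 1, N + 1)   # should be of the same (tiny) order as stats$p.value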
# Compare y_MID with y_HIGH
stats <- ks.test(y_MID, y_HIGH)
# 1.63 is the large-sample coefficient c(alpha) for alpha = 0.01; it has to be scaled
# by sqrt((n+m)/(n*m)) before being compared with the D statistic
stats$statistic > 1.63 * sqrt(2 / (N + 1))
cat(stats$statistic, stats$p.value)
stats <- ks.test(y_HIGH, y_HIGH2)
stats$statistic > 1.63 * sqrt(2 / (N + 1))
cat(stats$statistic, stats$p.value)
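# Added sketch: the scaled critical value can be wrapped in a small helper
# (ks_crit is illustrative, not a base R function), and the decision should agree
# with simply checking the p-value at the same level.
ks_crit <- function(n, m, alpha = 0.01) sqrt(-log(alpha / 2) / 2) * sqrt((n + m) / (n * m))
ks_crit(N + 1, N + 1)                     # ~1.63 * sqrt(2/(N+1))
stats$statistic > ks_crit(N + 1, N + 1)
stats$p.value < 0.01                      # should lead to the same conclusion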
#####################################################
#### Compute the Kolmogorov D statistic manually ####
#####################################################
# The Kolmogorov-Smirnov test compares the distributions (CDFs!) of two samples.
# Either one side is a known distribution (like the normal distribution) that the sample
# is tested against, or both sides are samples and we check whether they follow the same CDF.
# Kolmogorov-Smirnov uses the "D statistic", which is simply the maximum difference
# between the two CDFs (or ECDFs, empirical CDFs).
# Say y1 and y2 are two random samples.
# Then D = max(abs(ECDF(y1) - ECDF(y2))), taken over all evaluation points.
# If D is low enough, the two distributions should be close to one another!
ks_stat <- function(y1, y2 = NULL) {
  # Evaluate both CDFs on the pooled set of observed values, where the maximum gap is attained
  if (is.null(y2)) {
    grid <- sort(y1)
    f2 <- pnorm(grid, mean(y1), sd(y1))   # compare against a fitted normal CDF
  } else {
    grid <- sort(c(y1, y2))
    f2 <- ecdf(y2)(grid)
  }
  f1 <- ecdf(y1)(grid)
  D <- max(abs(f1 - f2))
  return(D)
}
ks_stat(y_MID, y_HIGH)
ks_stat(y, y_LOW)
ks_stat(y, rdata)
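# Added check: the manual D statistic should match the one reported by ks.test
all.equal(ks_stat(y_MID, y_HIGH), unname(ks.test(y_MID, y_HIGH)$statistic))
all.equal(ks_stat(y, rdata), unname(ks.test(y, rdata)$statistic))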
# Have a look at the empirical CDFs!
par(mfrow = c(2, 2))
plot(ecdf(y))
plot(ecdf(y_LOW))
plot(ecdf(y_MID))
plot(ecdf(y_HIGH))
##########################
###### CORRELATION #######
##########################
# Now that we've seen how to compare two distributions, let's see if we can find a
# correlation between variables.
# Here, we're going to use the mildly randomized data y_LOW.
par(mfrow = c(1, 1))
plot(x, y_LOW)
# We see a clear correlation between x and y_LOW. The relationship should follow:
# y_LOW = c*x + d, with c and d coefficients.
# Let's start with arbitrary values for c and d
# (note: naming a variable "c" masks base::c for variable lookup, but c(...) calls still work).
c = 0
d = 10
# Let's assume we find such a function; we'll call it y_tilde:
y_tilde = c*x + d
# What we'd expect is to find the right c and d, the ones that minimize the distance
# between all the points in the graph and the function y_tilde.
plot(x, y_LOW)
lines(x, a*x + b, col = "red", lwd = 3)
lines(x, y_tilde, col = "green", lwd = 3)
# We want to decrease the distance between the green line and the points; the best
# possible solution is the red line.
# To do so, we look at the Root Mean Squared Error (RMSE): the square root of the mean
# squared difference between y_tilde and the data points y_LOW.
RMSE <- function(y_tilde, y_ref) {
  return(sqrt(sum((y_tilde - y_ref)^2) / length(y_ref)))
}
MSE <- function(y_tilde, y_ref) {
  return(sum((y_tilde - y_ref)^2) / length(y_ref))   # mean squared error (no square root)
}
RMSE(y_tilde, y_LOW)
RMSE(a*x + b, y_LOW)
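# Added check: by definition, RMSE is the square root of MSE
all.equal(RMSE(y_tilde, y_LOW), sqrt(MSE(y_tilde, y_LOW)))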
# We see that the red line's RMSE is about one order of magnitude lower than the green one's.
# Now, to minimize that RMSE, we differentiate it and find where the derivative is zero.
# Minimizing the RMSE is the same as minimizing the sum of squared errors (SSE), and the
# error depends on both the slope and the intercept, so we take two partial derivatives:
# one with respect to the slope and one with respect to the intercept.
#
# After some computation (and substituting the optimal intercept b = mean(y) - a*mean(x)
# into the slope equation), one gets:
#
# d/db (SSE) = -2 * sum(y - y_tilde)
# d/da (SSE) = sum(2*a*(x - mean(x))^2 - 2*(x - mean(x))*(y - mean(y)))
#
# Let's evaluate these once, with the arbitrary guess (c, d):
-2 * sum(y_LOW - y_tilde)
sum(2*c*(x - mean(x))^2 - 2*(x - mean(x))*(y_LOW - mean(y_LOW)))
# With the given values of c and d, we are clearly far from 0.
# With the true a and b:
-2 * sum(y_LOW - (a*x + b))
sum(2*a*(x - mean(x))^2 - 2*(x - mean(x))*(y_LOW - mean(y_LOW)))
# We're much closer to 0. It will not be exactly zero: the random noise means the
# least-squares optimum for this sample differs slightly from the true a and b.
# Imposing d/db (SSE) = 0 and d/da (SSE) = 0 gives the closed-form solution:
# a = sum((x - mean(x))*(y - mean(y))) / sum((x - mean(x))^2)
# b = mean(y) - a*mean(x)
# Let's give it a try:
comp_a <- sum((x - mean(x))*(y_LOW - mean(y_LOW))) / sum((x - mean(x))^2)
comp_b <- mean(y_LOW) - comp_a*mean(x)
cat("Compare a: ", a, comp_a, "\nCompare b: ", b, comp_b)
# Close enough!
# Using R's built-in linear model (lm adds the intercept automatically):
summary(lm(y_LOW ~ x))
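# Added check (fit is an illustrative variable): the hand-computed coefficients should
# match lm()'s estimates (intercept first, then slope).
fit <- lm(y_LOW ~ x)
all.equal(unname(coef(fit)), c(comp_b, comp_a))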