
Lab2 - assignment1

data = read.csv("tecator.csv", header = TRUE)
data = as.data.frame(data)

n = dim(data)[1]
set.seed(12345)
id = sample(1:n, floor(n * 0.5))

data = subset(data,
              select = -c(Protein, Moisture, Sample))

train = data[id, ]
test = data[-id, ]

train_x = subset(train, select = -c(Fat))
train_y = train$Fat

test_x = subset(test, select = -c(Fat))
test_y = test$Fat

# Question 1
fit1 = lm(formula = train_y ~ ., data = train_x)
summary(fit1)

train_mse = mean((fit1$residuals)^2)

test_pred = predict(fit1, test_x)
# The test error turns out to be extremely high compared to the
# training error
test_mse = mean((test_y - test_pred)^2)

# very low training MSE: the model fits the training data almost perfectly
train_mse
# test MSE is ~722: the linear model is badly overfit on the training data
test_mse
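
# Why the test error explodes (a quick check, assuming the usual tecator
# layout with ~100 absorbance channels as predictors): the training design
# matrix has roughly as many columns as it has rows, so least squares can
# nearly interpolate the training data, giving a tiny training MSE and a
# huge test MSE.
dim(train_x)                  # observations vs. predictors
ncol(train_x) / nrow(train_x) # ratio close to 1 => very flexible fit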

# Question 2 & 3

# What is special for LASSO is that alpha = 1
library(glmnet)
fit_lasso = glmnet(as.matrix(train_x),
                   train_y,
                   alpha = 1,
                   family = "gaussian")

# When log(lambda) is ~ -0.3 we can see 3 coefficients
# that are not 0
plot(fit_lasso, xvar = "lambda")
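
# The same thing can be read from the fit object itself: glmnet stores the
# number of non-zero coefficients (df) for every lambda on the path, so we
# can look up the log(lambda) values where exactly 3 features remain
# (provided that df value occurs on the computed path).
log(fit_lasso$lambda[fit_lasso$df == 3])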

# Ridge
# What is special for ridge is that alpha = 0
fit_ridge = glmnet(as.matrix(train_x),
                   train_y,
                   alpha = 0,
                   family = "gaussian")

plot(fit_ridge, xvar = "lambda")

# Conclusions:
# Both ridge and LASSO regression share the same basic characteristic:
# they introduce a penalty parameter lambda. But LASSO penalizes the
# features much harder than ridge regression does. For almost every
# lambda, LASSO sets some coefficients exactly to zero, i.e. removes
# those features completely, while ridge regression never removes
# features, it only regularizes (shrinks) their coefficients.
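
# Illustration of the point above at one and the same penalty value
# (lambda_demo = 0.5 is an arbitrary choice for the sketch): LASSO has
# many coefficients that are exactly zero, ridge typically has none.
lambda_demo = 0.5
sum(as.matrix(coef(fit_lasso, s = lambda_demo))[-1] == 0) # many exact zeros
sum(as.matrix(coef(fit_ridge, s = lambda_demo))[-1] == 0) # usually none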

# Question 5

cv_fit = cv.glmnet(as.matrix(train_x),
                   train_y,
                   alpha = 1,
                   family = "gaussian",
                   nfolds = 10)
# In the CV plot, if the dot (mean CV error) for one lambda lies within
# the grey error bars of another, the two are NOT statistically
# significantly different, and vice versa
plot(cv_fit)
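
# The grey bars in the plot are cvm +/- cvsd; the numbers behind them can
# also be inspected directly, e.g. at lambda.min and via lambda.1se (the
# largest lambda whose CV error is within one standard error of the minimum).
idx_min = which(cv_fit$lambda == cv_fit$lambda.min)
cv_fit$cvm[idx_min]                        # mean CV error at lambda.min
cv_fit$cvm[idx_min] + cv_fit$cvsd[idx_min] # upper end of its error bar
log(cv_fit$lambda.1se)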

# Optimal lambda (on the log scale used in the plots)
log(cv_fit$lambda.min)

# 9 features have non-zero coefficients at lambda.min
coef(cv_fit, s = "lambda.min")
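
# Counting the selected features programmatically instead of reading the
# coefficient table by eye (the intercept in the first row is excluded).
sum(as.matrix(coef(cv_fit, s = "lambda.min"))[-1] != 0)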

# Refit the LASSO model on the training data at the optimal lambda
newfit = glmnet(as.matrix(train_x),
                train_y,
                alpha = 1,
                family = "gaussian",
                lambda = cv_fit$lambda.min)

# Predict Fat for the test set with the refitted model
yhat = predict(newfit, as.matrix(test_x))
plot(test_y, yhat, col = "blue")
# Reference line y = x: points close to it indicate good predictions
lines(x = c(0:100), y = c(0:100))
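
# Putting a number on the scatter plot above: test MSE for the tuned
# LASSO model, next to the test MSE of the plain linear model from
# question 1.
lasso_test_mse = mean((test_y - yhat)^2)
lasso_test_mse
test_mse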