Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- library(cvq2)
- #library(leaps)
- #Q2TEST f-ja kaip ir PHASE Q2 lygiai tokia pati
- q2test<-function(activity, predicted_activity) {
- prediction_error_sq<-(predicted_activity-activity)^2
- avg_activity<-mean(activity)
- sigma_y_sq<-(activity-avg_activity)^2
- q2test_val<-1-sum(prediction_error_sq)/sum(sigma_y_sq)
- return(q2test_val)
- }
- # cia is karto jau pKd skirtumas imtas:
- selectivity<-read.table("selectivityCA12.csv", sep=",", header=TRUE)
- qsar12_1<-data.frame(K<-selectivity$CA12.1)
- qsar12_1$x<-as.matrix(read.table("edragon_descriptors_fix4.csv", sep=",", skip=1))
- #random numbers:
- #> test <- sort(round(runif(10, 1, 40)))
- #> test
- test <- c(2, 3, 8, 9, 10, 16, 18, 19, 29, 35)
- #tada:
- train <- c(1, 4, 5, 6, 7, 11, 12, 13, 14, 15, 17, 20, 21, 22, 23, 24, 25, 26, 27, 28, 30, 31, 32, 33, 34, 36, 37, 38, 39, 40)
- r2=NULL
- r2good=NULL
- for (i in 1:1317 ) {
- fit.single<-lm(qsar12_1$K[train]~qsar12_1$x[train,i])
- r2[i]<-summary(fit.single)$r.squared
- if(r2[i]>0.25) {
- r2good[i]<-r2[i]
- }
- }
- good_deskr_nr<-NULL
- for (i in 1:1317 ) {
- if(r2[i]>0.25) {
- good_deskr_nr<-c(good_deskr_nr, i)
- }
- }
- #leaps<-regsubsets(qsar12_1$K[train]~qsar12_1$x[train,good_deskr_nr], data=qsar12_1, nvmax=3)
- #plot(leaps, scale="r2")
- CA12_1_geri_deskr<-c(105, 333, 1205)
- qsar_12_1_train<-lm(qsar12_1$K[train] ~ qsar12_1$x[train, CA12_1_geri_deskr[1]] + qsar12_1$x[train, CA12_1_geri_deskr[2]] + qsar12_1$x[train, CA12_1_geri_deskr[3]])
- print(summary(qsar_12_1_train))
- qsar_12_1_test_pred_values<-coef(qsar_12_1_train)[1]+coef(qsar_12_1_train)[2]*qsar12_1$x[test, CA12_1_geri_deskr[1]]+coef(qsar_12_1_train)[3]*qsar12_1$x[test, CA12_1_geri_deskr[2]]+coef(qsar_12_1_train)[4]*qsar12_1$x[test, CA12_1_geri_deskr[3]]
- qsar_12_1_test<-lm(qsar_12_1_test_pred_values ~ qsar12_1$K[test])
- print(summary(qsar_12_1_test))
- x<-cbind(qsar12_1$x[train,c(CA12_1_geri_deskr[1], CA12_1_geri_deskr[2], CA12_1_geri_deskr[3])], qsar12_1$K[train])
- colnames(x)[4]<-"y"
- qsar_12_1_q2<-cvq2(x)
- print(qsar_12_1_q2)
- #==========================================
- qsar12_2<-data.frame(K<-selectivity$CA12.2)
- qsar12_2$x<-as.matrix(read.table("edragon_descriptors_fix4.csv", sep=",", skip=1))
- r2=NULL
- r2good=NULL
- for (i in 1:1317 ) {
- fit.single<-lm(qsar12_2$K[train]~qsar12_2$x[train,i])
- r2[i]<-summary(fit.single)$r.squared
- if(r2[i]>0.25) {
- r2good[i]<-r2[i]
- }
- }
- good_deskr_nr<-NULL
- for (i in 1:1317 ) {
- if(r2[i]>0.25) {
- good_deskr_nr<-c(good_deskr_nr, i)
- }
- }
- #kaip ir be leaps variantas:
- #tada imti didziausia is r2good ir salinti koreliacijas su kitais, rasti kuris nekoreliuoja
- #r2very_good istrintos koreliacijos su 1231 kuris labai geras sitame....
- ##leaps<-regsubsets(qsar12_2$K[train]~qsar12_2$x[train,good_deskr_nr], data=qsar12_2, nvmax=3)
- #plot(leaps, scale="r2")
- CA12_2_geri_deskr<-c(292, 308, 311)
- qsar_12_2_train<-lm(qsar12_2$K[train] ~ qsar12_2$x[train, CA12_2_geri_deskr[1]] + qsar12_2$x[train, CA12_2_geri_deskr[2]] + qsar12_2$x[train, CA12_2_geri_deskr[3]])
- print(summary(qsar_12_2_train))
- qsar_12_2_test_pred_values<-coef(qsar_12_2_train)[1]+coef(qsar_12_2_train)[2]*qsar12_2$x[test, CA12_2_geri_deskr[1]]+coef(qsar_12_2_train)[3]*qsar12_2$x[test, CA12_2_geri_deskr[2]]+coef(qsar_12_2_train)[4]*qsar12_2$x[test, CA12_2_geri_deskr[3]]
- qsar_12_2_test<-lm(qsar_12_2_test_pred_values ~ qsar12_2$K[test])
- print(summary(qsar_12_2_test))
- x<-cbind(qsar12_2$x[train,c(CA12_2_geri_deskr[1], CA12_2_geri_deskr[2], CA12_2_geri_deskr[3])], qsar12_2$K[train])
- colnames(x)[4]<-"y"
- qsar_12_2_q2<-cvq2(x)
- print(qsar_12_2_q2)
- #=============================================
- qsar12_6<-data.frame(K<-selectivity$CA12.6)
- qsar12_6$x<-as.matrix(read.table("edragon_descriptors_fix4.csv", sep=",", skip=1))
- r2=NULL
- r2good=NULL
- for (i in 1:1317 ) {
- fit.single<-lm(qsar12_6$K[train]~qsar12_6$x[train,i])
- r2[i]<-summary(fit.single)$r.squared
- if(r2[i]>0.25) {
- r2good[i]<-r2[i]
- }
- }
- good_deskr_nr<-NULL
- for (i in 1:1317 ) {
- if(r2[i]>0.25) {
- good_deskr_nr<-c(good_deskr_nr, i)
- }
- }
- #leaps<-regsubsets(qsar12_6$K[train]~qsar12_6$x[train,good_deskr_nr], data=qsar12_6, nvmax=3)
- #plot(leaps, scale="r2")
- CA12_6_geri_deskr<-c(38, 283, 299)
- qsar_12_6_train<-lm(qsar12_6$K[train] ~ qsar12_6$x[train, CA12_6_geri_deskr[1]] + qsar12_6$x[train, CA12_6_geri_deskr[2]] + qsar12_6$x[train, CA12_6_geri_deskr[3]])
- print(summary(qsar_12_6_train))
- qsar_12_6_test_pred_values<-coef(qsar_12_6_train)[1]+coef(qsar_12_6_train)[2]*qsar12_6$x[test, CA12_6_geri_deskr[1]]+coef(qsar_12_6_train)[3]*qsar12_6$x[test, CA12_6_geri_deskr[2]]+coef(qsar_12_6_train)[4]*qsar12_6$x[test, CA12_6_geri_deskr[3]]
- qsar_12_6_test<-lm(qsar_12_6_test_pred_values ~ qsar12_6$K[test])
- print(summary(qsar_12_6_test))
- x<-cbind(qsar12_6$x[train,c(CA12_6_geri_deskr[1], CA12_6_geri_deskr[2], CA12_6_geri_deskr[3])], qsar12_6$K[train])
- colnames(x)[4]<-"y"
- qsar_12_6_q2<-cvq2(x)
- print(qsar_12_6_q2)
- #===============================================
- qsar12_7<-data.frame(K<-selectivity$CA12.7)
- qsar12_7$x<-as.matrix(read.table("edragon_descriptors_fix4.csv", sep=",", skip=1))
- r2=NULL
- r2good=NULL
- for (i in 1:1317 ) {
- fit.single<-lm(qsar12_7$K[train]~qsar12_7$x[train,i])
- r2[i]<-summary(fit.single)$r.squared
- if(r2[i]>0.25) {
- r2good[i]<-r2[i]
- }
- }
- good_deskr_nr<-NULL
- for (i in 1:1317 ) {
- if(r2[i]>0.25) {
- good_deskr_nr<-c(good_deskr_nr, i)
- }
- }
- #leaps<-regsubsets(qsar12_7$K[train]~qsar12_7$x[train,good_deskr_nr], data=qsar12_7, nvmax=3)
- #plot(leaps, scale="r2")
- CA12_7_geri_deskr<-c(300, 463, 841)
- qsar_12_7_train<-lm(qsar12_7$K[train] ~ qsar12_7$x[train, CA12_7_geri_deskr[1]] + qsar12_7$x[train, CA12_7_geri_deskr[2]] + qsar12_7$x[train, CA12_7_geri_deskr[3]])
- print(summary(qsar_12_7_train))
- qsar_12_7_test_pred_values<-coef(qsar_12_7_train)[1]+coef(qsar_12_7_train)[2]*qsar12_7$x[test, CA12_7_geri_deskr[1]]+coef(qsar_12_7_train)[3]*qsar12_7$x[test, CA12_7_geri_deskr[2]]+coef(qsar_12_7_train)[4]*qsar12_7$x[test, CA12_7_geri_deskr[3]]
- qsar_12_7_test<-lm(qsar_12_7_test_pred_values ~ qsar12_7$K[test])
- print(summary(qsar_12_7_test))
- x<-cbind(qsar12_7$x[train,c(CA12_7_geri_deskr[1], CA12_7_geri_deskr[2], CA12_7_geri_deskr[3])], qsar12_7$K[train])
- colnames(x)[4]<-"y"
- qsar_12_7_q2<-cvq2(x)
- print(qsar_12_7_q2)
- #===============================================
- qsar12_13<-data.frame(K<-selectivity$CA12.13)
- qsar12_13$x<-as.matrix(read.table("edragon_descriptors_fix4.csv", sep=",", skip=1))
- r2=NULL
- r2good=NULL
- for (i in 1:1317 ) {
- fit.single<-lm(qsar12_13$K[train]~qsar12_13$x[train,i])
- r2[i]<-summary(fit.single)$r.squared
- if(r2[i]>0.25) {
- r2good[i]<-r2[i]
- }
- }
- good_deskr_nr<-NULL
- for (i in 1:1317 ) {
- if(r2[i]>0.25) {
- good_deskr_nr<-c(good_deskr_nr, i)
- }
- }
- #leaps<-regsubsets(qsar12_13$K[train]~qsar12_13$x[train,good_deskr_nr], data=qsar12_13, nvmax=3)
- #plot(leaps, scale="r2")
- CA12_13_geri_deskr<-c(278, 300, 1153)
- qsar_12_13_train<-lm(qsar12_13$K[train] ~ qsar12_13$x[train, CA12_13_geri_deskr[1]] + qsar12_13$x[train, CA12_13_geri_deskr[2]] + qsar12_13$x[train, CA12_13_geri_deskr[3]])
- print(summary(qsar_12_13_train))
- qsar_12_13_test_pred_values<-coef(qsar_12_13_train)[1]+coef(qsar_12_13_train)[2]*qsar12_13$x[test, CA12_13_geri_deskr[1]]+coef(qsar_12_13_train)[3]*qsar12_13$x[test, CA12_13_geri_deskr[2]]+coef(qsar_12_13_train)[4]*qsar12_13$x[test, CA12_13_geri_deskr[3]]
- qsar_12_13_test<-lm(qsar_12_13_test_pred_values ~ qsar12_13$K[test])
- print(summary(qsar_12_13_test))
- x<-cbind(qsar12_13$x[train,c(CA12_13_geri_deskr[1], CA12_13_geri_deskr[2], CA12_13_geri_deskr[3])], qsar12_13$K[train])
- colnames(x)[4]<-"y"
- qsar_12_13_q2<-cvq2(x)
- print(qsar_12_13_q2)
- #===============================================
- qsarSUM<-data.frame(K<-selectivity$SUMsCA12)
- qsarSUM$x<-as.matrix(read.table("edragon_descriptors_fix4.csv", sep=",", skip=1))
- r2=NULL
- r2good=NULL
- for (i in 1:1317 ) {
- fit.single<-lm(qsarSUM$K[train]~qsarSUM$x[train,i])
- r2[i]<-summary(fit.single)$r.squared
- if(r2[i]>0.25) {
- r2good[i]<-r2[i]
- }
- }
- good_deskr_nr<-NULL
- for (i in 1:1317 ) {
- if(r2[i]>0.25) {
- good_deskr_nr<-c(good_deskr_nr, i)
- }
- }
- #leaps<-regsubsets(qsarSUM$K[train]~qsarSUM$x[train,good_deskr_nr], data=qsarSUM, nvmax=3)
- #plot(leaps, scale="r2")
- SUM_geri_deskr<-c(292, 695, 1167)
- qsar_SUM_train<-lm(qsarSUM$K[train] ~ qsarSUM$x[train, SUM_geri_deskr[1]] + qsarSUM$x[train, SUM_geri_deskr[2]] + qsarSUM$x[train, SUM_geri_deskr[3]])
- print(summary(qsar_SUM_train))
- qsar_SUM_test_pred_values<-coef(qsar_SUM_train)[1]+coef(qsar_SUM_train)[2]*qsarSUM$x[test, SUM_geri_deskr[1]]+coef(qsar_SUM_train)[3]*qsarSUM$x[test, SUM_geri_deskr[2]]+coef(qsar_SUM_train)[4]*qsarSUM$x[test, SUM_geri_deskr[3]]
- qsar_SUM_test<-lm(qsar_SUM_test_pred_values ~ qsarSUM$K[test])
- print(summary(qsar_SUM_test))
- x<-cbind(qsarSUM$x[train,c(SUM_geri_deskr[1], SUM_geri_deskr[2], SUM_geri_deskr[3])], qsarSUM$K[train])
- colnames(x)[4]<-"y"
- qsar_SUM_q2<-cvq2(x)
- print(qsar_SUM_q2)
- #===============================================
- #grafiko asys nuo/iki:
- minK<--7.4
- maxK<-1.8
- qsar12_1_sv<-coef(qsar_12_1_train)[1]+coef(qsar_12_1_train)[2]*qsar12_1$x[, CA12_1_geri_deskr[1]]+coef(qsar_12_1_train)[3]*qsar12_1$x[, CA12_1_geri_deskr[2]]+coef(qsar_12_1_train)[4]*qsar12_1$x[, CA12_1_geri_deskr[3]]
- qsar12_2_sv<-coef(qsar_12_2_train)[1]+coef(qsar_12_2_train)[2]*qsar12_2$x[, CA12_2_geri_deskr[1]]+coef(qsar_12_2_train)[3]*qsar12_2$x[, CA12_2_geri_deskr[2]]+coef(qsar_12_2_train)[4]*qsar12_2$x[, CA12_2_geri_deskr[3]]
- qsar12_6_sv<-coef(qsar_12_6_train)[1]+coef(qsar_12_6_train)[2]*qsar12_6$x[, CA12_6_geri_deskr[1]]+coef(qsar_12_6_train)[3]*qsar12_6$x[, CA12_6_geri_deskr[2]]+coef(qsar_12_6_train)[4]*qsar12_6$x[, CA12_6_geri_deskr[3]]
- qsar12_7_sv<-coef(qsar_12_7_train)[1]+coef(qsar_12_7_train)[2]*qsar12_7$x[, CA12_7_geri_deskr[1]]+coef(qsar_12_7_train)[3]*qsar12_7$x[, CA12_7_geri_deskr[2]]+coef(qsar_12_7_train)[4]*qsar12_7$x[, CA12_7_geri_deskr[3]]
- qsar12_13_sv<-coef(qsar_12_13_train)[1]+coef(qsar_12_13_train)[2]*qsar12_13$x[, CA12_13_geri_deskr[1]]+coef(qsar_12_13_train)[3]*qsar12_13$x[, CA12_13_geri_deskr[2]]+coef(qsar_12_13_train)[4]*qsar12_13$x[, CA12_13_geri_deskr[3]]
- qsarSUM_sv<-coef(qsar_SUM_train)[1]+coef(qsar_SUM_train)[2]*qsarSUM$x[, SUM_geri_deskr[1]]+coef(qsar_SUM_train)[3]*qsarSUM$x[, SUM_geri_deskr]+coef(qsar_SUM_train)[4]*qsarSUM$x[, SUM_geri_deskr[3]]
- #grafikui<-cbind(qsar$K, qsar1, qsar2, qsar3)
- #colnames(grafikui)[1]<-"pKd"
- mod_qsar12_1<-lm(qsar12_1_sv[train]~qsar12_1$K[train])
- mod_qsar12_2<-lm(qsar12_2_sv[train]~qsar12_2$K[train])
- mod_qsar12_6<-lm(qsar12_6_sv[train]~qsar12_6$K[train])
- mod_qsar12_7<-lm(qsar12_7_sv[train]~qsar12_7$K[train])
- mod_qsar12_13<-lm(qsar12_13_sv[train]~qsar12_13$K[train])
- mod_qsarSUM<-lm(qsarSUM_sv[train]~qsarSUM$K[train])
- mod_qsar12_1t<-lm(qsar12_1_sv[test]~qsar12_1$K[test])
- mod_qsar12_2t<-lm(qsar12_2_sv[test]~qsar12_2$K[test])
- mod_qsar12_6t<-lm(qsar12_6_sv[test]~qsar12_6$K[test])
- mod_qsar12_7t<-lm(qsar12_7_sv[test]~qsar12_7$K[test])
- mod_qsar12_13t<-lm(qsar12_13_sv[test]~qsar12_13$K[test])
- mod_qsarSUMt<-lm(qsarSUM_sv[test]~qsarSUM$K[test])
- png("Edita2013_specifiskumo_grafikas_train.png", width=600, height=900)
- par(mfrow=c(3,2), mar=c(1,1,0,0), oma=c(6,6,0,0), cex.axis=2)
- plot(qsar12_1$K[train], qsar12_1_sv[train], tck = 0.02, pch=15, cex=3, xlim=c(minK, maxK), ylim=c(minK, maxK), ann=FALSE, xaxt="n")
- abline(mod_qsar12_1)
- title(bquote(atop("CA XII vs CA I", R^2==0.84)), line = -3, cex.main=3)
- #title('QSAR 12 vs 1 train set', line = -3, cex.main=3)
- axis(1,col.axis = "transparent", tck = 0.02)
- plot(qsar12_2$K[train], qsar12_2_sv[train], tck = 0.02, pch=15, cex=3, xlim=c(minK, maxK), ylim=c(minK, maxK), xlab=NA, ylab=NA, xaxt="n", yaxt="n")
- abline(mod_qsar12_2)
- title(bquote(atop("CA XII vs CA II", R^2==0.81)), line = -3, cex.main=3)
- #title('QSAR 12 vs 2 train set', line = -3, cex.main=3)
- axis(1,col.axis = "transparent", tck = 0.02)
- axis(2,col.axis = "transparent", tck = 0.02)
- plot(qsar12_6$K[train], qsar12_6_sv[train], tck = 0.02, pch=15, cex=3, xlim=c(minK, maxK), ylim=c(minK, maxK), ann=FALSE, xaxt="n")
- abline(mod_qsar12_6)
- title(bquote(atop("CA XII vs CA VI", R^2==0.82)), line = -3, cex.main=3)
- #title('QSAR 12 vs 6 train set', line = -3, cex.main=3)
- plot(qsar12_7$K[train], qsar12_7_sv[train], tck = 0.02, pch=15, cex=3, xlim=c(minK, maxK), ylim=c(minK, maxK), ann=FALSE, xaxt="n", yaxt="n")
- abline(mod_qsar12_7)
- title(bquote(atop("CA XII vs CA VII", R^2==0.83)), line = -3, cex.main=3)
- #title('QSAR 12 vs 7 train set', line = -3, cex.main=3)
- axis(1,col.axis = "transparent", tck = 0.02)
- axis(2,col.axis = "transparent", tck = 0.02)
- plot(qsar12_13$K[train], qsar12_13_sv[train], tck = 0.02, pch=15, cex=3, xlim=c(minK, maxK), ylim=c(minK, maxK), ann=FALSE)
- abline(mod_qsar12_13)
- title(bquote(atop("CA XII vs CA XIII", R^2==0.78)), line = -3, cex.main=3)
- #title('QSAR 12 vs 13 train set', line = -3, cex.main=3)
- plot(qsarSUM$K[train], qsarSUM_sv[train], tck = 0.02, pch=15, cex=3, xlim=c(minK, maxK), ylim=c(minK, maxK), yaxt="n", cex.lab=3)
- abline(mod_qsarSUM)
- title(expression(paste(sum(), "CA XII, ", R^2==0.78)), line = -3, cex.main=3)
- #title('QSAR sum train set', line = -3, cex.main=3)
- axis(2,col.axis = "transparent", tck = 0.02)
- mtext(expression(paste(pK[d], ' difference (experimental)')), SOUTH<-1, line=2.5, cex=2, outer=TRUE)
- mtext(expression(paste(pK[d], ' difference (calculated)')), WEST<-2, line=2.5, cex=2, outer=TRUE)
- dev.off()
- png("Edita2013_specifiskumo_grafikas_test.png", width=600, height=900)
- par(mfrow=c(3,2), mar=c(1,1,0,0), oma=c(6,6,0,0), cex.axis=2)
- plot(qsar12_1$K[test], qsar12_1_sv[test], tck = 0.02, pch=15, cex=3, xlim=c(minK, maxK), ylim=c(minK, maxK), ann=FALSE, xaxt="n")
- abline(mod_qsar12_1t)
- title(bquote(atop("CA XII vs CA I", R^2==0.79)), line = -3, cex.main=3)
- #title('QSAR 12 vs 1 test set', line = -3, cex.main=3)
- axis(1,col.axis = "transparent", tck = 0.02)
- plot(qsar12_2$K[test], qsar12_2_sv[test], tck = 0.02, pch=15, cex=3, xlim=c(minK, maxK), ylim=c(minK, maxK), xlab=NA, ylab=NA, xaxt="n", yaxt="n")
- abline(mod_qsar12_2t)
- title(bquote(atop("CA XII vs CA II", R^2==0.58)), line = -3, cex.main=3)
- #title('QSAR 12 vs 2 test set', line = -3, cex.main=3)
- axis(1,col.axis = "transparent", tck = 0.02)
- axis(2,col.axis = "transparent", tck = 0.02)
- plot(qsar12_6$K[test], qsar12_6_sv[test], tck = 0.02, pch=15, cex=3, xlim=c(minK, maxK), ylim=c(minK, maxK), ann=FALSE, xaxt="n")
- abline(mod_qsar12_6t)
- title(bquote(atop("CA XII vs CA VI", R^2==0.42)), line = -3, cex.main=3)
- #title('QSAR 12 vs 6 test set', line = -3, cex.main=3)
- plot(qsar12_7$K[test], qsar12_7_sv[test], tck = 0.02, pch=15, cex=3, xlim=c(minK, maxK), ylim=c(minK, maxK), ann=FALSE, xaxt="n", yaxt="n")
- abline(mod_qsar12_7t)
- title(bquote(atop("CA XII vs CA VII", R^2==0.49)), line = -3, cex.main=3)
- #title('QSAR 12 vs 7 test set', line = -3, cex.main=3)
- axis(1,col.axis = "transparent", tck = 0.02)
- axis(2,col.axis = "transparent", tck = 0.02)
- plot(qsar12_13$K[test], qsar12_13_sv[test], tck = 0.02, pch=15, cex=3, xlim=c(minK, maxK), ylim=c(minK, maxK), ann=FALSE)
- abline(mod_qsar12_13t)
- title(bquote(atop("CA XII vs CA XIII", R^2==0.71)), line = -3, cex.main=3)
- #title('QSAR 12 vs 13 test set', line = -3, cex.main=3)
- plot(qsarSUM$K[test], qsarSUM_sv[test], tck = 0.02, pch=15, cex=3, xlim=c(minK, maxK), ylim=c(minK, maxK), yaxt="n", cex.lab=3)
- abline(mod_qsarSUMt)
- title(expression(paste(sum(), "CA XII, ", R^2==0.58)), line = -3, cex.main=3)
- #title('QSAR sum test set', line = -3, cex.main=3)
- axis(2,col.axis = "transparent", tck = 0.02)
- mtext(expression(paste(pK[d], ' difference (experimental)')), SOUTH<-1, line=2.5, cex=2, outer=TRUE)
- mtext(expression(paste(pK[d], ' difference (calculated)')), WEST<-2, line=2.5, cex=2, outer=TRUE)
- dev.off()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement