Loading the data and the LASSO-selected variables
## load the LASSO feature-selected variables (one row per coefficient)
coefs <- read.csv("../data/lasso_only_numeric.csv",
                  stringsAsFactors = FALSE)
## drop the intercept coefficient; base subsetting is used so this step does
## not depend on dplyr being attached (otherwise `filter` resolves to
## stats::filter and fails). Rows with a missing name are dropped as well,
## which is what dplyr::filter would do.
coefs <- coefs[!is.na(coefs$coefficient) &
                 coefs$coefficient != "(Intercept)", , drop = FALSE]
rownames(coefs) <- NULL
## keep at most the first 20 variables, in file order; head() avoids the NA
## padding that `[1:20]` produces when fewer than 20 coefficients remain
top20vars <- head(coefs$coefficient, 20)
## load cleaned data
vih_data <- read.csv("../data/cleandata.csv", stringsAsFactors = FALSE)
## use the CD4 increase variable (Delta_CD4_year1) as the output for lm
vih_data$output <- vih_data$Delta_CD4_year1
## keep only the selected predictors plus the output column for the linear
## model; drop = FALSE guarantees the result stays a data frame
input <- vih_data[, names(vih_data) %in% c(top20vars, "output"), drop = FALSE]
Fitting the linear model
## fit a linear model of output regressed on every other column of input
lModel <- lm(output ~ ., data = input)
## predictions on the same rows the model was fitted on
preds <- predict(object = lModel, newdata = input)
## show residual quantiles, the coefficient table and the fit statistics
summary(lModel)
##
## Call:
## lm(formula = output ~ ., data = input)
##
## Residuals:
## Min 1Q Median 3Q Max
## -81.384 -22.534 3.531 24.990 74.294
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 236.17815 87.78879 2.690 0.009916 **
## LeucocitosS0 -6.42580 3.16210 -2.032 0.047938 *
## HematocritoS0 -1.39398 1.25261 -1.113 0.271550
## CreatininaS0 -34.28025 24.06836 -1.424 0.161112
## CD4porcentajeS0 -19.33717 2.58284 -7.487 1.71e-09 ***
## CocCD4_CD8_S08 -16.58811 105.91933 -0.157 0.876237
## CD4porcentajeS12 5.58676 3.71513 1.504 0.139472
## CocCD4_CD8_S12 50.72623 161.39470 0.314 0.754715
## GlucosaS24 -0.87249 0.52366 -1.666 0.102482
## Acido_uricoS24 -3.75077 4.41956 -0.849 0.400457
## CreatininaS24 88.97258 37.70871 2.359 0.022601 *
## TGO_S24 -0.18963 0.16837 -1.126 0.265895
## CD4_S24 -0.05159 0.09466 -0.545 0.588387
## CocCD4_CD8_S24 72.04397 138.84589 0.519 0.606334
## CocCD4_CD8_S39 161.35314 78.03988 2.068 0.044332 *
## CD4porcentajeS52 8.87100 3.61355 2.455 0.017933 *
## CD8_S52 0.18229 0.02039 8.939 1.27e-11 ***
## CD8porcentajeS52 -4.28224 1.13940 -3.758 0.000481 ***
## CocCD4_CD8_S52 61.31527 146.59432 0.418 0.677700
## Num_eventos_6_meses 3.95161 5.54243 0.713 0.479463
## CV_S052 -0.41452 0.42214 -0.982 0.331264
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.8 on 46 degrees of freedom
## Multiple R-squared: 0.9076, Adjusted R-squared: 0.8675
## F-statistic: 22.6 on 20 and 46 DF, p-value: < 2.2e-16