library(tidyverse) # for graphing and data cleaning
library(tidymodels) # for modeling
library(stacks) # for stacking models
library(naniar) # for analyzing missing values
library(vip) # for variable importance plots
library(usemodels) # for tidymodels suggestions
library(xgboost) # for boosting - need to install, don't need to load
library(doParallel) # for parallel processing
library(lubridate) # for dates
library(moderndive) # for King County housing data
library(patchwork) # for combining plots nicely
library(rmarkdown) # for paged tables
theme_set(theme_minimal()) # Lisa's favorite theme
Lasso Model
pollution <- read.csv("data_standardized.csv")
pollution %>%
select(where(is.numeric)) %>%
pivot_longer(cols = everything(),
names_to = "variable",
values_to = "value") %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 30) +
facet_wrap(vars(variable),
scales = "free",
nrow = 5)
pollution %>%
select(Alzheimer.s.disease) %>%
ggplot(aes(x = Alzheimer.s.disease)) +
geom_histogram()
for(i in 1:ncol(pollution)) {
median <- median(pollution[,i],na.rm = TRUE)
for (j in 1:nrow(pollution)){
if (is.na(pollution[j, i])){
pollution[j, i] <- median
}
}
}
pollution_mod <- pollution %>%
mutate(log_alzheimers = Alzheimer.s.disease) %>%
filter(log_alzheimers != "-Inf") %>%
select(-Alzheimer.s.disease) %>%
mutate(across(where(is.character), as.factor)) %>%
add_n_miss() %>%
filter(n_miss_all == 0) %>%
select(-n_miss_all)
set.seed(456)
pollution_split <- initial_split(pollution_mod, prop = .75)
pollution_training <- training(pollution_split)
pollution_testing <- testing(pollution_split)
pollution_recipe <- recipe(log_alzheimers ~ ., data = pollution_training) %>%
step_rm(County,
asthma_er_avg,
Year,
All.causes..total.,
# Alzheimer.s.disease,
Malignant.neoplasms,
Chronic.lower.respiratory.diseases,
Diabetes.mellitus,
Assault..homicide.,
Diseases.of.heart,
Essential.hypertension.and.hypertensive.renal.disease,
Accidents..unintentional.injuries.,
Chronic.liver.disease.and.cirrhosis,
Nephritis..nephrotic.syndrome.and.nephrosis,
Parkinson.s.disease,
Influenza.and.pneumonia,
Cerebrovascular.diseases,
Intentional.self.harm..suicide.,
FIPS,
cancer_incidence_rate,
asthma_deaths,
Bladder_Surgery_Ct,
Brain_Surgery_Ct,
Breast_Surgery_Ct,
Colon_Surgery_Ct,
Esophagus_Surgery_Ct,
Liver_Surgery_Ct,
Lung_Surgery_Ct,
Pancreas_Surgery_Ct,
Prostate_Surgery_Ct,
Rectum_Surgery_Ct,
Stomach_Surgery_Ct) %>%
step_normalize(all_predictors(),
-all_nominal()) %>%
step_dummy(all_nominal(),
-all_outcomes())
pollution_recipe %>%
prep(pollution_training) %>%
juice()
## # A tibble: 114 × 33
## annual_mean_pm25 annual_mean_ozone annual_mean_pb annual_mean_pm10
## <dbl> <dbl> <dbl> <dbl>
## 1 0.769 1.91 -0.438 1.20
## 2 -0.791 -0.0862 -0.176 -0.426
## 3 0.0376 -0.153 -0.176 0.136
## 4 0.130 -0.118 -0.176 0.0213
## 5 -0.851 0.0999 -0.969 -0.949
## 6 -0.625 -0.107 -0.860 -0.442
## 7 0.861 -0.0905 -0.176 -0.150
## 8 1.70 0.645 -0.176 -1.08
## 9 -1.25 -1.03 -0.176 -0.150
## 10 3.09 -0.0905 -0.176 -0.150
## # … with 104 more rows, and 29 more variables: annual_mean_co <dbl>,
## # annual_mean_no2 <dbl>, annual_mean_so2 <dbl>, hpi2score <dbl>,
## # economic <dbl>, education <dbl>, housing <dbl>, healthcareaccess <dbl>,
## # neighborhood <dbl>, pollution <dbl>, transportation <dbl>, social <dbl>,
## # insured <dbl>, uncrowded <dbl>, homeownership <dbl>, automobile <dbl>,
## # commute <dbl>, inpreschool <dbl>, inhighschool <dbl>, bachelorsed <dbl>,
## # employed <dbl>, abovepoverty <dbl>, income <dbl>, tree_canopy <dbl>, …
pollution_lasso_mod <-
linear_reg(mixture = 1) %>%
set_engine("glmnet") %>%
set_args(penalty = tune()) %>%
set_mode("regression")
pollution_lasso_wf <-
workflow() %>%
add_recipe(pollution_recipe) %>%
add_model(pollution_lasso_mod)
penalty_grid <- grid_regular(penalty(),
levels = 20)
penalty_grid
## # A tibble: 20 × 1
## penalty
## <dbl>
## 1 1 e-10
## 2 3.36e-10
## 3 1.13e- 9
## 4 3.79e- 9
## 5 1.27e- 8
## 6 4.28e- 8
## 7 1.44e- 7
## 8 4.83e- 7
## 9 1.62e- 6
## 10 5.46e- 6
## 11 1.83e- 5
## 12 6.16e- 5
## 13 2.07e- 4
## 14 6.95e- 4
## 15 2.34e- 3
## 16 7.85e- 3
## 17 2.64e- 2
## 18 8.86e- 2
## 19 2.98e- 1
## 20 1 e+ 0
ctrl_grid <- control_stack_grid()
set.seed(456)
pollution_cv <- vfold_cv(pollution_training, v = 5)
pollution_lasso_tune <-
pollution_lasso_wf %>%
tune_grid(
resamples = pollution_cv,
grid = penalty_grid,
control = ctrl_grid
)
pollution_lasso_tune
## # Tuning results
## # 5-fold cross-validation
## # A tibble: 5 × 5
## splits id .metrics .notes .predictions
## <list> <chr> <list> <list> <list>
## 1 <split [91/23]> Fold1 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [460 × 5]>
## 2 <split [91/23]> Fold2 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [460 × 5]>
## 3 <split [91/23]> Fold3 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [460 × 5]>
## 4 <split [91/23]> Fold4 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [460 × 5]>
## 5 <split [92/22]> Fold5 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [440 × 5]>
pollution_lasso_tune %>%
select(id, .metrics) %>%
unnest(.metrics) %>%
filter(.metric == "rmse")
## # A tibble: 100 × 6
## id penalty .metric .estimator .estimate .config
## <chr> <dbl> <chr> <chr> <dbl> <chr>
## 1 Fold1 1 e-10 rmse standard 0.000124 Preprocessor1_Model01
## 2 Fold1 3.36e-10 rmse standard 0.000124 Preprocessor1_Model02
## 3 Fold1 1.13e- 9 rmse standard 0.000124 Preprocessor1_Model03
## 4 Fold1 3.79e- 9 rmse standard 0.000124 Preprocessor1_Model04
## 5 Fold1 1.27e- 8 rmse standard 0.000124 Preprocessor1_Model05
## 6 Fold1 4.28e- 8 rmse standard 0.000124 Preprocessor1_Model06
## 7 Fold1 1.44e- 7 rmse standard 0.000124 Preprocessor1_Model07
## 8 Fold1 4.83e- 7 rmse standard 0.000125 Preprocessor1_Model08
## 9 Fold1 1.62e- 6 rmse standard 0.000130 Preprocessor1_Model09
## 10 Fold1 5.46e- 6 rmse standard 0.000145 Preprocessor1_Model10
## # … with 90 more rows
pollution_lasso_tune %>%
collect_metrics() %>%
filter(.metric == "rmse") %>%
ggplot(aes(x = penalty, y = mean)) +
geom_point() +
geom_line() +
scale_x_log10(
breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10",scales::math_format(10^.x))) +
labs(x = "penalty", y = "rmse")
pollution_lasso_tune %>%
show_best(metric = "rmse")
## # A tibble: 5 × 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.00000162 rmse standard 0.000120 5 0.00000604 Preprocessor1_Model…
## 2 0.000000483 rmse standard 0.000123 5 0.00000371 Preprocessor1_Model…
## 3 0.00000546 rmse standard 0.000124 5 0.00000891 Preprocessor1_Model…
## 4 0.000000144 rmse standard 0.000127 5 0.00000444 Preprocessor1_Model…
## 5 0.0000000428 rmse standard 0.000128 5 0.00000535 Preprocessor1_Model…
best_param <- pollution_lasso_tune %>%
select_best(metric = "rmse")
best_param
## # A tibble: 1 × 2
## penalty .config
## <dbl> <chr>
## 1 0.00000162 Preprocessor1_Model09
one_se_param <- pollution_lasso_tune %>%
select_by_one_std_err(metric = "rmse", desc(penalty))
one_se_param
## # A tibble: 1 × 9
## penalty .metric .estimator mean n std_err .config .best .bound
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr> <dbl> <dbl>
## 1 0.00000546 rmse standard 0.000124 5 8.91e-6 Preproc… 1.20e-4 1.26e-4
pollution_lasso_final_wf <- pollution_lasso_wf %>%
finalize_workflow(one_se_param)
pollution_lasso_final_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_rm()
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
##
## Main Arguments:
## penalty = 5.45559478116851e-06
## mixture = 1
##
## Computational engine: glmnet
pollution_lasso_final_mod <- pollution_lasso_final_wf %>%
fit(data = pollution_training)
pollution_lasso_final_mod %>%
pull_workflow_fit() %>%
tidy()
## # A tibble: 33 × 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) 0.000446 0.00000546
## 2 annual_mean_pm25 0.00000324 0.00000546
## 3 annual_mean_ozone 0.000000405 0.00000546
## 4 annual_mean_pb -0.0000543 0.00000546
## 5 annual_mean_pm10 -0.000000167 0.00000546
## 6 annual_mean_co 0 0.00000546
## 7 annual_mean_no2 -0.000000644 0.00000546
## 8 annual_mean_so2 0 0.00000546
## 9 hpi2score 0 0.00000546
## 10 economic 0 0.00000546
## # … with 23 more rows
pollution_lasso_test <- pollution_lasso_final_wf %>%
last_fit(pollution_split)
# Metrics for model applied to test data
pollution_lasso_test %>%
collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.000136 Preprocessor1_Model1
## 2 rsq standard 0.216 Preprocessor1_Model1
Xgboost model
use_xgboost(log_alzheimers ~ ., data = pollution_training)
## xgboost_recipe <-
## recipe(formula = log_alzheimers ~ ., data = pollution_training) %>%
## step_novel(all_nominal(), -all_outcomes()) %>%
## step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("regression") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(90624)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
boost_recipe <-
recipe(formula = log_alzheimers ~ ., data = pollution_training) %>%
step_rm(County,
Year,
All.causes..total.,
#Alzheimer.s.disease,
Malignant.neoplasms,
Chronic.lower.respiratory.diseases,
Diabetes.mellitus,
Diseases.of.heart,
Assault..homicide.,
Essential.hypertension.and.hypertensive.renal.disease,
Accidents..unintentional.injuries.,
Chronic.liver.disease.and.cirrhosis,
Nephritis..nephrotic.syndrome.and.nephrosis,
Parkinson.s.disease,
Influenza.and.pneumonia,
Cerebrovascular.diseases,
Intentional.self.harm..suicide.,
FIPS,
cancer_incidence_rate,
asthma_deaths,
Bladder_Surgery_Ct,
Brain_Surgery_Ct,
Breast_Surgery_Ct,
Colon_Surgery_Ct,
Esophagus_Surgery_Ct,
Liver_Surgery_Ct,
Lung_Surgery_Ct,
Pancreas_Surgery_Ct,
Prostate_Surgery_Ct,
Rectum_Surgery_Ct,
asthma_er_avg,
Stomach_Surgery_Ct) %>%
step_novel(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors(),
-all_nominal())
boost_recipe %>%
prep() %>%
juice()
## # A tibble: 114 × 33
## annual_mean_pm25 annual_mean_ozone annual_mean_pb annual_mean_pm10
## <dbl> <dbl> <dbl> <dbl>
## 1 0.769 1.91 -0.438 1.20
## 2 -0.791 -0.0862 -0.176 -0.426
## 3 0.0376 -0.153 -0.176 0.136
## 4 0.130 -0.118 -0.176 0.0213
## 5 -0.851 0.0999 -0.969 -0.949
## 6 -0.625 -0.107 -0.860 -0.442
## 7 0.861 -0.0905 -0.176 -0.150
## 8 1.70 0.645 -0.176 -1.08
## 9 -1.25 -1.03 -0.176 -0.150
## 10 3.09 -0.0905 -0.176 -0.150
## # … with 104 more rows, and 29 more variables: annual_mean_co <dbl>,
## # annual_mean_no2 <dbl>, annual_mean_so2 <dbl>, hpi2score <dbl>,
## # economic <dbl>, education <dbl>, housing <dbl>, healthcareaccess <dbl>,
## # neighborhood <dbl>, pollution <dbl>, transportation <dbl>, social <dbl>,
## # insured <dbl>, uncrowded <dbl>, homeownership <dbl>, automobile <dbl>,
## # commute <dbl>, inpreschool <dbl>, inhighschool <dbl>, bachelorsed <dbl>,
## # employed <dbl>, abovepoverty <dbl>, income <dbl>, tree_canopy <dbl>, …
boost_spec <- boost_tree(
trees = 1000, # number of trees, T in the equations above
tree_depth = 2, # max number of splits in the tree
min_n = 5, # min points required for node to be further split
loss_reduction = 10^-5, # when to stop - smaller = more since it only has to get a little bit better
sample_size = 1, # proportion of training data to use
learn_rate = tune(), # lambda from the equations above
stop_iter = 50 # number of iterations w/o improvement b4 stopping
) %>%
set_engine("xgboost", colsample_bytree = 1) %>%
set_mode("regression")
boost_grid <- grid_regular(learn_rate(),
levels = 10)
boost_grid
## # A tibble: 10 × 1
## learn_rate
## <dbl>
## 1 0.0000000001
## 2 0.000000001
## 3 0.00000001
## 4 0.0000001
## 5 0.000001
## 6 0.00001
## 7 0.0001
## 8 0.001
## 9 0.01
## 10 0.1
boost_wf <- workflow() %>%
add_recipe(boost_recipe) %>%
add_model(boost_spec)
set.seed(456)
val_split <- validation_split(pollution_training,
prop = .6)
val_split
## # Validation Set Split (0.6/0.4)
## # A tibble: 1 × 2
## splits id
## <list> <chr>
## 1 <split [68/46]> validation
set.seed(456)
registerDoParallel()
boost_tune <- boost_wf %>%
tune_grid(
# resamples = val_split,
resamples = pollution_cv,
grid = boost_grid,
control = ctrl_grid
)
collect_metrics(boost_tune)
## # A tibble: 20 × 7
## learn_rate .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0000000001 rmse standard 0.500 5 0.0000116 Preprocessor1_M…
## 2 0.0000000001 rsq standard NaN 0 NA Preprocessor1_M…
## 3 0.000000001 rmse standard 0.500 5 0.0000116 Preprocessor1_M…
## 4 0.000000001 rsq standard NaN 0 NA Preprocessor1_M…
## 5 0.00000001 rmse standard 0.500 5 0.0000116 Preprocessor1_M…
## 6 0.00000001 rsq standard NaN 0 NA Preprocessor1_M…
## 7 0.0000001 rmse standard 0.499 5 0.0000115 Preprocessor1_M…
## 8 0.0000001 rsq standard NaN 0 NA Preprocessor1_M…
## 9 0.000001 rmse standard 0.499 5 0.0000115 Preprocessor1_M…
## 10 0.000001 rsq standard NaN 0 NA Preprocessor1_M…
## 11 0.00001 rmse standard 0.495 5 0.0000116 Preprocessor1_M…
## 12 0.00001 rsq standard NaN 0 NA Preprocessor1_M…
## 13 0.0001 rmse standard 0.453 5 0.0000118 Preprocessor1_M…
## 14 0.0001 rsq standard NaN 0 NA Preprocessor1_M…
## 15 0.001 rmse standard 0.186 5 0.0000139 Preprocessor1_M…
## 16 0.001 rsq standard NaN 0 NA Preprocessor1_M…
## 17 0.01 rmse standard 0.000164 5 0.00000790 Preprocessor1_M…
## 18 0.01 rsq standard NaN 0 NA Preprocessor1_M…
## 19 0.1 rmse standard 0.000162 5 0.00000936 Preprocessor1_M…
## 20 0.1 rsq standard NaN 0 NA Preprocessor1_M…
collect_metrics(boost_tune) %>%
filter(.metric == "rmse") %>%
ggplot(aes(x = learn_rate, y = mean)) +
geom_point() +
geom_line() +
scale_x_log10() +
labs(y = "rmse") +
theme_minimal()
best_lr <- select_best(boost_tune, "rmse")
best_lr
## # A tibble: 1 × 2
## learn_rate .config
## <dbl> <chr>
## 1 0.1 Preprocessor1_Model10
# finalize workflow
final_boost_wf <- finalize_workflow(
boost_wf,
best_lr
)
# fit final
final_boost <- final_boost_wf %>%
fit(data = pollution_training)
final_boost %>%
pull_workflow_fit() %>%
vip(geom = "col")
# Use model on test data
test_preds <- pollution_testing %>%
bind_cols(predict(final_boost, new_data = pollution_testing))
# Compute test rmse
test_preds %>%
summarize(rmse = sqrt(mean((log_alzheimers - .pred)^2))) %>%
pull(rmse)
## [1] 0.0001609894
# Graph results
test_preds %>%
ggplot(aes(x = log_alzheimers,
y = .pred)) +
geom_point(alpha = .5,
size = .5) +
geom_smooth(se = FALSE) +
geom_abline(slope = 1,
intercept = 0,
color = "darkred") +
labs(x = "Actual log(asthma)",
y = "Predicted log(asthma)")
Random Forest Model
pollution_rfrecipe <- recipe(log_alzheimers ~ ., data = pollution_training) %>%
step_rm(County,
Year,
All.causes..total.,
# Alzheimer.s.disease,
Malignant.neoplasms,
Chronic.lower.respiratory.diseases,
Diabetes.mellitus,
Assault..homicide.,
Diseases.of.heart,
asthma_er_avg,
Essential.hypertension.and.hypertensive.renal.disease,
Accidents..unintentional.injuries.,
Chronic.liver.disease.and.cirrhosis,
Nephritis..nephrotic.syndrome.and.nephrosis,
Parkinson.s.disease,
Influenza.and.pneumonia,
Cerebrovascular.diseases,
Intentional.self.harm..suicide.,
FIPS,
cancer_incidence_rate,
asthma_deaths,
Bladder_Surgery_Ct,
Brain_Surgery_Ct,
Breast_Surgery_Ct,
Colon_Surgery_Ct,
Esophagus_Surgery_Ct,
Liver_Surgery_Ct,
Lung_Surgery_Ct,
Pancreas_Surgery_Ct,
Prostate_Surgery_Ct,
Rectum_Surgery_Ct,
Stomach_Surgery_Ct) %>%
step_normalize(all_predictors(),
-all_nominal()) %>%
step_dummy(all_nominal(),
-all_outcomes())
pollution_rfrecipe %>%
prep(pollution_training) %>%
juice()
## # A tibble: 114 × 33
## annual_mean_pm25 annual_mean_ozone annual_mean_pb annual_mean_pm10
## <dbl> <dbl> <dbl> <dbl>
## 1 0.769 1.91 -0.438 1.20
## 2 -0.791 -0.0862 -0.176 -0.426
## 3 0.0376 -0.153 -0.176 0.136
## 4 0.130 -0.118 -0.176 0.0213
## 5 -0.851 0.0999 -0.969 -0.949
## 6 -0.625 -0.107 -0.860 -0.442
## 7 0.861 -0.0905 -0.176 -0.150
## 8 1.70 0.645 -0.176 -1.08
## 9 -1.25 -1.03 -0.176 -0.150
## 10 3.09 -0.0905 -0.176 -0.150
## # … with 104 more rows, and 29 more variables: annual_mean_co <dbl>,
## # annual_mean_no2 <dbl>, annual_mean_so2 <dbl>, hpi2score <dbl>,
## # economic <dbl>, education <dbl>, housing <dbl>, healthcareaccess <dbl>,
## # neighborhood <dbl>, pollution <dbl>, transportation <dbl>, social <dbl>,
## # insured <dbl>, uncrowded <dbl>, homeownership <dbl>, automobile <dbl>,
## # commute <dbl>, inpreschool <dbl>, inhighschool <dbl>, bachelorsed <dbl>,
## # employed <dbl>, abovepoverty <dbl>, income <dbl>, tree_canopy <dbl>, …
ranger_pollution <-
rand_forest(mtry = tune(),
min_n = tune(),
trees = 200) %>%
set_mode("regression") %>%
set_engine("ranger")
pollution_rfworkflow <-
workflow() %>%
add_recipe(pollution_rfrecipe) %>%
add_model(ranger_pollution)
# Set up penalty grid model
rf_penalty_grid <- grid_regular(finalize(mtry(),
pollution_training %>%
select(-log_alzheimers)),
min_n(),
levels = 3)
set.seed(456)
pollution_cv <- vfold_cv(pollution_training, v = 5)
# Tune model
pollution_rfTUNE <-
pollution_rfworkflow %>%
tune_grid(
resamples = pollution_cv,
grid = rf_penalty_grid,
control = ctrl_grid
)
best_rmse <-
pollution_rfTUNE %>%
select_best(metric = "rmse")
best_rmse
## # A tibble: 1 × 3
## mtry min_n .config
## <int> <int> <chr>
## 1 1 2 Preprocessor1_Model1
pol_rfFinal <- pollution_rfworkflow %>%
finalize_workflow(best_rmse) %>%
fit(data = pollution_training)
# OOB error (MSE)
pol_rfFinal$fit$fit$fit$prediction.error
## [1] 9.722797e-09
# OOB RMSE
sqrt(pol_rfFinal$fit$fit$fit$prediction.error)
## [1] 9.860424e-05
set.seed(456)
metric <- metric_set(rmse)
ctrl_res <- control_stack_resamples()
pollution_rfcv <- pollution_rfworkflow %>%
fit_resamples(pollution_cv,
metrics = metric,
control = ctrl_res)
collect_metrics(pollution_rfTUNE)
## # A tibble: 18 × 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 1 2 rmse standard 0.0000973 5 0.00000979 Preprocessor1_Mode…
## 2 1 2 rsq standard 0.725 5 0.0766 Preprocessor1_Mode…
## 3 32 2 rmse standard 0.000103 5 0.0000133 Preprocessor1_Mode…
## 4 32 2 rsq standard 0.626 5 0.115 Preprocessor1_Mode…
## 5 63 2 rmse standard 0.0000996 5 0.0000133 Preprocessor1_Mode…
## 6 63 2 rsq standard 0.651 5 0.116 Preprocessor1_Mode…
## 7 1 21 rmse standard 0.000123 5 0.0000102 Preprocessor1_Mode…
## 8 1 21 rsq standard 0.615 5 0.0784 Preprocessor1_Mode…
## 9 32 21 rmse standard 0.000113 5 0.0000147 Preprocessor1_Mode…
## 10 32 21 rsq standard 0.560 5 0.116 Preprocessor1_Mode…
## 11 63 21 rmse standard 0.000115 5 0.0000146 Preprocessor1_Mode…
## 12 63 21 rsq standard 0.551 5 0.119 Preprocessor1_Mode…
## 13 1 40 rmse standard 0.000136 5 0.0000105 Preprocessor1_Mode…
## 14 1 40 rsq standard 0.486 5 0.0754 Preprocessor1_Mode…
## 15 32 40 rmse standard 0.000131 5 0.0000145 Preprocessor1_Mode…
## 16 32 40 rsq standard 0.406 5 0.115 Preprocessor1_Mode…
## 17 63 40 rmse standard 0.000130 5 0.0000132 Preprocessor1_Mode…
## 18 63 40 rsq standard 0.422 5 0.105 Preprocessor1_Mode…
Stacking
pollution_stack <-
stacks() %>%
add_candidates(boost_tune) %>%
add_candidates(pollution_lasso_tune) %>%
add_candidates(pollution_rfTUNE)
as_tibble(pollution_stack)
## # A tibble: 114 × 26
## log_alzheimers boost_tune_1_01 boost_tune_1_04 boost_tune_1_05
## <dbl> <dbl> <dbl> <dbl>
## 1 0.000394 0.5 0.500 0.499
## 2 0.000458 0.5 0.500 0.499
## 3 0.000466 0.5 0.500 0.499
## 4 0.000787 0.5 0.500 0.499
## 5 0.000698 0.5 0.500 0.499
## 6 0.000489 0.5 0.500 0.499
## 7 0.000429 0.5 0.500 0.499
## 8 0.000405 0.5 0.500 0.499
## 9 0.000632 0.5 0.500 0.499
## 10 0.000429 0.5 0.500 0.499
## # … with 104 more rows, and 22 more variables: boost_tune_1_06 <dbl>,
## # boost_tune_1_07 <dbl>, boost_tune_1_08 <dbl>, boost_tune_1_09 <dbl>,
## # boost_tune_1_10 <dbl>, pollution_lasso_tune_1_01 <dbl>,
## # pollution_lasso_tune_1_06 <dbl>, pollution_lasso_tune_1_07 <dbl>,
## # pollution_lasso_tune_1_08 <dbl>, pollution_lasso_tune_1_09 <dbl>,
## # pollution_lasso_tune_1_10 <dbl>, pollution_lasso_tune_1_11 <dbl>,
## # pollution_lasso_tune_1_12 <dbl>, pollution_rfTUNE_1_1 <dbl>, …
set.seed(456)
pollution_blend <-
pollution_stack %>%
blend_predictions()
pollution_blend
## # A tibble: 4 × 3
## member type weight
## <chr> <chr> <dbl>
## 1 boost_tune_1_05 boost_tree 2.56
## 2 pollution_rfTUNE_1_1 rand_forest 1.37
## 3 boost_tune_1_07 boost_tree 1.36
## 4 pollution_lasso_tune_1_09 linear_reg 0.00602
pollution_blend$metrics %>%
filter(.metric == "rmse")
## # A tibble: 6 × 8
## penalty mixture .metric .estimator mean n std_err .config
## <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.000001 1 rmse standard 0.0000930 25 0.00000253 Preprocessor1_…
## 2 0.00001 1 rmse standard 0.0000918 25 0.00000227 Preprocessor1_…
## 3 0.0001 1 rmse standard 0.000132 25 0.00000428 Preprocessor1_…
## 4 0.001 1 rmse standard 0.000157 25 0.00000370 Preprocessor1_…
## 5 0.01 1 rmse standard 0.000157 25 0.00000370 Preprocessor1_…
## 6 0.1 1 rmse standard 0.000157 25 0.00000370 Preprocessor1_…
autoplot(pollution_blend)
autoplot(pollution_blend, type = "members")
autoplot(pollution_blend, type = "weights")
pollution_final_stack <- pollution_blend %>%
fit_members()
pollution_final_stack
## # A tibble: 4 × 3
## member type weight
## <chr> <chr> <dbl>
## 1 boost_tune_1_05 boost_tree 2.56
## 2 pollution_rfTUNE_1_1 rand_forest 1.37
## 3 boost_tune_1_07 boost_tree 1.36
## 4 pollution_lasso_tune_1_09 linear_reg 0.00602
pollution_final_stack %>%
predict(new_data = pollution_testing) %>%
bind_cols(pollution_testing) %>%
ggplot(aes(x = log_alzheimers,
y = .pred)) +
geom_point(alpha = .5,
size = .5) +
geom_smooth(se = FALSE) +
geom_abline(slope = 1,
intercept = 0,
color = "darkred") +
labs(x = "Actual log(alzheimer's disease)",
y = "Predicted log(alzheimer's disease)")
library(Metrics)
data_rmse <- pollution_final_stack %>%
predict(new_data = pollution_testing) %>%
bind_cols(pollution_testing)
rmse(data_rmse$log_alzheimers, data_rmse$.pred)
## [1] 0.0001533423
write_rds(pollution_final_stack, "alzheimers_model.rds")