library(tidyverse)         # for graphing and data cleaning
library(tidymodels)        # for modeling
library(stacks)            # for stacking models
library(naniar)            # for analyzing missing values
library(vip)               # for variable importance plots
library(usemodels)         # for tidymodels suggestions
library(xgboost)           # for boosting - need to install, don't need to load
library(doParallel)        # for parallel processing
library(lubridate)         # for dates
library(moderndive)        # for King County housing data
library(patchwork)         # for combining plots nicely
library(rmarkdown)         # for paged tables
theme_set(theme_minimal()) # Lisa's favorite theme

Lasso Model

pollution <- read.csv("data_standardized.csv")
pollution %>% 
  select(where(is.numeric)) %>% 
  pivot_longer(cols = everything(),
               names_to = "variable", 
               values_to = "value") %>% 
  ggplot(aes(x = value)) +
  geom_histogram(bins = 30) +
  facet_wrap(vars(variable), 
             scales = "free",
             nrow = 5)

pollution %>% 
  select(Alzheimer.s.disease) %>% 
  ggplot(aes(x = Alzheimer.s.disease)) +
  geom_histogram()

for(i in 1:ncol(pollution)) {
  median <- median(pollution[,i],na.rm = TRUE)
  for (j in 1:nrow(pollution)){
    if (is.na(pollution[j, i])){
      pollution[j, i] <- median
    }
  }
}
pollution_mod <- pollution %>%
  mutate(log_alzheimers = Alzheimer.s.disease) %>% 
  filter(log_alzheimers != "-Inf") %>% 
  select(-Alzheimer.s.disease) %>% 
  mutate(across(where(is.character), as.factor)) %>% 
  add_n_miss() %>% 
  filter(n_miss_all == 0) %>% 
  select(-n_miss_all)

set.seed(456)
pollution_split <- initial_split(pollution_mod, prop = .75)
pollution_training <- training(pollution_split)
pollution_testing <- testing(pollution_split)
pollution_recipe <- recipe(log_alzheimers ~ ., data = pollution_training) %>% 
  step_rm(County,
          asthma_er_avg,
          Year,
          All.causes..total.,
         # Alzheimer.s.disease,
          Malignant.neoplasms,
          Chronic.lower.respiratory.diseases,
          Diabetes.mellitus,
          Assault..homicide.,
          Diseases.of.heart,
          Essential.hypertension.and.hypertensive.renal.disease,
          Accidents..unintentional.injuries.,
          Chronic.liver.disease.and.cirrhosis,
          Nephritis..nephrotic.syndrome.and.nephrosis,
          Parkinson.s.disease,
          Influenza.and.pneumonia,
          Cerebrovascular.diseases,
          Intentional.self.harm..suicide.,
          FIPS,
          cancer_incidence_rate,
          asthma_deaths,
          Bladder_Surgery_Ct,
          Brain_Surgery_Ct,
          Breast_Surgery_Ct,
          Colon_Surgery_Ct,
          Esophagus_Surgery_Ct,
          Liver_Surgery_Ct,
          Lung_Surgery_Ct,
          Pancreas_Surgery_Ct,
          Prostate_Surgery_Ct,
          Rectum_Surgery_Ct,
          Stomach_Surgery_Ct) %>% 
  step_normalize(all_predictors(), 
                 -all_nominal()) %>% 
  step_dummy(all_nominal(), 
             -all_outcomes())
pollution_recipe %>% 
  prep(pollution_training) %>%
  juice()
## # A tibble: 114 × 33
##    annual_mean_pm25 annual_mean_ozone annual_mean_pb annual_mean_pm10
##               <dbl>             <dbl>          <dbl>            <dbl>
##  1           0.769             1.91           -0.438           1.20  
##  2          -0.791            -0.0862         -0.176          -0.426 
##  3           0.0376           -0.153          -0.176           0.136 
##  4           0.130            -0.118          -0.176           0.0213
##  5          -0.851             0.0999         -0.969          -0.949 
##  6          -0.625            -0.107          -0.860          -0.442 
##  7           0.861            -0.0905         -0.176          -0.150 
##  8           1.70              0.645          -0.176          -1.08  
##  9          -1.25             -1.03           -0.176          -0.150 
## 10           3.09             -0.0905         -0.176          -0.150 
## # … with 104 more rows, and 29 more variables: annual_mean_co <dbl>,
## #   annual_mean_no2 <dbl>, annual_mean_so2 <dbl>, hpi2score <dbl>,
## #   economic <dbl>, education <dbl>, housing <dbl>, healthcareaccess <dbl>,
## #   neighborhood <dbl>, pollution <dbl>, transportation <dbl>, social <dbl>,
## #   insured <dbl>, uncrowded <dbl>, homeownership <dbl>, automobile <dbl>,
## #   commute <dbl>, inpreschool <dbl>, inhighschool <dbl>, bachelorsed <dbl>,
## #   employed <dbl>, abovepoverty <dbl>, income <dbl>, tree_canopy <dbl>, …
pollution_lasso_mod <- 
  linear_reg(mixture = 1) %>% 
  set_engine("glmnet") %>% 
  set_args(penalty = tune()) %>% 
  set_mode("regression")
pollution_lasso_wf <- 
  workflow() %>% 
  add_recipe(pollution_recipe) %>% 
  add_model(pollution_lasso_mod)
penalty_grid <- grid_regular(penalty(),
                             levels = 20)
penalty_grid 
## # A tibble: 20 × 1
##     penalty
##       <dbl>
##  1 1   e-10
##  2 3.36e-10
##  3 1.13e- 9
##  4 3.79e- 9
##  5 1.27e- 8
##  6 4.28e- 8
##  7 1.44e- 7
##  8 4.83e- 7
##  9 1.62e- 6
## 10 5.46e- 6
## 11 1.83e- 5
## 12 6.16e- 5
## 13 2.07e- 4
## 14 6.95e- 4
## 15 2.34e- 3
## 16 7.85e- 3
## 17 2.64e- 2
## 18 8.86e- 2
## 19 2.98e- 1
## 20 1   e+ 0
ctrl_grid <- control_stack_grid()

set.seed(456)
pollution_cv <- vfold_cv(pollution_training, v = 5)

pollution_lasso_tune <- 
  pollution_lasso_wf %>% 
  tune_grid(
    resamples = pollution_cv,
    grid = penalty_grid,
    control = ctrl_grid
    )

pollution_lasso_tune
## # Tuning results
## # 5-fold cross-validation 
## # A tibble: 5 × 5
##   splits          id    .metrics          .notes           .predictions      
##   <list>          <chr> <list>            <list>           <list>            
## 1 <split [91/23]> Fold1 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [460 × 5]>
## 2 <split [91/23]> Fold2 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [460 × 5]>
## 3 <split [91/23]> Fold3 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [460 × 5]>
## 4 <split [91/23]> Fold4 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [460 × 5]>
## 5 <split [92/22]> Fold5 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [440 × 5]>
pollution_lasso_tune %>% 
  select(id, .metrics) %>% 
  unnest(.metrics) %>% 
  filter(.metric == "rmse")
## # A tibble: 100 × 6
##    id     penalty .metric .estimator .estimate .config              
##    <chr>    <dbl> <chr>   <chr>          <dbl> <chr>                
##  1 Fold1 1   e-10 rmse    standard    0.000124 Preprocessor1_Model01
##  2 Fold1 3.36e-10 rmse    standard    0.000124 Preprocessor1_Model02
##  3 Fold1 1.13e- 9 rmse    standard    0.000124 Preprocessor1_Model03
##  4 Fold1 3.79e- 9 rmse    standard    0.000124 Preprocessor1_Model04
##  5 Fold1 1.27e- 8 rmse    standard    0.000124 Preprocessor1_Model05
##  6 Fold1 4.28e- 8 rmse    standard    0.000124 Preprocessor1_Model06
##  7 Fold1 1.44e- 7 rmse    standard    0.000124 Preprocessor1_Model07
##  8 Fold1 4.83e- 7 rmse    standard    0.000125 Preprocessor1_Model08
##  9 Fold1 1.62e- 6 rmse    standard    0.000130 Preprocessor1_Model09
## 10 Fold1 5.46e- 6 rmse    standard    0.000145 Preprocessor1_Model10
## # … with 90 more rows
pollution_lasso_tune %>% 
  collect_metrics() %>% 
  filter(.metric == "rmse") %>% 
  ggplot(aes(x = penalty, y = mean)) +
  geom_point() +
  geom_line() +
  scale_x_log10(
   breaks = scales::trans_breaks("log10", function(x) 10^x),
   labels = scales::trans_format("log10",scales::math_format(10^.x))) +
  labs(x = "penalty", y = "rmse")

pollution_lasso_tune %>% 
  show_best(metric = "rmse")
## # A tibble: 5 × 7
##        penalty .metric .estimator     mean     n    std_err .config             
##          <dbl> <chr>   <chr>         <dbl> <int>      <dbl> <chr>               
## 1 0.00000162   rmse    standard   0.000120     5 0.00000604 Preprocessor1_Model…
## 2 0.000000483  rmse    standard   0.000123     5 0.00000371 Preprocessor1_Model…
## 3 0.00000546   rmse    standard   0.000124     5 0.00000891 Preprocessor1_Model…
## 4 0.000000144  rmse    standard   0.000127     5 0.00000444 Preprocessor1_Model…
## 5 0.0000000428 rmse    standard   0.000128     5 0.00000535 Preprocessor1_Model…
best_param <- pollution_lasso_tune %>% 
  select_best(metric = "rmse")
best_param
## # A tibble: 1 × 2
##      penalty .config              
##        <dbl> <chr>                
## 1 0.00000162 Preprocessor1_Model09
one_se_param <- pollution_lasso_tune %>% 
  select_by_one_std_err(metric = "rmse", desc(penalty))
one_se_param
## # A tibble: 1 × 9
##      penalty .metric .estimator     mean     n  std_err .config    .best  .bound
##        <dbl> <chr>   <chr>         <dbl> <int>    <dbl> <chr>      <dbl>   <dbl>
## 1 0.00000546 rmse    standard   0.000124     5  8.91e-6 Preproc… 1.20e-4 1.26e-4
pollution_lasso_final_wf <- pollution_lasso_wf %>% 
  finalize_workflow(one_se_param)
pollution_lasso_final_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
## 
## • step_rm()
## • step_normalize()
## • step_dummy()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
## 
## Main Arguments:
##   penalty = 5.45559478116851e-06
##   mixture = 1
## 
## Computational engine: glmnet
pollution_lasso_final_mod <- pollution_lasso_final_wf %>% 
  fit(data = pollution_training)

pollution_lasso_final_mod %>% 
  pull_workflow_fit() %>% 
  tidy() 
## # A tibble: 33 × 3
##    term                  estimate    penalty
##    <chr>                    <dbl>      <dbl>
##  1 (Intercept)        0.000446    0.00000546
##  2 annual_mean_pm25   0.00000324  0.00000546
##  3 annual_mean_ozone  0.000000405 0.00000546
##  4 annual_mean_pb    -0.0000543   0.00000546
##  5 annual_mean_pm10  -0.000000167 0.00000546
##  6 annual_mean_co     0           0.00000546
##  7 annual_mean_no2   -0.000000644 0.00000546
##  8 annual_mean_so2    0           0.00000546
##  9 hpi2score          0           0.00000546
## 10 economic           0           0.00000546
## # … with 23 more rows
pollution_lasso_test <- pollution_lasso_final_wf %>% 
  last_fit(pollution_split)

# Metrics for model applied to test data
pollution_lasso_test %>% 
  collect_metrics()
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard    0.000136 Preprocessor1_Model1
## 2 rsq     standard    0.216    Preprocessor1_Model1

Xgboost model

use_xgboost(log_alzheimers ~ ., data = pollution_training)
## xgboost_recipe <- 
##   recipe(formula = log_alzheimers ~ ., data = pollution_training) %>% 
##   step_novel(all_nominal(), -all_outcomes()) %>% 
##   step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("regression") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(90624)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
boost_recipe <- 
  recipe(formula = log_alzheimers ~ ., data = pollution_training) %>% 
  step_rm(County,
          Year,
          All.causes..total.,
          #Alzheimer.s.disease,
          Malignant.neoplasms,
          Chronic.lower.respiratory.diseases,
          Diabetes.mellitus,
          Diseases.of.heart,
          Assault..homicide.,
          Essential.hypertension.and.hypertensive.renal.disease,
          Accidents..unintentional.injuries.,
          Chronic.liver.disease.and.cirrhosis,
          Nephritis..nephrotic.syndrome.and.nephrosis,
          Parkinson.s.disease,
          Influenza.and.pneumonia,
          Cerebrovascular.diseases,
          Intentional.self.harm..suicide.,
          FIPS,
          cancer_incidence_rate,
          asthma_deaths,
          Bladder_Surgery_Ct,
          Brain_Surgery_Ct,
          Breast_Surgery_Ct,
          Colon_Surgery_Ct,
          Esophagus_Surgery_Ct,
          Liver_Surgery_Ct,
          Lung_Surgery_Ct,
          Pancreas_Surgery_Ct,
          Prostate_Surgery_Ct,
          Rectum_Surgery_Ct,
          asthma_er_avg,
          Stomach_Surgery_Ct) %>% 
  step_novel(all_nominal_predictors()) %>% 
  step_dummy(all_nominal_predictors(), one_hot = TRUE)  %>% 
  step_zv(all_predictors()) %>% 
  step_normalize(all_predictors(), 
                 -all_nominal())
boost_recipe %>% 
  prep() %>% 
  juice() 
## # A tibble: 114 × 33
##    annual_mean_pm25 annual_mean_ozone annual_mean_pb annual_mean_pm10
##               <dbl>             <dbl>          <dbl>            <dbl>
##  1           0.769             1.91           -0.438           1.20  
##  2          -0.791            -0.0862         -0.176          -0.426 
##  3           0.0376           -0.153          -0.176           0.136 
##  4           0.130            -0.118          -0.176           0.0213
##  5          -0.851             0.0999         -0.969          -0.949 
##  6          -0.625            -0.107          -0.860          -0.442 
##  7           0.861            -0.0905         -0.176          -0.150 
##  8           1.70              0.645          -0.176          -1.08  
##  9          -1.25             -1.03           -0.176          -0.150 
## 10           3.09             -0.0905         -0.176          -0.150 
## # … with 104 more rows, and 29 more variables: annual_mean_co <dbl>,
## #   annual_mean_no2 <dbl>, annual_mean_so2 <dbl>, hpi2score <dbl>,
## #   economic <dbl>, education <dbl>, housing <dbl>, healthcareaccess <dbl>,
## #   neighborhood <dbl>, pollution <dbl>, transportation <dbl>, social <dbl>,
## #   insured <dbl>, uncrowded <dbl>, homeownership <dbl>, automobile <dbl>,
## #   commute <dbl>, inpreschool <dbl>, inhighschool <dbl>, bachelorsed <dbl>,
## #   employed <dbl>, abovepoverty <dbl>, income <dbl>, tree_canopy <dbl>, …
boost_spec <- boost_tree(
  trees = 1000,            # number of trees, T in the equations above
  tree_depth = 2,          # max number of splits in the tree
  min_n = 5,               # min points required for node to be further split
  loss_reduction = 10^-5,  # when to stop - smaller = more since it only has to get a little bit better 
  sample_size = 1,         # proportion of training data to use
  learn_rate = tune(),     # lambda from the equations above
  stop_iter = 50           # number of iterations w/o improvement b4 stopping
) %>% 
  set_engine("xgboost", colsample_bytree = 1) %>% 
  set_mode("regression")
boost_grid <- grid_regular(learn_rate(),
                           levels = 10)
boost_grid
## # A tibble: 10 × 1
##      learn_rate
##           <dbl>
##  1 0.0000000001
##  2 0.000000001 
##  3 0.00000001  
##  4 0.0000001   
##  5 0.000001    
##  6 0.00001     
##  7 0.0001      
##  8 0.001       
##  9 0.01        
## 10 0.1
boost_wf <- workflow() %>% 
  add_recipe(boost_recipe) %>%
  add_model(boost_spec) 
set.seed(456)
val_split <- validation_split(pollution_training, 
                              prop = .6)
val_split
## # Validation Set Split (0.6/0.4)  
## # A tibble: 1 × 2
##   splits          id        
##   <list>          <chr>     
## 1 <split [68/46]> validation
set.seed(456)
registerDoParallel()

boost_tune <- boost_wf %>% 
  tune_grid(
  # resamples = val_split,
  resamples = pollution_cv,
  grid = boost_grid,
  control = ctrl_grid
)
collect_metrics(boost_tune)
## # A tibble: 20 × 7
##      learn_rate .metric .estimator       mean     n     std_err .config         
##           <dbl> <chr>   <chr>           <dbl> <int>       <dbl> <chr>           
##  1 0.0000000001 rmse    standard     0.500        5  0.0000116  Preprocessor1_M…
##  2 0.0000000001 rsq     standard   NaN            0 NA          Preprocessor1_M…
##  3 0.000000001  rmse    standard     0.500        5  0.0000116  Preprocessor1_M…
##  4 0.000000001  rsq     standard   NaN            0 NA          Preprocessor1_M…
##  5 0.00000001   rmse    standard     0.500        5  0.0000116  Preprocessor1_M…
##  6 0.00000001   rsq     standard   NaN            0 NA          Preprocessor1_M…
##  7 0.0000001    rmse    standard     0.499        5  0.0000115  Preprocessor1_M…
##  8 0.0000001    rsq     standard   NaN            0 NA          Preprocessor1_M…
##  9 0.000001     rmse    standard     0.499        5  0.0000115  Preprocessor1_M…
## 10 0.000001     rsq     standard   NaN            0 NA          Preprocessor1_M…
## 11 0.00001      rmse    standard     0.495        5  0.0000116  Preprocessor1_M…
## 12 0.00001      rsq     standard   NaN            0 NA          Preprocessor1_M…
## 13 0.0001       rmse    standard     0.453        5  0.0000118  Preprocessor1_M…
## 14 0.0001       rsq     standard   NaN            0 NA          Preprocessor1_M…
## 15 0.001        rmse    standard     0.186        5  0.0000139  Preprocessor1_M…
## 16 0.001        rsq     standard   NaN            0 NA          Preprocessor1_M…
## 17 0.01         rmse    standard     0.000164     5  0.00000790 Preprocessor1_M…
## 18 0.01         rsq     standard   NaN            0 NA          Preprocessor1_M…
## 19 0.1          rmse    standard     0.000162     5  0.00000936 Preprocessor1_M…
## 20 0.1          rsq     standard   NaN            0 NA          Preprocessor1_M…
collect_metrics(boost_tune) %>% 
  filter(.metric == "rmse") %>% 
  ggplot(aes(x = learn_rate, y = mean)) +
  geom_point() +
  geom_line() +
  scale_x_log10() +
  labs(y = "rmse") +
  theme_minimal()

best_lr <- select_best(boost_tune, "rmse")
best_lr
## # A tibble: 1 × 2
##   learn_rate .config              
##        <dbl> <chr>                
## 1        0.1 Preprocessor1_Model10
# finalize workflow
final_boost_wf <- finalize_workflow(
  boost_wf,
  best_lr
)

# fit final
final_boost <- final_boost_wf %>% 
  fit(data = pollution_training)
final_boost %>% 
  pull_workflow_fit() %>%
  vip(geom = "col")
# Use model on test data
test_preds <- pollution_testing %>% 
  bind_cols(predict(final_boost, new_data = pollution_testing)) 

# Compute test rmse
test_preds %>% 
  summarize(rmse = sqrt(mean((log_alzheimers - .pred)^2))) %>% 
  pull(rmse)
## [1] 0.0001609894
# Graph results
test_preds %>% 
  ggplot(aes(x = log_alzheimers,
             y = .pred)) +
  geom_point(alpha = .5, 
             size = .5) +
  geom_smooth(se = FALSE) +
  geom_abline(slope = 1, 
              intercept = 0, 
              color = "darkred") +
  labs(x = "Actual log(asthma)", 
       y = "Predicted log(asthma)")

Random Forest Model

pollution_rfrecipe <- recipe(log_alzheimers ~ ., data = pollution_training) %>% 
  step_rm(County,
          Year,
          All.causes..total.,
        #  Alzheimer.s.disease,
          Malignant.neoplasms,
          Chronic.lower.respiratory.diseases,
          Diabetes.mellitus,
          Assault..homicide.,
          Diseases.of.heart,
          asthma_er_avg,
          Essential.hypertension.and.hypertensive.renal.disease,
          Accidents..unintentional.injuries.,
          Chronic.liver.disease.and.cirrhosis,
          Nephritis..nephrotic.syndrome.and.nephrosis,
          Parkinson.s.disease,
          Influenza.and.pneumonia,
          Cerebrovascular.diseases,
          Intentional.self.harm..suicide.,
          FIPS,
          cancer_incidence_rate,
          asthma_deaths,
          Bladder_Surgery_Ct,
          Brain_Surgery_Ct,
          Breast_Surgery_Ct,
          Colon_Surgery_Ct,
          Esophagus_Surgery_Ct,
          Liver_Surgery_Ct,
          Lung_Surgery_Ct,
          Pancreas_Surgery_Ct,
          Prostate_Surgery_Ct,
          Rectum_Surgery_Ct,
          Stomach_Surgery_Ct) %>% 
  step_normalize(all_predictors(), 
                 -all_nominal()) %>% 
  step_dummy(all_nominal(), 
             -all_outcomes())
pollution_rfrecipe %>% 
  prep(pollution_training) %>%
  juice()
## # A tibble: 114 × 33
##    annual_mean_pm25 annual_mean_ozone annual_mean_pb annual_mean_pm10
##               <dbl>             <dbl>          <dbl>            <dbl>
##  1           0.769             1.91           -0.438           1.20  
##  2          -0.791            -0.0862         -0.176          -0.426 
##  3           0.0376           -0.153          -0.176           0.136 
##  4           0.130            -0.118          -0.176           0.0213
##  5          -0.851             0.0999         -0.969          -0.949 
##  6          -0.625            -0.107          -0.860          -0.442 
##  7           0.861            -0.0905         -0.176          -0.150 
##  8           1.70              0.645          -0.176          -1.08  
##  9          -1.25             -1.03           -0.176          -0.150 
## 10           3.09             -0.0905         -0.176          -0.150 
## # … with 104 more rows, and 29 more variables: annual_mean_co <dbl>,
## #   annual_mean_no2 <dbl>, annual_mean_so2 <dbl>, hpi2score <dbl>,
## #   economic <dbl>, education <dbl>, housing <dbl>, healthcareaccess <dbl>,
## #   neighborhood <dbl>, pollution <dbl>, transportation <dbl>, social <dbl>,
## #   insured <dbl>, uncrowded <dbl>, homeownership <dbl>, automobile <dbl>,
## #   commute <dbl>, inpreschool <dbl>, inhighschool <dbl>, bachelorsed <dbl>,
## #   employed <dbl>, abovepoverty <dbl>, income <dbl>, tree_canopy <dbl>, …
ranger_pollution <- 
  rand_forest(mtry = tune(), 
              min_n = tune(), 
              trees = 200) %>% 
  set_mode("regression") %>% 
  set_engine("ranger")
pollution_rfworkflow <- 
  workflow() %>% 
  add_recipe(pollution_rfrecipe) %>% 
  add_model(ranger_pollution) 
# Set up penalty grid model 
rf_penalty_grid <- grid_regular(finalize(mtry(),
                                         pollution_training %>%
                                           select(-log_alzheimers)),
                                min_n(),
                                levels = 3)
set.seed(456) 

pollution_cv <- vfold_cv(pollution_training, v = 5) 

# Tune model
pollution_rfTUNE <- 
  pollution_rfworkflow %>% 
  tune_grid(
    resamples = pollution_cv,
    grid = rf_penalty_grid,
    control = ctrl_grid
  )

best_rmse <- 
  pollution_rfTUNE %>%
  select_best(metric = "rmse")

best_rmse
## # A tibble: 1 × 3
##    mtry min_n .config             
##   <int> <int> <chr>               
## 1     1     2 Preprocessor1_Model1
pol_rfFinal <- pollution_rfworkflow %>%
  finalize_workflow(best_rmse) %>%
  fit(data = pollution_training)
# OOB error (MSE)
pol_rfFinal$fit$fit$fit$prediction.error
## [1] 9.722797e-09
# OOB RMSE
sqrt(pol_rfFinal$fit$fit$fit$prediction.error)
## [1] 9.860424e-05
set.seed(456)

metric <- metric_set(rmse) 
ctrl_res <- control_stack_resamples() 

pollution_rfcv <- pollution_rfworkflow %>%
  fit_resamples(pollution_cv, 
               metrics = metric, 
               control = ctrl_res) 

collect_metrics(pollution_rfTUNE)
## # A tibble: 18 × 8
##     mtry min_n .metric .estimator      mean     n    std_err .config            
##    <int> <int> <chr>   <chr>          <dbl> <int>      <dbl> <chr>              
##  1     1     2 rmse    standard   0.0000973     5 0.00000979 Preprocessor1_Mode…
##  2     1     2 rsq     standard   0.725         5 0.0766     Preprocessor1_Mode…
##  3    32     2 rmse    standard   0.000103      5 0.0000133  Preprocessor1_Mode…
##  4    32     2 rsq     standard   0.626         5 0.115      Preprocessor1_Mode…
##  5    63     2 rmse    standard   0.0000996     5 0.0000133  Preprocessor1_Mode…
##  6    63     2 rsq     standard   0.651         5 0.116      Preprocessor1_Mode…
##  7     1    21 rmse    standard   0.000123      5 0.0000102  Preprocessor1_Mode…
##  8     1    21 rsq     standard   0.615         5 0.0784     Preprocessor1_Mode…
##  9    32    21 rmse    standard   0.000113      5 0.0000147  Preprocessor1_Mode…
## 10    32    21 rsq     standard   0.560         5 0.116      Preprocessor1_Mode…
## 11    63    21 rmse    standard   0.000115      5 0.0000146  Preprocessor1_Mode…
## 12    63    21 rsq     standard   0.551         5 0.119      Preprocessor1_Mode…
## 13     1    40 rmse    standard   0.000136      5 0.0000105  Preprocessor1_Mode…
## 14     1    40 rsq     standard   0.486         5 0.0754     Preprocessor1_Mode…
## 15    32    40 rmse    standard   0.000131      5 0.0000145  Preprocessor1_Mode…
## 16    32    40 rsq     standard   0.406         5 0.115      Preprocessor1_Mode…
## 17    63    40 rmse    standard   0.000130      5 0.0000132  Preprocessor1_Mode…
## 18    63    40 rsq     standard   0.422         5 0.105      Preprocessor1_Mode…

Stacking

pollution_stack <- 
  stacks() %>% 
  add_candidates(boost_tune) %>%
  add_candidates(pollution_lasso_tune) %>% 
  add_candidates(pollution_rfTUNE)

as_tibble(pollution_stack)
## # A tibble: 114 × 26
##    log_alzheimers boost_tune_1_01 boost_tune_1_04 boost_tune_1_05
##             <dbl>           <dbl>           <dbl>           <dbl>
##  1       0.000394             0.5           0.500           0.499
##  2       0.000458             0.5           0.500           0.499
##  3       0.000466             0.5           0.500           0.499
##  4       0.000787             0.5           0.500           0.499
##  5       0.000698             0.5           0.500           0.499
##  6       0.000489             0.5           0.500           0.499
##  7       0.000429             0.5           0.500           0.499
##  8       0.000405             0.5           0.500           0.499
##  9       0.000632             0.5           0.500           0.499
## 10       0.000429             0.5           0.500           0.499
## # … with 104 more rows, and 22 more variables: boost_tune_1_06 <dbl>,
## #   boost_tune_1_07 <dbl>, boost_tune_1_08 <dbl>, boost_tune_1_09 <dbl>,
## #   boost_tune_1_10 <dbl>, pollution_lasso_tune_1_01 <dbl>,
## #   pollution_lasso_tune_1_06 <dbl>, pollution_lasso_tune_1_07 <dbl>,
## #   pollution_lasso_tune_1_08 <dbl>, pollution_lasso_tune_1_09 <dbl>,
## #   pollution_lasso_tune_1_10 <dbl>, pollution_lasso_tune_1_11 <dbl>,
## #   pollution_lasso_tune_1_12 <dbl>, pollution_rfTUNE_1_1 <dbl>, …
set.seed(456)

pollution_blend <- 
  pollution_stack %>% 
  blend_predictions()

pollution_blend
## # A tibble: 4 × 3
##   member                    type         weight
##   <chr>                     <chr>         <dbl>
## 1 boost_tune_1_05           boost_tree  2.56   
## 2 pollution_rfTUNE_1_1      rand_forest 1.37   
## 3 boost_tune_1_07           boost_tree  1.36   
## 4 pollution_lasso_tune_1_09 linear_reg  0.00602
pollution_blend$metrics %>% 
  filter(.metric == "rmse")
## # A tibble: 6 × 8
##    penalty mixture .metric .estimator      mean     n    std_err .config        
##      <dbl>   <dbl> <chr>   <chr>          <dbl> <int>      <dbl> <chr>          
## 1 0.000001       1 rmse    standard   0.0000930    25 0.00000253 Preprocessor1_…
## 2 0.00001        1 rmse    standard   0.0000918    25 0.00000227 Preprocessor1_…
## 3 0.0001         1 rmse    standard   0.000132     25 0.00000428 Preprocessor1_…
## 4 0.001          1 rmse    standard   0.000157     25 0.00000370 Preprocessor1_…
## 5 0.01           1 rmse    standard   0.000157     25 0.00000370 Preprocessor1_…
## 6 0.1            1 rmse    standard   0.000157     25 0.00000370 Preprocessor1_…
autoplot(pollution_blend)

autoplot(pollution_blend, type = "members")

autoplot(pollution_blend, type = "weights")

pollution_final_stack <- pollution_blend %>% 
  fit_members()

pollution_final_stack
## # A tibble: 4 × 3
##   member                    type         weight
##   <chr>                     <chr>         <dbl>
## 1 boost_tune_1_05           boost_tree  2.56   
## 2 pollution_rfTUNE_1_1      rand_forest 1.37   
## 3 boost_tune_1_07           boost_tree  1.36   
## 4 pollution_lasso_tune_1_09 linear_reg  0.00602
pollution_final_stack %>% 
  predict(new_data = pollution_testing) %>% 
  bind_cols(pollution_testing) %>% 
  ggplot(aes(x = log_alzheimers, 
             y = .pred)) +
  geom_point(alpha = .5, 
             size = .5) +
  geom_smooth(se = FALSE) +
  geom_abline(slope = 1, 
              intercept = 0, 
              color = "darkred") +
  labs(x = "Actual log(alzheimer's disease)", 
       y = "Predicted log(alzheimer's disease)")

library(Metrics)
data_rmse <- pollution_final_stack %>% 
  predict(new_data = pollution_testing) %>% 
  bind_cols(pollution_testing) 

rmse(data_rmse$log_alzheimers, data_rmse$.pred)
## [1] 0.0001533423
write_rds(pollution_final_stack, "alzheimers_model.rds")