Machine Learning Model

Description

The goal of this section is to determine whether our statistic for wind categorization can be predicted using a machine learning model. We chose a random forest, since the response variable is categorical with more than two levels. In building the model, we tuned three hyperparameters via cross-validation: the number of trees, the minimum number of samples required at a leaf node, and the number of features to consider at each split in a tree. Because this tuning step is time-consuming for a dataset of this size, we ran it with parallel computing. Future work could explore other machine learning models to improve accuracy.

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(tidymodels)
Warning: package 'tidymodels' was built under R version 4.4.2
── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
✔ broom        1.0.6     ✔ rsample      1.2.1
✔ dials        1.3.0     ✔ tibble       3.2.1
✔ ggplot2      3.5.1     ✔ tidyr        1.3.1
✔ infer        1.0.7     ✔ tune         1.2.1
✔ modeldata    1.4.0     ✔ workflows    1.1.4
✔ parsnip      1.2.1     ✔ workflowsets 1.1.0
✔ purrr        1.0.2     ✔ yardstick    1.3.1
✔ recipes      1.1.0     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ purrr::discard() masks scales::discard()
✖ dplyr::filter()  masks stats::filter()
✖ dplyr::lag()     masks stats::lag()
✖ recipes::step()  masks stats::step()
• Learn how to get started at https://www.tidymodels.org/start/
library(ranger)
library(parallel)
library(future)

Preparing the Data

# Read in data
ML_data <- read.csv("ML_wind_data.csv")

# Remove variables in aggregate statistic
ML_data <- ML_data %>%
  select(-INJURIES_DIRECT, -INJURIES_INDIRECT, -DEATHS_DIRECT, -DEATHS_INDIRECT,
         -DAMAGE_PROPERTY, -DAMAGE_CROPS, -aggregate_stat)

# Prepare remaining data for analysis -- make sure character strings are factors, etc.
ML_data <- ML_data %>%
  mutate(STATE = as.factor(STATE),
         YEAR = as.factor(YEAR),
         MONTH_NAME = as.factor(MONTH_NAME),
         EVENT_TYPE = as.factor(EVENT_TYPE),
         MAGNITUDE_TYPE = as.factor(MAGNITUDE_TYPE),
         scaled_aggregate = as.factor(scaled_aggregate))

ML_data <- na.omit(ML_data)
head(ML_data)
  EVENT_ID          STATE YEAR MONTH_NAME               EVENT_TYPE MAGNITUDE
1    67398       ARKANSAS 2008    January        Thunderstorm Wind        50
2   143292       ILLINOIS 2008   December        Thunderstorm Wind        65
3   143294  LAKE MICHIGAN 2008   December Marine Thunderstorm Wind        39
4   143297        INDIANA 2008   December        Thunderstorm Wind        52
5    74159 SOUTH CAROLINA 2008      March        Thunderstorm Wind        55
6    74161 SOUTH CAROLINA 2008      March        Thunderstorm Wind        50
  MAGNITUDE_TYPE storm_time_duration change_lon change_lat scaled_aggregate
1             EG                   0       0.00       0.00                0
2             EG                   0       0.00       0.00                0
3             MG                   0       0.00       0.00                0
4             EG                   0       0.00       0.00                0
5             EG                   0       0.00       0.00                0
6             EG                 720       0.18       0.05                0

Split the Data and Define the Model

# Split data into training/test sets
set.seed(1234)
data_split <- initial_split(ML_data, prop = 3/4)
train_data <- training(data_split)
test_data  <- testing(data_split)

# Define Recipe
ML_recipe <- recipe(scaled_aggregate ~ ., data = train_data) %>%
  update_role(EVENT_ID, new_role = "ID")
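
As a quick sanity check (not part of the original analysis), summary() on a recipe lists each variable's assigned role, confirming that EVENT_ID now acts as an identifier rather than a predictor:

```r
# Inspect roles: EVENT_ID should show role "ID",
# scaled_aggregate "outcome", and everything else "predictor"
summary(ML_recipe)
```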

# Define Random Forest Model Specs
model_spec <- rand_forest(mtry = tune(), trees = tune(), min_n = tune()) %>%
  set_mode("classification") %>%
  set_engine("ranger", importance = "permutation")

# Create Workflow
ML_workflow <- workflow() %>%
  add_recipe(ML_recipe) %>%
  add_model(model_spec)

Hyperparameter Tuning (with parallel computing)

# Set up parallel computing
numCores <- detectCores()
plan(multisession, workers = numCores)
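
One hedged refinement: with the future package, availableCores() is generally preferred over parallel::detectCores() because it respects container and scheduler limits, and reserving a core keeps the session responsive. A minimal alternative setup:

```r
# Alternative worker count (assumption: interactive session where we
# want to leave one core free for the OS / IDE)
plan(multisession, workers = max(1, availableCores() - 1))
```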

# Run hyperparameter tuning
## define cross validation folds
cv_folds <- vfold_cv(train_data, v = 3)

## define hyperparameters to tune
parameter_grid <- grid_regular(
  mtry(range = c(2, 7)),
  trees(range = c(50, 250)),
  min_n(range = c(1, 10))
)
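
By default, grid_regular() uses levels = 3 per parameter, so this grid contains 3 × 3 × 3 = 27 candidate models, which matches the 81 rows returned by collect_metrics() (27 combinations × 3 metrics). A quick check:

```r
# 27 rows: 3 levels each for mtry, trees, and min_n
nrow(parameter_grid)
```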

## tune hyperparameters
tuned_results <- ML_workflow %>%
  tune_grid(resamples = cv_folds, grid = parameter_grid) 

tuned_results %>%
  collect_metrics()
# A tibble: 81 × 9
    mtry trees min_n .metric     .estimator  mean     n  std_err .config        
   <int> <int> <int> <chr>       <chr>      <dbl> <int>    <dbl> <chr>          
 1     2    50     1 accuracy    multiclass 0.652     3 0.00371  Preprocessor1_…
 2     2    50     1 brier_class multiclass 0.235     3 0.00183  Preprocessor1_…
 3     2    50     1 roc_auc     hand_till  0.776     3 0.00194  Preprocessor1_…
 4     4    50     1 accuracy    multiclass 0.684     3 0.00299  Preprocessor1_…
 5     4    50     1 brier_class multiclass 0.215     3 0.00203  Preprocessor1_…
 6     4    50     1 roc_auc     hand_till  0.797     3 0.00138  Preprocessor1_…
 7     7    50     1 accuracy    multiclass 0.677     3 0.00297  Preprocessor1_…
 8     7    50     1 brier_class multiclass 0.222     3 0.00222  Preprocessor1_…
 9     7    50     1 roc_auc     hand_till  0.790     3 0.000699 Preprocessor1_…
10     2   150     1 accuracy    multiclass 0.652     3 0.00522  Preprocessor1_…
# ℹ 71 more rows
# Return to sequential (single-core) processing
plan(sequential)

Train and Test Model

# Finalize workflow with best hyperparameters
best_params <- tuned_results %>%
  select_best(metric = "roc_auc")
best_params
# A tibble: 1 × 4
   mtry trees min_n .config              
  <int> <int> <int> <chr>                
1     4   250    10 Preprocessor1_Model26
ML_workflow_final <- ML_workflow %>% finalize_workflow(best_params)

# Fit the finalized workflow on the training data
trained_model <- ML_workflow_final %>%
  fit(data = train_data)

# Generate predictions on the test set
test_predict <- predict(trained_model, new_data = test_data) %>%
  bind_cols(test_data)
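
Since tuning selected hyperparameters by ROC AUC, it is natural to also report ROC AUC on the test set; this requires class probabilities rather than hard class predictions. A sketch, assuming the .pred_* probability columns are named after the levels of scaled_aggregate (the yardstick default):

```r
# Predicted class probabilities for the test set
test_probs <- predict(trained_model, new_data = test_data, type = "prob") %>%
  bind_cols(test_data)

# Multiclass ROC AUC (Hand-Till), matching the metric used in tuning
test_probs %>%
  roc_auc(truth = scaled_aggregate, starts_with(".pred_"))
```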

Evaluating the Model

# Evaluate model
model_eval <- test_predict %>%
  metrics(truth = scaled_aggregate, estimate = .pred_class)

model_eval
# A tibble: 2 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy multiclass     0.690
2 kap      multiclass     0.459
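
Accuracy and kappa summarize overall performance; a confusion matrix (a standard yardstick helper, not part of the original output) would show which wind categories are most often confused with one another:

```r
# Cross-tabulate predicted vs. true categories
test_predict %>%
  conf_mat(truth = scaled_aggregate, estimate = .pred_class)
```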
# Feature Importance
importance <- trained_model %>%
  extract_fit_engine() %>%
  ranger::importance()

importance
              STATE                YEAR          MONTH_NAME          EVENT_TYPE 
        0.130906821         0.059272694         0.029812890         0.069625612 
          MAGNITUDE      MAGNITUDE_TYPE storm_time_duration          change_lon 
        0.133540752         0.036821833         0.059528004         0.014531206 
         change_lat 
        0.008056711 
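
To make the ranking discussed in the conclusion easier to read, the named importance vector can be sorted (a presentation step only):

```r
# Features ordered from most to least important
sort(importance, decreasing = TRUE)
```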

Conclusion

In the end, our model with the best hyperparameters had an accuracy of 0.6898446 and a kappa value of 0.4593361; kappa measures agreement beyond random chance, where 0 indicates chance-level agreement and 1 indicates perfect agreement. The most important features in the model were wind magnitude and the state in which the event took place. The least important features were the changes in latitude and longitude, though this may reflect the fact that many wind events did not record those changes. Because both accuracy and kappa exceed what random chance would produce, our statistic for wind categorization appears predictable, and the future directions described above are worth pursuing.
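
As one concrete future direction, the tidymodels workflow makes it easy to swap in a different model specification while reusing the same recipe and resampling setup. A hedged sketch using gradient boosting (requires the xgboost package; the factor predictors may additionally need a step_dummy() in the recipe, depending on how the engine handles factors):

```r
# Hypothetical alternative: boosted trees instead of a random forest
boost_spec <- boost_tree(trees = tune(), tree_depth = tune(), learn_rate = tune()) %>%
  set_mode("classification") %>%
  set_engine("xgboost")

# Reuse the existing workflow, replacing only the model
boost_workflow <- ML_workflow %>%
  update_model(boost_spec)
```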