Comparing Many Models: Crude Oil Futures Price

Crude prices are declining after a sharp revision to U.S. employment data raised concerns about slowing demand. Year to date, the U.S. crude oil futures price has fluctuated above the point forecast line of the XGBoost model.

Source code:

library(tidyverse)
library(tidymodels)
library(tidyquant)
library(timetk)
library(modeltime)
library(workflowsets)


#Crude oil futures from Yahoo Finance (ticker "CL=F"), quoted in USD
#The daily closes are collapsed to one row per month and the resulting
#month index is converted to a Date so it can be joined on later
#NOTE(review): assumes as.Date() maps each month to its first day, which is
#what the join with the FRED series below relies on - confirm
df_crude_oil <- 
  tq_get(x = "CL=F", get = "stock.prices") %>% 
  tq_transmute(
    select = close,
    mutate_fun = to.monthly,
    col_rename = "crude_oil"
  ) %>% 
  mutate(date = as.Date(date))


#Federal Funds Effective Rate (economic data series "FEDFUNDS")
#Only the date and the value are kept; the value column is renamed
#from price to fed_funds
df_fed_funds <- 
  tq_get(x = "FEDFUNDS", get = "economic.data") %>% 
  select(date, price) %>% 
  rename(fed_funds = price)

#Industrial Production: Mining: Crude Oil (NAICS = 21112) (Index 2014 = 100)
#Economic data series "IPG21112S"; only the date and the value are kept and
#the value column is renamed from price to crude_oil_production
df_crude_oil_production <- 
  tq_get(x = "IPG21112S", get = "economic.data") %>% 
  select(date, price) %>% 
  rename(crude_oil_production = price)

#Merging all the data sets
#The three tables are joined on their date column; the key is spelled out
#so the joins cannot silently pick up any other column the tables might
#come to share, and so no "Joining with ..." message is printed
#Rows with a missing value in any series are dropped afterwards
df_merged <- 
  df_crude_oil %>%
  left_join(df_fed_funds, by = "date") %>% 
  left_join(df_crude_oil_production, by = "date") %>% 
  drop_na()


#Splitting the data
#The most recent year is held out for assessment and everything before it
#is used for training (cumulative = TRUE)
#The date column is named explicitly instead of relying on automatic
#detection of the date variable
df_split <- 
  df_merged %>% 
  time_series_split(date_var = date,
                    assess = "1 year",
                    cumulative = TRUE)

#Training and test sets taken from the split
df_train <- training(df_split)
df_test <- testing(df_split)

#Bootstrap resamples of the training set, used for tuning
#The seed makes the 100 resamples reproducible
#NOTE(review): bootstrapping draws rows with replacement and so ignores the
#time order of the observations - confirm this is acceptable here
set.seed(12345)
df_folds <- 
  df_train %>% 
  bootstraps(times = 100)


#Preprocessing shared by all the models
#crude_oil is the outcome; every other column is a predictor
rec_all <- 
  recipe(crude_oil ~ ., data = df_train) %>% 
  #Swap the Date column for a numeric copy of it
  step_mutate(date_num = as.numeric(date)) %>% 
  step_rm(date) %>% 
  #Normalize every numeric predictor
  step_normalize(all_numeric_predictors())
  

#Models

#Radial basis function support vector machine (SVM) via kernlab
#cost, rbf_sigma and margin are left as tune() placeholders to be filled in
#by the grid search
mod_svm_rbf <- 
  svm_rbf(
    mode = "regression",
    cost = tune(),
    rbf_sigma = tune(),
    margin = tune()
  ) %>% 
  set_engine("kernlab")


#Multivariate adaptive regression splines (MARS) via earth
#num_terms and prune_method are tuned; nfold = 10 is handed to the earth
#engine as an engine-specific argument
mod_mars <- 
  mars(
    mode = "regression",
    num_terms = tune(),
    prune_method = tune()
  ) %>% 
  set_engine("earth", nfold = 10)


#Boosted trees via xgboost
#mtry, trees, min_n and learn_rate are all left as tune() placeholders
mod_boost_tree <- 
  boost_tree(
    mode = "regression",
    mtry = tune(),
    trees = tune(),
    min_n = tune(),
    learn_rate = tune()
  ) %>% 
  set_engine("xgboost")



#Workflow set: the shared recipe combined with each of the three models
wflow_all <- 
  workflow_set(
    preproc = list(svm_all = rec_all),
    models = list(
      SVM_rbf = mod_svm_rbf,
      MARS = mod_mars,
      XGBoost = mod_boost_tree
    )
  ) %>% 
  #Strip the recipe prefix so each workflow ID is just the model name
  mutate(wflow_id = str_remove(wflow_id, pattern = "svm_all_"))



#Tuning and evaluating all the models
#The control object asks for the predictions and the workflow to be saved
#with the tuning results
#NOTE(review): parallel_over presumably only matters when a parallel backend
#is registered, and none is set up in this script - confirm
grid_ctrl <-
  control_grid(
    parallel_over = "everything",
    save_pred = TRUE,
    save_workflow = TRUE
  )

#Every workflow in the set is tuned with a grid of size 10 on the bootstrap
#resamples; the seed is passed so the run is reproducible
grid_results <-
  workflow_map(
    wflow_all,
    fn = "tune_grid",
    resamples = df_folds,
    grid = 10,
    control = grid_ctrl,
    seed = 98765
  )


#Accuracy of the grid results
#Only the best candidate of each workflow is kept, ranked by rsq; the output
#shows the model name, the metric and its mean
grid_results %>% 
  rank_results(rank_metric = "rsq", select_best = TRUE) %>%
  select(Models = wflow_id, .metric, mean)


#Finalizing the model with the best parameters
#Take the tuning results of the XGBoost workflow and pick the candidate with
#the best rsq
best_param <- 
  grid_results %>%
  extract_workflow_set_result(id = "XGBoost") %>% 
  select_best(metric = "rsq")

#Plug those parameters into the XGBoost workflow and fit it on the
#training set
wflw_fit <- 
  grid_results %>% 
  extract_workflow(id = "XGBoost") %>% 
  finalize_workflow(parameters = best_param) %>% 
  fit(data = df_train)

#Calibration data
#The fitted workflow is calibrated against the held-out test set
df_cal <- modeltime_calibrate(wflw_fit, new_data = df_test)

#Forecast with intervals for XGBoost over the test period
#The actuals shown are limited to the window starting 12 months before the
#latest date; max() is used so the cutoff does not depend on the row order
#or on which attached package's last() happens to be in effect
#NOTE(review): the "Bricolage Grotesque" font has to be available to the
#graphics device for the theme to render as intended - confirm
df_cal %>%
  modeltime_forecast(actual_data = df_merged %>% 
                                   filter(date >= max(date) - months(12)),
                     new_data = df_test) %>%
  plot_modeltime_forecast(.interactive = FALSE,
                          .legend_show = FALSE,
                          .line_size = 1,
                          .color_lab = "",
                          .title = "Confidence Intervals for Crude Oil Future Prices (WTI)") +
  labs(subtitle = "Using XGBoost") + 
  theme_minimal(base_family = "Bricolage Grotesque",
                base_size = 16)

Leave a comment

I’m Selcuk Disci

Welcome to DataGeeek.com, dedicated to data science and machine learning with R, mostly based on financial data.

Let’s connect