Time Series Machine Learning: S&P 500

It does not seem to be a safe entry point for the S&P 500 market ahead of the FED rate cuts.

Source code:

library(tidyverse)
library(tidymodels)
library(timetk)
library(tidyquant)
library(modeltime)
library(ggthemes)


#FED Interest Rates
df_fedfunds <- 
  read_csv("https://raw.githubusercontent.com/mesdi/investingcom/main/fedfunds.csv") %>% 
  janitor::clean_names() %>% 
  select(date = release_date, fedfunds = actual) %>% 
  #Converts string to date object
  mutate(date = case_when(
    !is.na(parse_date(date, format = "%b %d, %Y")) ~ parse_date(date, format = "%b %d, %Y"),
    !is.na(parse_date(date, format = "%d-%b-%y")) ~ parse_date(date, format = "%d-%b-%y")
  )) %>% 
  mutate(date = floor_date(date, "month") %m+% months(1),
         fedfunds = str_remove(fedfunds, "%") %>% as.numeric()) %>% 
  #makes regular time series by filling the time gaps
  pad_by_time(date, .by = "month") %>% 
  fill(fedfunds, .direction = "down") %>% 
  #removes duplicated points
  distinct(date, .keep_all = TRUE) %>% 
  drop_na()


#S&P 500 (^GSPC)
df_sp500 <- 
  tq_get("^GSPC", to = "2024-09-01") %>% 
  tq_transmute(select = close,
               mutate_fun = to.monthly,
               col_rename = "sp500") %>% 
  mutate(date = as.Date(date)) %>% 
  drop_na()

#Merging all the datasets
df_merged <- 
  df_sp500 %>% 
  left_join(df_fedfunds) %>% 
  drop_na()


#Splitting
split <- 
  df_merged %>% 
  time_series_split(assess = "1 year", 
                    cumulative = TRUE)

df_train <- training(split)
df_test <- testing(split)

#Time series cross validation for tuning
df_folds <- time_series_cv(df_train,
                           initial = 80, 
                           assess = 12)


#Preprocessing for Boosting ARIMA
rec_arima_boost <- 
  recipe(sp500 ~ ., data = df_train) %>% 
  step_date(date, features = c("year", "month")) %>% 
  step_dummy(date_month, one_hot = TRUE) %>% 
  step_normalize(all_numeric_predictors())



#Boosted ARIMA Regression Models
#(https://business-science.github.io/modeltime/reference/arima_boost.html)
mod_arima_boost <- 
  arima_boost(
    min_n = tune(),
    learn_rate = tune(),
    trees = tune()
  ) %>%
  set_engine(engine = "auto_arima_xgboost")


#Workflow set
wflow_arima_boost <- 
  workflow_set(
    preproc = list(rec = rec_arima_boost),
    models = list(mod = mod_arima_boost)
  ) 


#Tuning and evaluating the model on all the samples
grid_ctrl <-
  control_grid(
    save_pred = TRUE,
    parallel_over = "everything",
    save_workflow = TRUE
  )

grid_results <-
  wflow_arima_boost %>%
  workflow_map(
    seed = 98765,
    resamples = df_folds,
    grid = 10,
    control = grid_ctrl
  )


#Accuracy of the grid results
grid_results %>% 
  rank_results(select_best = TRUE, 
               rank_metric = "rsq") %>%
  select(Models = wflow_id, .metric, mean)


#Finalizing the model with the best parameters
best_param <- 
  grid_results %>%
  extract_workflow_set_result("rec_mod") %>% 
  select_best(metric = "rsq")


wflw_fit <- 
  grid_results %>% 
  extract_workflow("rec_mod") %>% 
  finalize_workflow(best_param) %>% 
  fit(df_train)


#Calibrate the model to the testing set
calibration_boost <- 
  wflw_fit %>%
  modeltime_calibrate(new_data = df_test)

#Accuracy of the finalized model
calibration_boost %>%
  modeltime_accuracy(metric_set = metric_set(rmse,rsq)) %>% 
  select(.model_desc)


#Predictive intervals
calibration_boost %>%
  modeltime_forecast(actual_data = df_merged %>% 
                       filter(date >= last(date) - months(12)),
                     new_data = df_test) %>%
  plot_modeltime_forecast(.interactive = FALSE,
                          .legend_show = FALSE,
                          .line_size = 1.5,
                          .color_lab = "",
                          .title = "Predictive Intervals for S&P 500 ") +
  labs(subtitle = "Monthly Index<br><span style = 'color:red;'>Point Forecast Line</span>") + 
  scale_x_date(breaks = c(make_date(2023,8,1), 
                          make_date(2024,1,1),
                          make_date(2024,8,1)),
               labels = scales::label_date(format = "%Y %b"),
               expand = expansion(mult = c(.1, .1))) +
  theme_wsj(base_family = "Bricolage Grotesque",
            color = "blue",
            base_size = 12) +
  theme(legend.position = "none",
        plot.background = element_rect(fill = "khaki", color = "khaki"),
        plot.title = element_text(size = 24),
        axis.text = element_text(size = 16),
        plot.subtitle = ggtext::element_markdown(size = 20, face = "bold"))

2 responses to “Time Series Machine Learning: S&P 500”

  1. James Avatar
    James

    Error

    recipe(nvidia ~ ., data = df_train) %>%

    correct

    recipe(sp500 ~ ., data = df_train) %>%

    Liked by 1 person

  2. roaringsurf Avatar

    There is an error in the code that is posted to R-bloggers:

    recipe(sp500 ~ ., data = df_train) %>%

    is

    recipe(nvidia ~ ., data = df_train) %>%

    on R-bloggers

    Like

Leave a reply to James Cancel reply

I’m Selcuk Disci

The DataGeeek focuses on machine learning, deep learning, and Generative AI in data science using financial data for educational and informational purposes.

Let’s connect