Time Series Machine Learning: Shanghai Composite

Shanghai Composite does not seem to be at an ideal point for entry.

Source code:

library(tidyverse)
library(tidyquant)
library(timetk)
library(tidymodels)
library(modeltime)
library(workflowsets)

#Shanghai Composite Index (000001.SS)
df_shanghai <- 
  tq_get("000001.SS", from = "2015-09-01") %>% 
  tq_transmute(select = close,
               mutate_fun = to.monthly,
               col_rename = "sse") %>% 
  mutate(date = as.Date(date)) 

#Splitting
split <- 
  df_shanghai %>% 
  time_series_split(assess = "1 year", 
                    cumulative = TRUE)

df_train <- training(split)
df_test <- testing(split)

#Time series cross validation for tuning
df_folds <- time_series_cv(df_train,
                           initial = 77, 
                           assess = 12)


#Preprocessing 
rec <- 
  recipe(sse ~ date, data = df_train) %>% 
  step_mutate(date_num = as.numeric(date)) %>% 
  step_date(date, features = "month") %>% 
  step_dummy(date_month, one_hot = TRUE) %>% 
  step_normalize(all_numeric_predictors())


rec %>% 
  prep() %>% 
  bake(new_data = NULL) %>% view()


#Model
mod <- 
  arima_boost(
    min_n = tune(),
    learn_rate = tune(),
    trees = tune()
  ) %>%
  set_engine(engine = "auto_arima_xgboost")

#Workflow set
wflow_mod <- 
  workflow_set(
    preproc = list(rec = rec),
    models = list(mod = mod)
  ) 


#Tuning and evaluating the model on all the samples
grid_ctrl <-
  control_grid(
    save_pred = TRUE,
    parallel_over = "everything",
    save_workflow = TRUE
  )

grid_results <-
  wflow_mod %>%
  workflow_map(
    seed = 98765,
    resamples = df_folds,
    grid = 10,
    control = grid_ctrl
  )


#Accuracy of the grid results
grid_results %>% 
  rank_results(select_best = TRUE, 
               rank_metric = "rmse") %>%
  select(Models = wflow_id, .metric, mean)


#Finalizing the model with the best parameters
best_param <- 
  grid_results %>%
  extract_workflow_set_result("rec_mod") %>% 
  select_best(metric = "rmse")


wflw_fit <- 
  grid_results %>% 
  extract_workflow("rec_mod") %>% 
  finalize_workflow(best_param) %>% 
  fit(df_train)


#Calibrate the model to the testing set
calibration_boost <- 
  wflw_fit %>%
  modeltime_calibrate(new_data = df_test)

#Accuracy of the finalized model
calibration_boost %>%
  modeltime_accuracy(metric_set = metric_set(mape, smape))



#Predictive intervals
calibration_boost %>%
  modeltime_forecast(actual_data = df_shanghai %>% 
                                   filter(date >= last(date) - months(12)),
                     new_data = df_test) %>%
  plot_modeltime_forecast(.interactive = FALSE,
                          .legend_show = FALSE,
                          .line_size = 1.5,
                          .color_lab = "",
                          .title = "Shanghai Composite Index") +
  geom_point(aes(color = .key)) +
  labs(subtitle = "Monthly Data<br><span style = 'color:darkgrey;'>Predictive Intervals</span><br><span style = 'color:red;'>Point Forecast Line</span>") + 
  scale_x_date(breaks = c(make_date(2023,11,1), 
                          make_date(2024,5,1),
                          make_date(2024,10,1)),
               labels = scales::label_date(format = "%b'%y"),
               expand = expansion(mult = c(.1, .1))) +
  ggthemes::theme_wsj(
    base_family = "Roboto Slab",
    title_family = "Roboto Slab",
    color = "blue",
    base_size = 12) +
  theme(legend.position = "none",
        plot.background = element_rect(fill = "lightyellow", color = "lightyellow"),
        plot.title = element_text(size = 24),
        axis.text = element_text(size = 16),
        plot.subtitle = ggtext::element_markdown(size = 20, face = "bold"))

4 responses to “Time Series Machine Learning: Shanghai Composite”

  1. TRANG LE Avatar
    TRANG LE

    Hello,

    Thanks for sharing the code for this project. I’m looking to learn time series with modeltime so this is a fantastic post. When I copied and ran the code, I got an error saying “df_merged is missing”. Could you direct me how to create df_merged?

    Thank you,

    Trang

    Like

    1. Selcuk Disci Avatar

      Thank you for your kind words. I’ve corrected the mistake. Please check again.

      Like

      1. elaek1 Avatar
        elaek1

        Thank you for resolving the issue. Could you advise on how to use machine learning to fill gaps in time series data, specifically for rainfall measurements?

        Like

      2. Selcuk Disci Avatar

        You can write ‘pad_by_time’ on the search box on my site and examine all the articles pop up.

        Liked by 1 person

Leave a comment

I’m Selcuk Disci

Welcome to DataGeeek.com, dedicated to data science and machine learning with R, mostly based on financial data.

Let’s connect