Data Extraction for preselected commodities portfolio

Author

Rodrigo Hermont Ozon, Érick Oliveira Rodrigues

Published

October 7, 2024

Abstract

This small document have the goal to share the time series extraction and the two basic features building, like price returns and their conditional variance…



Intro

[… to be written …]


Python codes

Code

#import yfinance as yf
from yahooquery import Ticker
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from arch import arch_model
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from plotnine import ggplot, aes, geom_line, facet_wrap, labs, theme, element_text, theme_minimal

The portfolio contains the following commodities price returns:

  • Corn Futures
  • Wheat Futures
  • KC HRW Wheat Futures
  • Rough Rice Futures
  • Feeder Cattle Futures
  • SoyMeal Futures
  • Soy Meal Futures
  • SoyBeans Futures
Code
# Tickers for portfolio
TICKERS = [
    "ZC=F",  # Corn Futures
    "ZO=F",  # Wheat Futures
    "KE=F",  # KC HRW Wheat Futures
    "ZR=F",  # Rough Rice Futures
    "GF=F",  # Feeder Cattle Futures
    "ZS=F",  # SoyMeal Futures
    "ZM=F",  # Soybean Meal Futures
    "ZL=F"   # SoyBeans Futures
]

# Downloading data from Yahoo Finance using yahooquery
start_date = "2019-01-01"
end_date = datetime.today().strftime('%Y-%m-%d')

ticker_data = Ticker(TICKERS)
portfolio_prices = ticker_data.history(start=start_date, end=end_date)
C:\Users\rodri\AppData\Local\Programs\Python\PYTHON~3\Lib\site-packages\yahooquery\utils\__init__.py:1470: FutureWarning: 'S' is deprecated and will be removed in a future version. Please use 's' instead of 'S'.
C:\Users\rodri\AppData\Local\Programs\Python\PYTHON~3\Lib\site-packages\yahooquery\utils\__init__.py:1470: FutureWarning: 'S' is deprecated and will be removed in a future version. Please use 's' instead of 'S'.
C:\Users\rodri\AppData\Local\Programs\Python\PYTHON~3\Lib\site-packages\yahooquery\utils\__init__.py:1470: FutureWarning: 'S' is deprecated and will be removed in a future version. Please use 's' instead of 'S'.
C:\Users\rodri\AppData\Local\Programs\Python\PYTHON~3\Lib\site-packages\yahooquery\utils\__init__.py:1470: FutureWarning: 'S' is deprecated and will be removed in a future version. Please use 's' instead of 'S'.
C:\Users\rodri\AppData\Local\Programs\Python\PYTHON~3\Lib\site-packages\yahooquery\utils\__init__.py:1470: FutureWarning: 'S' is deprecated and will be removed in a future version. Please use 's' instead of 'S'.
C:\Users\rodri\AppData\Local\Programs\Python\PYTHON~3\Lib\site-packages\yahooquery\utils\__init__.py:1470: FutureWarning: 'S' is deprecated and will be removed in a future version. Please use 's' instead of 'S'.
C:\Users\rodri\AppData\Local\Programs\Python\PYTHON~3\Lib\site-packages\yahooquery\utils\__init__.py:1470: FutureWarning: 'S' is deprecated and will be removed in a future version. Please use 's' instead of 'S'.
C:\Users\rodri\AppData\Local\Programs\Python\PYTHON~3\Lib\site-packages\yahooquery\utils\__init__.py:1470: FutureWarning: 'S' is deprecated and will be removed in a future version. Please use 's' instead of 'S'.
Code

# Check if 'adjclose' exists in the returned data
if isinstance(portfolio_prices, pd.DataFrame) and 'adjclose' in portfolio_prices.columns:
    portfolio_prices = portfolio_prices[['adjclose']].reset_index()
else:
    raise KeyError("The data fetched does not contain 'adjclose'. Check Yahoo Finance for availability.")

# Pivot the DataFrame to have a similar format to yfinance output
portfolio_prices = portfolio_prices.pivot(index='date', columns='symbol', values='adjclose')
portfolio_prices.index = pd.to_datetime(portfolio_prices.index)
portfolio_prices.dropna(inplace=True)

# Ensure the index is properly named
portfolio_prices.index.name = "Date"

# Renaming columns for better readability
portfolio_prices.columns = [
    "corn_fut",
    "wheat_fut",
    "KCWheat_fut",
    "rice_fut",
    "Feeder_Cattle",
    "soymeal_fut",
    "soyF_fut",
    "soybeans_fut"
]

Showing the prices time series side by side: (data in level)

Code
from plotnine import ggplot, aes, geom_line, facet_wrap, labs, theme, element_text, theme_minimal, theme_void

# Ensure column name consistency
portfolio_prices_long = portfolio_prices.reset_index().melt(id_vars='Date', var_name='Commodity', value_name='Price')

# Function for visualization
def plot_with_ggplot(data, title, ylabel, background='white', fig_height=10, fig_width=10):
    # Create plot using plotnine (ggplot)
    p = (ggplot(data, aes(x='Date', y='Price', color='Commodity')) +
         geom_line() +
         facet_wrap('~Commodity', ncol=1, scales='free_y') +  # Stack plots vertically
         labs(title=title, x='Date', y=ylabel) +
         theme_minimal() +  # Set minimal theme
         theme(
             figure_size=(fig_width, fig_height),
             panel_background=element_text(fill=background),
             plot_background=element_text(fill=background),
             axis_text_x=element_text(rotation=45, hjust=1),
             subplots_adjust={'wspace': 0.25, 'hspace': 0.5}  # Adjust subplot spacing
         ))
    return p

p_prices = plot_with_ggplot(portfolio_prices_long, 'Commodity Prices Over Time', 'Price', background='white', fig_height=14, fig_width=8)
p_prices
<plotnine.ggplot.ggplot object at 0x000001CBE230AA50>

Obtain the returns time series (first feature):

\[ \mbox{Price log returns}_t = ln(p_t) - ln(p_{t-1}) \]

Code

# Calculate log returns
portfolio_log_returns = np.log(portfolio_prices / portfolio_prices.shift(1)).dropna()
portfolio_log_returns.columns = [
    "ret_corn_fut",
    "ret_wheat_fut",
    "ret_KCWheat_fut",
    "ret_rice_fut",
    "ret_Feeder_Cattle",
    "ret_soymeal_fut",
    "ret_soyF_fut",
    "ret_soybeans_fut"
]

And plot it:

Code
# Preparar os dados no formato long para os log-retornos
portfolio_log_returns_long = portfolio_log_returns.reset_index().melt(id_vars='Date', var_name='Commodity', value_name='Log Return')

def plot_log_returns_with_ggplot(data, title, ylabel, background='white', fig_height=10, fig_width=10):
    # Cria o gráfico usando plotnine (ggplot)
    p = (ggplot(data, aes(x='Date', y='Log Return', color='Commodity')) +
         geom_line() +
         facet_wrap('~Commodity', ncol=1, scales='free_y') +  # Um gráfico em cima do outro
         labs(title=title, x='Date', y=ylabel) +
         theme_minimal() +  # Define o tema minimalista com fundo branco
         theme(
             figure_size=(fig_width, fig_height),  # Ajuste da altura e largura da figura
             panel_background=element_text(fill=background),
             plot_background=element_text(fill=background),
             axis_text_x=element_text(rotation=45, hjust=1),
             subplots_adjust={'wspace': 0.25, 'hspace': 0.5}  # Ajuste do espaçamento entre os gráficos
         ))
    return p

p_log_returns = plot_log_returns_with_ggplot(portfolio_log_returns_long, 'Log Returns of Commodities Over Time', 'Log Return', background='white', fig_height=12, fig_width=8)

# Exibir o gráfico
p_log_returns
<plotnine.ggplot.ggplot object at 0x000001CBE4804250>

As risk measure, we use the conditional variances (volatilities), to deal better with day by day of the prices log-returns.

The GARCH(1,1) model with an asymmetric Student-t distribution is not directly available in most Python libraries. However, we can still use a GARCH(1,1) model with a standard Student-t distribution to estimate the conditional variance. The GARCH(1,1) model is represented as follows:

\[ r_t = \mu + \epsilon_t \]

\[ \epsilon_t = \sigma_t z_t, \quad z_t \sim t_{\nu}(0, 1) \]

\[ \sigma_t^2 = \omega + \alpha \epsilon_{t-1}^2 + \beta \sigma_{t-1}^2 \]

Where:

  • \(r_t\) is the log-return at time \(t\).
  • \(\mu\) is the mean of the returns.
  • \(\epsilon_t\) is the error term, modeled as conditional on past information.
  • \(\sigma_t^2\) is the conditional variance at time \(t\).
  • \(\omega, \alpha, \beta\) are the parameters to be estimated, with \(\omega > 0, \alpha \geq 0, \beta \geq 0\).
  • \(z_t\) follows a Student-t distribution with \(\nu\) degrees of freedom to capture the heavy tails observed in financial returns.
Code
# Initialize an empty DataFrame to store conditional variances
cond_variances = pd.DataFrame(index=portfolio_log_returns.index, columns=portfolio_log_returns.columns)

# Loop through each commodity's log-returns and fit a GARCH(1,1) model
for col in portfolio_log_returns.columns:
    # Fit a GARCH(1,1) model with a Student-t distribution for each series of log returns
    model = arch_model(portfolio_log_returns[col], vol='Garch', p=1, q=1, dist='t')
    res = model.fit(disp='off')
    
    # Extract conditional variances and store them in the DataFrame
    cond_variances[col] = res.conditional_volatility

# Show the first few rows of the conditional variances DataFrame
cond_variances.head()
            ret_corn_fut  ret_wheat_fut  ...  ret_soyF_fut  ret_soybeans_fut
Date                                     ...                                
2019-01-03      0.038268       0.042802  ...      0.087175          0.007952
2019-01-04      0.053450       0.048330  ...      0.104786          7.767961
2019-01-07      0.065066       0.045604  ...      0.111780          7.954770
2019-01-08      0.074834       0.045535  ...      0.114990          7.753064
2019-01-09      0.083385       0.045063  ...      0.116232          7.695460

[5 rows x 8 columns]

and visualizing them:

Code
# Preparar os dados no formato long para as variâncias condicionais
cond_variances_long = cond_variances.reset_index().melt(id_vars='Date', var_name='Commodity', value_name='Conditional Variance')

# Função para criar o gráfico com fundo branco ou transparente e ajustar o tamanho da figura
def plot_cond_variances_with_ggplot(data, title, ylabel, background='white', fig_height=10, fig_width=10):
    # Cria o gráfico usando plotnine (ggplot)
    p = (ggplot(data, aes(x='Date', y='Conditional Variance', color='Commodity')) +
         geom_line() +
         facet_wrap('~Commodity', ncol=1, scales='free_y') +  # Um gráfico em cima do outro
         labs(title=title, x='Date', y=ylabel) +
         theme_minimal() +  # Define o tema minimalista com fundo branco
         theme(
             figure_size=(fig_width, fig_height),  # Ajuste da altura e largura da figura
             panel_background=element_text(fill=background),
             plot_background=element_text(fill=background),
             axis_text_x=element_text(rotation=45, hjust=1),
             subplots_adjust={'wspace': 0.25, 'hspace': 0.5}  # Ajuste do espaçamento entre os gráficos
         ))
    return p

# Exemplo de uso para as variâncias condicionais das commodities
p_cond_variances = plot_cond_variances_with_ggplot(cond_variances_long, 'Conditional Variances Over Time (GARCH(1,1))', 'Conditional Variance', background='white', fig_height=12, fig_width=8)

p_cond_variances
<plotnine.ggplot.ggplot object at 0x000001CBE4262CD0>

R codes

Code
library(tidyverse)
library(dplyr)
library(ggplot2)
#library(plotly)
library(rugarch)
library(timeSeries)
library(fPortfolio)
library(quantmod)
library(caTools)
library(PerformanceAnalytics)
library(MASS)
library(PortfolioAnalytics)
library(ROI)
require(ROI.plugin.glpk)
require(ROI.plugin.quadprog)
library(quadprog)
library(corpcor)
library(DEoptim)
library(cowplot) # devtools::install_github("wilkelab/cowplot/")
library(lattice)
library(timetk)

Loading time series data, for portfolio setting…

Code
tickers <- c(
         "ZC=F", # Corn Futures
         "ZO=F", # Wheat Futures
         "KE=F", # Futuros KC HRW Wheat Futures
         "ZR=F", # Rough Rice Futures
         "GF=F", # Feeder Cattle Futures
         "ZS=F", # SoyMeal Futures 
         "ZM=F", # Futuros farelo soja
         "ZL=F"  # SoyBeans Futures
)

Obtain daily prices and their returns:

Code
portfolioPrices <- NULL
  for ( Ticker in tickers )
    portfolioPrices <- cbind(
      portfolioPrices, 
      getSymbols.yahoo(
        Ticker,
        from = "2019-01-01",
        auto.assign = FALSE
      )[,4]
    )

portfolioPrices <- portfolioPrices[apply(portfolioPrices, 1, function(x) all(!is.na(x))),]

colnames(portfolioPrices) <- c(
  "corn_fut",
  "wheat_fut",
  "KCWheat_fut",
  "rice_fut",
  "Feeder_Cattle",
  "soymeal_fut",
  "soyF_fut",
  "soybeans_fut"
)

tail(portfolioPrices)
           corn_fut wheat_fut KCWheat_fut rice_fut Feeder_Cattle soymeal_fut
2025-02-28   453.50    360.25      558.25   1328.5       274.975     1011.50
2025-03-03   440.25    353.50      547.50   1334.5       274.025      998.25
2025-03-04   436.00    374.75      534.00   1328.0       273.850      984.00
2025-03-05   440.25    370.00      542.25   1306.0       276.100      997.75
2025-03-06   449.50    366.25      551.50   1289.0       274.025     1014.00
2025-03-07   455.25    360.50      551.25   1324.5       276.975     1010.25
           soyF_fut soybeans_fut
2025-02-28    291.7        43.53
2025-03-03    290.1        42.90
2025-03-04    285.9        42.27
2025-03-05    293.0        42.44
2025-03-06    297.1        42.60
2025-03-07    296.5        42.87

Plotting the time series prices (in level):

Code
portfolioPrices |> as.data.frame() |>
  mutate(
    time = seq_along( corn_fut )
  ) |>
  pivot_longer(
    !time,
    names_to = "Variables",
    values_to = "Value"  
      ) |>
  group_by(Variables) |>
  plot_time_series(
    time,
    Value,
    .interactive = F, # Change for TRUE for better visualization
    .facet_ncol = 2,
    .smooth = FALSE
  ) +
  theme(
    strip.background = element_rect(fill = "white", colour = "white")
  )

Obtain the returns time series (first feature):

\[ \mbox{Price log returns}_t = ln(p_t) - ln(p_{t-1}) \]

Code
# Calculate log returns for the portfolio prices
portfolioReturs <- na.omit(diff(log(portfolioPrices))) |> as.data.frame()

colnames(portfolioReturs) <- c(
  "ret_corn_fut",
  "ret_wheat_fut",
  "ret_KCWheat_fut",
  "ret_rice_fut",
  "ret_Feeder_Cattle",
  "ret_soymeal_fut",
  "ret_soyF_fut",
  "ret_soybeans_fut"
)

glimpse(portfolioReturs)
Rows: 1,554
Columns: 8
$ ret_corn_fut      <dbl> 0.0105891128, 0.0085218477, -0.0019601444, -0.005903…
$ ret_wheat_fut     <dbl> 0.0008980692, 0.0053715438, -0.0017873106, 0.0115608…
$ ret_KCWheat_fut   <dbl> 0.0220892515, 0.0049529571, -0.0059464992, 0.0039682…
$ ret_rice_fut      <dbl> 0.0083930387, 0.0053934919, 0.0174507579, 0.00813596…
$ ret_Feeder_Cattle <dbl> -0.0096783375, -0.0111522135, 0.0075628145, 0.011068…
$ ret_soymeal_fut   <dbl> 0.0061281529, 0.0102224954, 0.0030190774, -0.0065988…
$ ret_soyF_fut      <dbl> 0.0054513913, 0.0076457646, 0.0097900861, -0.0018874…
$ ret_soybeans_fut  <dbl> 0.0099858421, 0.0081286732, -0.0052938051, -0.002834…
Code
#portfolioReturs <- as.timeSeries(portfolioReturs)

Plot all time series and their returns:

Code
portfolioReturs |> 
  mutate(
    time = seq_along( ret_corn_fut )
  ) |>
  pivot_longer(
    !time,
    names_to = "Variables",
    values_to = "Value"  
      ) |>
  group_by(Variables) |>
  plot_time_series(
    time,
    Value,
    .interactive = F, # Change for TRUE for better visualization
    .facet_ncol = 2,
    .smooth = FALSE
  ) +
  theme(
    strip.background = element_rect(fill = "white", colour = "white")
  )

Plotting the histograms:

Code
portfolioPrices_df <- as_tibble(portfolioPrices, rownames = "date")
portfolioPrices_df$date <- ymd(portfolioPrices_df$date)

portfolioReturs_df <- na.omit( ROC( portfolioPrices ), type = "discrete" ) |>
  as_tibble(rownames = "date")
portfolioReturs_df$date <- ymd(portfolioReturs_df$date)
colnames(portfolioReturs_df) <- c(
  "date",
  "ret_corn_fut",
  "ret_wheat_fut",
  "ret_KCWheat_fut",
  "ret_rice_fut",
  "ret_Feeder_Cattle",
  "ret_soymeal_fut",
  "ret_soyF_fut",
  "ret_soybeans_fut"
)

# Remover a coluna com nome NA
portfolioReturs_df <- portfolioReturs_df[, !is.na(colnames(portfolioReturs_df))]

# Verificar novamente os nomes das colunas para garantir que estão corretos
colnames(portfolioReturs_df)
[1] "date"              "ret_corn_fut"      "ret_wheat_fut"    
[4] "ret_KCWheat_fut"   "ret_rice_fut"      "ret_Feeder_Cattle"
[7] "ret_soymeal_fut"   "ret_soyF_fut"      "ret_soybeans_fut" 
Code
portfolioReturs_long <- portfolioReturs_df |> 
  pivot_longer(
    cols = -date, # Exclui a coluna de data
    names_to = "fut_type", 
    values_to = "returns"
  )

ggplot(portfolioReturs_long, aes(x = returns)) + 
  geom_histogram(aes(y = ..density..), binwidth = .01, color = "black", fill = "white") +
  geom_density(alpha = .2, fill="lightgray") +
  theme_minimal() +
  theme(
    axis.line  = element_line(colour = "black"),
    axis.text  = element_text(colour = "black"),  
    axis.ticks = element_line(colour = "black"), 
    legend.position = c(.1,.9), 
    panel.grid.major = element_blank(), 
    panel.grid.minor = element_blank()
  ) +
  theme(plot.title   = element_text(size = 10),  
        axis.title.x = element_text(size = 7), 
        axis.title.y = element_text(size = 7)) + 
  labs(x = "Returns", y = "Density") +
  facet_wrap(~fut_type, scales = "free", ncol = 2) 

And finnaly, the last feature, is called, the conditional variance (risk measure), obtained by GARCH(1,1) model, formalized as:

The GARCH(1,1) model with asymmetric Student-t distribution can be represented mathematically as:

\[ r_t = \mu + \epsilon_t \]

\[ \epsilon_t = \sigma_t z_t, \quad z_t \sim t_{\nu}(0, 1) \]

\[ \sigma_t^2 = \omega + \alpha \epsilon_{t-1}^2 + \beta \sigma_{t-1}^2 \]

Where:

  • \(r_t\) is the return at time \(t\).
  • \(\mu\) is the mean of the returns.
  • \(\epsilon_t\) is the error term, modeled as conditional on past information.
  • \(\sigma_t^2\) is the conditional variance at time \(t\).
  • \(\omega, \alpha, \beta\) are the parameters to be estimated, with \(\omega > 0, \alpha \geq 0, \beta \geq 0\).
  • \(z_t\) follows an asymmetric Student-t distribution with \(\nu\) degrees of freedom to better capture the heavy tails and skewness observed in financial returns.
Code
# Load necessary packages
library(rugarch)

# Define the GARCH(1,1) model specification with Student-t distribution
spec <- ugarchspec(
  variance.model = list(model = "sGARCH", garchOrder = c(1, 1)),
  mean.model = list(armaOrder = c(0, 0), include.mean = TRUE),
  distribution.model = "std" # Using Student-t distribution
)

# Estimate the model for each asset in the portfolio and extract conditional variances
garch_models <- list()
conditional_variances <- list()

for (i in colnames(portfolioReturs)) {
  garch_models[[i]] <- ugarchfit(spec, data = portfolioReturs[[i]])
  conditional_variances[[i]] <- sigma(garch_models[[i]])^2
}

# Convert conditional variances list to a data frame
conditional_variances_df <- do.call(cbind, conditional_variances) %>%
  as.data.frame() %>%
  mutate(time = seq_along(conditional_variances[[1]]))

colnames(conditional_variances_df) <- c(
  "cond_var_corn_fut",
  "cond_var_wheat_fut",
  "cond_var_KCWheat_fut",
  "cond_var_rice_fut",
  "cond_var_Feeder_Cattle",
  "cond_var_soymeal_fut",
  "cond_var_soyF_fut",
  "cond_var_soybeans_fut",
  "time"
)

# Reshape data for plotting
conditional_variances_long <- conditional_variances_df %>%
  pivot_longer(!time, names_to = "Variables", values_to = "Value")

And the plot of the conditional variance (risk):

Code
conditional_variances_long |> 
  group_by(Variables) |>
  plot_time_series(
    time,
    Value,
    .interactive = F, # Change for TRUE for better visualization
    .facet_ncol = 2,
    .smooth = FALSE
  ) +
  theme(
    strip.background = element_rect(fill = "white", colour = "white")
  )

 

 


References

Gujarati, D., N. (2004) Basic Econometrics, fourth edition, The McGraw−Hill Companies

Hair, J. F., Black, W. C., Babin, B. J., & Anderson, R. E. (2019). Multivariate Data Analysis. Pearson.

Hyndman, R.J., & Athanasopoulos, G. (2021) Forecasting: principles and practice, 3rd edition, OTexts: Melbourne, Australia. OTexts.com/fpp3. Accessed on oct 2023.

 

 


Code
# Total timing to compile this Quarto document

end_time = datetime.now()
time_diff = end_time - start_time

print(f"Total Quarto document compiling time: {time_diff}")
Total Quarto document compiling time: 0:00:56.102477