106 how to calculate the cov mat for recent days but not all days cov…

… mats and select recent ones (#179) * iewma to handle nans * funcionality to only solve at some time steps * minor change * edits to handling nan * changed code file in readme file * updated nan implementation
cvxgrp · Mar 25, 2024 · 2f0ed1f · 2f0ed1f
1 parent 7e9011f
commit 2f0ed1f
Show file tree

Hide file tree

Showing 9 changed files with 1,820 additions and 504 deletions.
diff --git a/README.md b/README.md
@@ -71,7 +71,7 @@ import pandas as pd
 from cvx.covariance.combination import from_ewmas
 
 # Load return data
-returns = pd.read_csv("data/ff5.csv", index_col=0, header=0, parse_dates=True).iloc[:1000]
+returns = pd.read_csv("data/ff5_no_rf.csv", index_col=0, header=0, parse_dates=True).iloc[:1000]
 n = returns.shape[1]
 
 # Define half-life pairs for K=3 experts, (halflife_vola, halflife_cov)
@@ -107,7 +107,7 @@ import pandas as pd
 from cvx.covariance.combination import from_sigmas
 
 # Load return data
-returns = pd.read_csv("data/ff5.csv", index_col=0,
+returns = pd.read_csv("data/ff5_no_rf.csv", index_col=0,
                       header=0, parse_dates=True).iloc[:1000]
 n = returns.shape[1]
 

diff --git a/cvx/covariance/combination.py b/cvx/covariance/combination.py
@@ -218,15 +218,16 @@ def assets(self):
         """
         return self.returns.columns
 
-    def solve(self, window=None, **kwargs):
+    def solve(self, window=None, times=None, **kwargs):
         """
         The size of the window is crucial to specify the size of the parameters
         for the cvxpy problem. Hence those computations are not in the __init__ method
 
-        Solves the covariance combination problem at a given time, i.e.,
-        finds the prediction for the covariance matrix at 'time+1'
+        Solves the covariance combination problem at a time steps given in times
 
-        param window: number of previous time steps to use in the covariance combination
+        param window: number of previous time steps to use in the covariance
+        combination problem
+        param times: list of time steps to solve the problem at; if None, solve at all available time steps
         """
         # If window is None, use all available data; cap window at length of data
         window = window or len(self.__Ls_shifted)
@@ -240,9 +241,29 @@ def solve(self, window=None, **kwargs):
             }
         )
 
-        Bs = {time: np.column_stack(Lts_at_r.loc[time]) for time in Lts_at_r.index}
+        # time steps to solve the problem at
+        all_available_times = Lts_at_r.index
+
+        if times is None:
+            times = Lts_at_r.index
+        else:
+            times = list(times)
+
+            # assert times are consecutive in all_available_times
+            zero_index = all_available_times.get_loc(times[0])
+            assert list(
+                all_available_times[zero_index : zero_index + len(times)]
+            ) == list(times)
+            "times must be consecutive"
+
+            # add window - 1 previous time steps to times
+            times = (
+                list(all_available_times[zero_index - window + 1 : zero_index]) + times
+            )
+
+        Bs = {time: np.column_stack(Lts_at_r.loc[time]) for time in times}
         prod_Bs = pd.Series({time: B.T @ B for time, B in Bs.items()})
-        times = prod_Bs.index
+        # times = prod_Bs.index
         P = {
             times[i]: sum(prod_Bs.loc[times[i - window + 1] : times[i]])
             for i in range(window - 1, len(times))
@@ -282,9 +303,6 @@ def _solve(self, time, problem, **kwargs):
         """
         problem.solve(**kwargs)
 
-        # if problem.status != "optimal":
-        #     raise cvx.SolverError
-
         weights = problem.weights
 
         # Get non-shifted L

diff --git a/cvx/covariance/ewma.py b/cvx/covariance/ewma.py
@@ -64,7 +64,11 @@ def center(returns, halflife, min_periods=0, mean_adj=False):
     """
     if mean_adj:
         mean = _generator2frame(
-            _ewma_mean(data=returns, halflife=halflife, min_periods=min_periods)
+            _ewma_mean(
+                data=returns,
+                halflife=halflife,
+                min_periods=min_periods,
+            )
         )
         return (returns - mean).dropna(how="all", axis=0), mean
     else:
@@ -94,7 +98,25 @@ def iterated_ewma(
     mu_halflife1=None,
     mu_halflife2=None,
     clip_at=None,
+    nan_to_num=True,
 ):
+    """
+    param returns: pandas dataframe with returns for each asset
+    param vola_halflife: half life for volatility
+    param cov_halflife: half life for covariance
+    param min_preiods_vola: minimum number of periods to use for volatility
+    ewma estimate
+    param min_periods_cov: minimum number of periods to use for covariance
+    ewma estimate
+    param mean: whether to estimate mean; if False, assumes zero mean data
+    param mu_halflife1: half life for the first mean adjustment; if None, it is
+    set to vola_halflife; only applies if mean=True
+    param mu_halflife2: half life for the second mean adjustment; if None, it is
+    set to cov_halflife; only applies if mean=True
+    param clip_at: winsorizes ewma update at +-clip_at*(current ewma) in ewma;
+    if None, no winsorization is performed
+    nan_to_num: if True, replace NaNs in returns with 0.0
+    """
     mu_halflife1 = mu_halflife1 or vola_halflife
     mu_halflife2 = mu_halflife2 or cov_halflife
 
@@ -104,7 +126,13 @@ def scale_cov(vola, matrix):
         matrix = matrix.values
 
         # Convert (covariance) matrix to correlation matrix
-        v = 1 / np.sqrt(np.diagonal(matrix)).reshape(-1, 1)
+        # v = 1 / np.sqrt(np.diagonal(matrix)).reshape(-1, 1)
+        diag = np.diagonal(matrix).copy()
+        one_inds = np.where(diag == 0)[0]
+        if one_inds.size > 0:
+            diag[one_inds] = 1  # temporarily, to avoid division by zero
+        v = 1 / np.sqrt(diag).reshape(-1, 1)
+        v[one_inds] = np.nan
         matrix = v * matrix * v.T
 
         cov = vola.reshape(-1, 1) * matrix * vola.reshape(1, -1)
@@ -114,14 +142,18 @@ def scale_cov(vola, matrix):
     def scale_mean(vola, vec1, vec2):
         return vec1 + vola * vec2
 
+    if nan_to_num:
+        if returns.isna().any().any():
+            returns = returns.fillna(0.0)
+
     # compute the moving mean of the returns
 
-    # TODO: Check if this is correct half life
     returns, returns_mean = center(
         returns=returns, halflife=mu_halflife1, min_periods=0, mean_adj=mean
     )
 
-    # estimate the volatility, clip some returns before they enter the estimation
+    # estimate the volatility, clip some returns before they enter the
+    # estimation
     vola = volatility(
         returns=returns,
         halflife=vola_halflife,
@@ -130,27 +162,26 @@ def scale_mean(vola, vec1, vec2):
     )
 
     # adj the returns
-    adj = clip((returns / vola), clip_at=clip_at)
-
-    # center the adj returns again? Yes, I think so
-    # TODO: Check if this is correct half life
+    # make sure adj is NaN where vola is zero or returns are
+    # NaN. This handles NaNs (which in some sense is the same as a constant
+    # return series, i.e., vola = 0)
+    adj = clip((returns / vola), clip_at=clip_at)  # if vola is zero, adj is NaN
+    adj = adj.fillna(0.0)
 
+    # center the adj returns again
     adj, adj_mean = center(adj, halflife=mu_halflife2, min_periods=0, mean_adj=mean)
-    # if mean:
-    #     print(adj)
-    #     print(adj_mean)
-    #     assert False
-
-    m = pd.Series(np.zeros_like(returns.shape[1]), index=returns.columns)
 
     for t, matrix in _ewma_cov(
-        data=adj, halflife=cov_halflife, min_periods=min_periods_cov
+        data=adj,
+        halflife=cov_halflife,
+        min_periods=min_periods_cov,
     ):
         if mean:
             m = scale_mean(
                 vola=vola.loc[t].values, vec1=returns_mean.loc[t], vec2=adj_mean.loc[t]
             )
-
+        else:
+            m = pd.Series(np.zeros_like(returns.shape[1]), index=returns.columns)
         yield IEWMA(
             time=t,
             mean=m,
@@ -164,6 +195,7 @@ def _ewma_cov(data, halflife, min_periods=0):
     param data: Txn pandas DataFrame of returns
     param halflife: float, halflife of the EWMA
     """
+
     for t, ewma in _general(
         data.values,
         times=data.index,
@@ -191,8 +223,6 @@ def _ewma_mean(data, halflife, min_periods=0, clip_at=None):
 def _general(
     y,
     times,
-    # com: Union[float, None] = None,
-    # span: Union[float, None] = None,
     halflife: float | TimedeltaConvertibleTypes | None = None,
     alpha: float | None = None,
     fct=lambda x: x,
@@ -202,12 +232,12 @@ def _general(
     """
     y: frame with measurements for times t=t_1,t_2,...,T
     halflife: EWMA half life
+    alpha: EWMA alpha
+    fct: function to apply to y
+    min_periods: minimum number of observations to start EWMA
+    clip_at: clip y_last at  +- clip_at*EWMA (optional)
 
-    returns: list of EWMAs for times t=t_1,t_2,...,T
-    serving as predictions for the following timestamp
-
-    The function returns a generator over
-    t_i, EWMA of fct(y_i)
+    returns: a generator over (t_i, EWMA of fct(y_i))
     """
 
     def f(k):
@@ -216,12 +246,6 @@ def f(k):
 
         return times[k], _ewma
 
-    # if com:
-    #    alpha = 1 / (1 + com)
-
-    # if span:
-    #    alpha = 2 / (span + 1)
-
     if halflife:
         alpha = 1 - np.exp(-np.log(2) / halflife)
 
@@ -233,13 +257,17 @@ def f(k):
 
     # iterate through all the remaining rows of y. Start in the 2nd row
     for n, row in enumerate(y[1:], start=1):
+        # replace NaNs in row with non-NaN values from _ewma and vice versa
+
+        next_val = fct(row)
+
         # update the moving average
         if clip_at and n >= min_periods + 1:
             _ewma = _ewma + (1 - beta) * (
-                np.clip(fct(row), -clip_at * _ewma, clip_at * _ewma) - _ewma
+                np.clip(next_val, -clip_at * _ewma, clip_at * _ewma) - _ewma
             ) / (1 - np.power(beta, n + 1))
         else:
-            _ewma = _ewma + (1 - beta) * (fct(row) - _ewma) / (
+            _ewma = _ewma + (1 - beta) * (next_val - _ewma) / (
                 1 - np.power(beta, n + 1)
             )