Retrieving the Data
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
pd.core.common.is_list_like = pd.api.types.is_list_like
import seaborn as sns
from sklearn.model_selection import cross_val_score, train_test_split
from scipy import stats
%matplotlib inline
In [2]:
non_daily_input_list = ['STLFSI', 'TB3MS', 'WM2NS', 'WM1NS', 'USROE', 'UNRATE', 'TOTLL', 'TOTCI',
                        'TOTBKCR', 'TERMCBAUTO48NS', 'RECPROUSM156N', 'PSAVERT', 'PCE', 'PAYNSA', 'INDPRO',
                        'CPIAUCSL', 'CSUSHPINSA', 'GDP', 'Bill_Gross_Issues', 'Bill_Net', 'Notes_Gross_Issues',
                        'Notes_Net', 'Bonds_Gross_Issues', 'Bonds_Net', 'TOTAL_Gross_Issues', 'Total_Net_Cash Raised',
                        '10-year issuance', 'Investment Grade', 'High Yield', 'Total', 'Economic Policy Uncertainty',
                        'Monetary policy', 'Fiscal Policy', 'Health care', 'National security', 'Entitlement programs',
                        'Regulation', 'Financial Regulation', 'Trade policy', 'Sovereign debt']
In [3]:
df_tot = pd.read_csv('dataset_iaqf.csv', index_col=0)
In [4]:
df_tot.index = pd.to_datetime(df_tot.index)
Data Preprocessing
Some necessary functions
In [5]:
def create_Xt_Yt(X, y, percentage1=0.7, percentage2=0.15):
    # chronological 70/15/15 train/dev/test split (no shuffling), as required for time series
    X_train = X.iloc[0:int(X.shape[0] * percentage1), :]
    Y_train = y.iloc[0:int(y.shape[0] * percentage1)]
    X_dev = X.iloc[int(X.shape[0]*percentage1):int(X.shape[0]*(percentage1+percentage2)), :]
    Y_dev = y.iloc[int(y.shape[0]*percentage1):int(y.shape[0]*(percentage1+percentage2))]
    X_test = X.iloc[int(X.shape[0]*(percentage1+percentage2)):, :]
    Y_test = y.iloc[int(y.shape[0] * (percentage1+percentage2)):]
    return X_train, X_dev, X_test, Y_train, Y_dev, Y_test
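A quick way to sanity-check the split proportions is a smoke test on synthetic data (a minimal sketch; the demo frame and series below are illustrative, not from the notebook):
# hypothetical smoke test for create_Xt_Yt
demo = pd.DataFrame(np.random.randn(100, 3))
target = pd.Series(np.random.randn(100))
parts = create_Xt_Yt(demo, target)
print([p.shape[0] for p in parts])  # expect [70, 15, 15, 70, 15, 15]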
In [6]:
from sklearn.metrics import mean_squared_error
def plot_result(y_pred, y_test, modelName):
    # plot the results
    fig = plt.figure(figsize=(8, 4))
    index = y_test.index
    plt.plot(index, y_test, '.r-', label='true')
    plt.plot(index, y_pred, '.b-', label='predict')
    plt.title('Credit Spread Prediction by ' + modelName, fontsize=20)
    plt.legend(loc='upper right')
    plt.xlabel('Date', fontsize=15)
    plt.ylabel('Credit Spread', fontsize=15)
    print(modelName + ' MSE:' + str(mean_squared_error(y_pred, y_test)))
In [7]:
from scipy.stats import kurtosis,skew,norm
def normFitPlot(dataset):
    dataset = np.squeeze(dataset)
    sns.set_context("paper", rc={"font.size": 30, "axes.titlesize": 30, "axes.labelsize": 20})
    sns.distplot(dataset, fit=norm);
    # Get the fitted parameters used by the function
    (mu, sigma) = norm.fit(dataset)
    kur = kurtosis(dataset)
    sk = skew(dataset)
    print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
    print('\n kurtosis = {:.2f} and skew = {:.2f}\n'.format(kur, sk))
    # Now plot the distribution
    plt.legend(['Distribution $\\mu=$ {:.2f} and $\\sigma=$ {:.2f} \n kurtosis = {:.2f} and skew = {:.2f}\n'.format(mu, sigma, kur, sk)],
               loc='best', fontsize=10)
    plt.ylabel('Frequency', fontsize=20)
    plt.title('target distribution', fontsize=30)
    # Also get the QQ-plot
    fig = plt.figure()
    res = stats.probplot(dataset, plot=plt)
    plt.show()
Handle Missing Data
In [8]:
non_daily_column = non_daily_input_list
daily_column = df_tot.columns.difference(non_daily_column)
In [9]:
df_tot_nondaily = df_tot[non_daily_column]
df_tot_daily = df_tot[daily_column]
In [10]:
# Delete data on non-trading date
df_tot_daily = df_tot_daily[~df_tot_daily.SPY_High.isnull()]
df_tot_daily = df_tot_daily[~df_tot_daily.DGS10.isnull()]
df_tot_daily = df_tot_daily[~df_tot_daily.BAMLC0A1CAAAEY.isnull()]
In [11]:
df_tot_daily.isnull().sum()[df_tot_daily.isnull().sum() > 0].sort_values(ascending=True).head()
Out[11]:
^VIX_ROC    1
SPY_TR      1
DPRIME      1
SPY_ROC     1
^VIX_TR     1
dtype: int64
In [12]:
# Simply use a forward fill for the remaining missing data
for i in df_tot_daily:
    df_tot_daily[i] = df_tot_daily[i].fillna(method='ffill')
In [13]:
df_tot = df_tot_daily.join(df_tot_nondaily, how='outer')
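Note that fillna(method='ffill') is deprecated in recent pandas releases; an equivalent, vectorized version of the loop above (a minimal sketch, assuming a current pandas version) is:
# forward-fill every daily column at once; DataFrame.ffill() is the modern spelling
df_tot_daily = df_tot_daily.ffill()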
Data Transformation
In [14]:
feature_col = df_tot.columns.difference(['AAA10Y'])
In [15]:
# Transform our features which are skewed
skewed_feats = df_tot[feature_col].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewDF = pd.DataFrame({'Skew': skewed_feats})
skewDF.head()
Out[15]:
                        Skew
^VIX_bl            65.095892
meltdown           14.353858
investment-grade    8.330854
bond broker         7.442682
sell-off            6.761579
In [16]:
skewness = skewDF
skewness = skewness.dropna()
print("There are {} skewed features to Box Cox transform".format(skewness.shape[0]))
skewed_features = skewness.index
lam = 0  # Box-Cox with lambda = 0 reduces to a log transform
df_tot_log = df_tot.copy()
for feat in skewed_features:
    # signed log1p: log(1 + |x|) with the original sign, so negative values are handled
    df_tot_log[feat] = np.log(np.abs(df_tot[feat]) + 1) * np.sign(df_tot[feat])
print("Transform Finished!")
There are 160 skewed features to Box Cox transform
Transform Finished!
F:\Installation\Anaconda3\lib\site-packages\ipykernel_launcher.py:8: RuntimeWarning: invalid value encountered in sign
In [17]:
skewed_feats = df_tot_log.apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewDF = pd.DataFrame({'Skew': skewed_feats})
skewDF.head()
Out[17]:
                      Skew
investment-grade  7.104952
^VIX_bl           6.629973
bond broker       5.482865
sell-off          3.747732
TEDRATE           2.253001
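The transform above is a signed log rather than a fitted Box-Cox; if a true Box-Cox transform were wanted for the strictly positive features, a minimal sketch (using scipy's boxcox1p and boxcox_normmax, an assumption not in the original notebook) would be:
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# hypothetical alternative: fit lambda per positive-valued feature instead of fixing lam = 0
for feat in skewed_features:
    col = df_tot[feat].dropna()
    if (col > 0).all():
        lam_hat = boxcox_normmax(col + 1)                    # estimate lambda on 1 + x
        df_tot_log[feat] = boxcox1p(df_tot[feat], lam_hat)   # Box-Cox transform of 1 + x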
Stationarity check
In [18]:
from statsmodels.tsa.stattools import adfuller
pvalue_dic_daily = {}
stationary_daily = []
non_stationary_daily = []
for i in daily_column.difference(['AAA10Y']):
    cur = df_tot_log[i].dropna()
    p = adfuller(cur)[1]
    pvalue_dic_daily[i] = p
    if p > 0.05:
        non_stationary_daily.append(i)
    else:
        stationary_daily.append(i)
In [19]:
d_daily = {}
stationary_daily = stationary_daily + ['AAA10Y']
df_tot_station = df_tot_log[stationary_daily]
for i in non_stationary_daily:
    cur = df_tot_log[i].dropna()
    p = adfuller(cur)[1]
    j = 0
    while p >= 0.05:
        cur = cur.diff(1).fillna(0)
        p = adfuller(cur)[1]
        j = j + 1
    df_tot_station = df_tot_station.join(cur, how='outer')
    d_daily[i] = j
F:\Installation\Anaconda3\lib\site-packages\statsmodels\regression\linear_model.py:867: RuntimeWarning: divide by zero encountered in log
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
F:\Installation\Anaconda3\lib\site-packages\statsmodels\base\model.py:1294: RuntimeWarning: invalid value encountered in true_divide
  return self.params / self.bse
F:\Installation\Anaconda3\lib\site-packages\scipy\stats\_distn_infrastructure.py:903: RuntimeWarning: invalid value encountered in greater
  return (a < x) & (x < b)
F:\Installation\Anaconda3\lib\site-packages\scipy\stats\_distn_infrastructure.py:903: RuntimeWarning: invalid value encountered in less
  return (a < x) & (x < b)
F:\Installation\Anaconda3\lib\site-packages\scipy\stats\_distn_infrastructure.py:1827: RuntimeWarning: invalid value encountered in greater_equal
  cond2 = (x >= np.asarray(_b)) & cond0
In [20]:
# check how many times we differenced each daily feature
d_daily
Out[20]:
{'BAMLC0A1CAAAEY': 1, 'BAMLH0A0HYM2': 1, 'DAAA': 1, 'DCOILWTICO': 1,
 'DEXUSEU': 1, 'DFF': 1, 'DGS1': 1, 'DGS10': 1, 'DGS2': 1, 'DGS3': 1,
 'DGS3MO': 1, 'DGS5': 1, 'DGS6MO': 1, 'DPRIME': 1, 'DTWEXB': 1, 'DTWEXM': 1,
 'Federal Reserve': 1, 'GDPC1': 1, 'GOLDAMGBD228NLBM': 1, 'Investment': 1,
 'Momentum': 1, 'SPY_Adj Close': 1, 'SPY_Close': 1, 'SPY_High': 1, 'SPY_Low': 1,
 'SPY_MA10': 1, 'SPY_MA20': 1, 'SPY_MA5': 1, 'SPY_NormalizedPrice': 1,
 'SPY_Open': 1, 'SPY_TR': 1, 'SPY_Volume': 1, 'T10Y2Y': 1, 'T10YFF': 1,
 'USD1MTD156N': 1, 'USD3MTD156N': 1, 'Y10Y5': 1, 'Y3Y2': 1, 'Y5Y3': 1,
 'bidder': 1, 'bond broker': 1, 'buy bond': 1, 'corporate bond': 1, 'crash': 1,
 'credit cycle': 1, 'credit rating': 1, 'credit spread': 1, 'debt fund': 1,
 'default': 1, 'downgrade': 1, 'funding cost': 1, 'interest expense': 1,
 'interest penalty': 1, 'investment-grade': 1, 'loan': 1, 'refinance': 1,
 'rf': 1, 'sell-off': 1, 'stock': 1, 'stock market': 1, 'unemployment': 1,
 'volatility': 1}
In [21]:
pvalue_dic_nd = {}
stationary_nd = []
non_stationary_nd = []
for i in non_daily_column:
    try:
        cur = df_tot_log[i].dropna()
        p = adfuller(cur)[1]
    except:
        print(adfuller(cur))
    pvalue_dic_nd[i] = p
    if p > 0.05:
        non_stationary_nd.append(i)
    else:
        stationary_nd.append(i)
In [22]:
d_nd = {}
for i in non_stationary_nd:
    cur = df_tot_log[i].dropna()
    p = adfuller(cur)[1]
    j = 0
    while p >= 0.05:
        cur = cur.diff(1).fillna(0)
        p = adfuller(cur)[1]
        j = j + 1
    df_tot_station = df_tot_station.join(cur, how='outer')
    d_nd[i] = j
df_tot_station = df_tot_station.join(df_tot_log[stationary_nd], how='outer')
In [23]:
d_nd
Out[23]:
{'STLFSI': 1, 'TB3MS': 2, 'WM2NS': 1, 'WM1NS': 1, 'USROE': 1, 'UNRATE': 2,
 'TOTLL': 1, 'TOTCI': 1, 'TOTBKCR': 1, 'TERMCBAUTO48NS': 2,
 'RECPROUSM156N': 1, 'PSAVERT': 1, 'PCE': 1, 'PAYNSA': 1, 'INDPRO': 1,
 'CPIAUCSL': 1, 'CSUSHPINSA': 2, 'GDP': 1, 'Bill_Gross_Issues': 1,
 'Notes_Gross_Issues': 1, 'Notes_Net': 1, 'Bonds_Gross_Issues': 1,
 'Bonds_Net': 1, 'TOTAL_Gross_Issues': 1, 'Total_Net_Cash Raised': 1,
 '10-year issuance': 1, 'Investment Grade': 1, 'High Yield': 1, 'Total': 1,
 'Economic Policy Uncertainty': 1, 'Health care': 1,
 'Entitlement programs': 1, 'Regulation': 1, 'Financial Regulation': 1,
 'Trade policy': 1}
Most features became stationary after one or two rounds of differencing.
In [24]:
df_tot_station = df_tot_station[~df_tot_station.SPY_High.isnull()]
df_tot_station = df_tot_station[~df_tot_station.DGS10.isnull()]
df_tot_station = df_tot_station[~df_tot_station.BAMLC0A1CAAAEY.isnull()]
In [25]:
for i in df_tot_station.columns:
    df_tot_station[i] = df_tot_station[i].fillna(method='ffill')
df_tot_station.dropna(inplace=True)
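As a final check that the differencing loop did its job, one could re-run the ADF test over every column of df_tot_station (a minimal sketch, not part of the original run; the 0.05 threshold matches the one used above):
# re-test each transformed series; all p-values should now fall below 0.05
remaining = {col: adfuller(df_tot_station[col].dropna())[1]
             for col in df_tot_station.columns.difference(['AAA10Y'])}
still_nonstationary = {k: v for k, v in remaining.items() if v >= 0.05}
print('columns still non-stationary:', len(still_nonstationary))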
Target and input separation
In [26]:
y = df_tot_station['AAA10Y'].shift(-1)
X = df_tot_station.drop(['AAA10Y'], axis=1)
In [27]:
X['y_lag1'] = df_tot_station['AAA10Y']  # today's spread as a lagged feature
X = X[:-1]
y = y[:-1]
y.name = 'Credit Spread'
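The shift makes this a one-day-ahead problem: each row of X (including y_lag1, today's spread) is paired with tomorrow's AAA10Y. A quick sanity check (illustrative, not in the original notebook):
# X and y must share the same index, and y should lead y_lag1 by one observation
assert (X.index == y.index).all()
print(X['y_lag1'].iloc[1], '->', y.iloc[0])  # y[t] equals y_lag1[t+1]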
Hidden Markov Model to get market regimes
In [28]:
from hmmlearn.hmm import GaussianHMM
remodel1 = GaussianHMM(n_components=3, covariance_type="full", n_iter=1000, algorithm='map')
remodel1.fit(np.array(y).reshape(-1, 1))
Z2 = remodel1.predict(np.array(y).reshape(-1, 1))
In [29]:
X['market regime_cs'] = pd.DataFrame(Z2, index=X.index).astype('category')
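To interpret the three hidden states, one can read the fitted Gaussian parameters off the model (a minimal sketch using hmmlearn's standard means_ and covars_ attributes):
# per-regime mean and standard deviation of the credit spread
for k in range(remodel1.n_components):
    print('regime {}: mean = {:.2f}, std = {:.2f}'.format(
        k, remodel1.means_[k, 0], np.sqrt(remodel1.covars_[k][0, 0])))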
In [30]:
X_train,X_dev,X_test,y_train,y_dev,y_test = create_Xt_Yt(X,y,0.7,0.15)
In [31]:
# take a look at the dataset
plt.figure(figsize = (4,3))
for i in range(remodel1.n_components):
    pos = (X_train['market regime_cs'] == i)
    normFitPlot(y_train[pos])
mu = 1.67 and sigma = 0.11
kurtosis = 0.11 and skew = -0.76
mu = 0.86 and sigma = 0.14
kurtosis = 0.64 and skew = 0.99
mu = 2.08 and sigma = 0.25
kurtosis = 0.83 and skew = 1.37
Data Exploration
In [32]:
# take a look at the dataset
plt.figure(figsize=(18, 9))
for i in range(remodel1.n_components):
    pos = (X_train['market regime_cs'] == i)
    plt.plot(X_train.index[pos], y_train[pos], '.', label='hidden market regime {}'.format(i), lw=1)
plt.legend(loc='best')
plt.xlabel('Date', fontsize=18)
plt.ylabel('credit spread', fontsize=18)
F:\Installation\Anaconda3\lib\site-packages\pandas\plotting\_converter.py:129: FutureWarning: Using an implicitly registered datetime converter for a matplotlib plotting method. The converter was registered by pandas on import. Future versions of pandas will require you to explicitly register matplotlib converters.
To register the converters:
>>> from pandas.plotting import register_matplotlib_converters
>>> register_matplotlib_converters()
  warnings.warn(msg, FutureWarning)
Out[32]:
Text(0, 0.5, 'credit spread')
In [33]:
# take a look at the distribution of y_train
normFitPlot(y_train)
mu = 1.57 and sigma = 0.51
kurtosis = -0.68 and skew = -0.25
In [34]:
# use the model to generate 50000 samples, then compare the distribution with the credit spread
a, _ = remodel1.sample(50000)
In [35]:
normFitPlot(a)
mu = 1.49 and sigma = 0.47
kurtosis = -0.89 and skew = -0.12
The generated sample has a distribution similar to that of the credit spread, so we take the HMM fit to be reasonable.
Also notice that y_train is skewed, which motivates the transforms below.
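Beyond eyeballing the two distributions, a two-sample Kolmogorov-Smirnov test gives a quantitative comparison (a hedged sketch, not in the original run; ks_2samp is standard scipy):
from scipy.stats import ks_2samp
# compare the HMM-generated sample against the observed training spreads
stat, pval = ks_2samp(np.squeeze(a), y_train)
print('KS statistic = {:.3f}, p-value = {:.3f}'.format(stat, pval))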
In [36]:
# log-transform y_train
y_train_log = np.log(y_train + 1)
y_train_sqrt = np.sqrt(y_train + 1)
normFitPlot(y_train_log)
mu = 0.92 and sigma = 0.21
kurtosis = -0.81 and skew = -0.57
In [37]:
n_col = X_train.columns.difference(['market regime_cs'])
In [38]:
from sklearn.preprocessing import MinMaxScaler
x_scaler = MinMaxScaler()
X_train[n_col] = pd.DataFrame(x_scaler.fit_transform(X_train[n_col]), columns=n_col, index=X_train.index)
X_dev[n_col] = pd.DataFrame(x_scaler.transform(X_dev[n_col]), columns=n_col, index=X_dev.index)
X_test[n_col] = pd.DataFrame(x_scaler.transform(X_test[n_col]), columns=n_col, index=X_test.index)
F:\Installation\Anaconda3\lib\site-packages\pandas\core\frame.py:3391: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
(the same warning is emitted once per assignment above)
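The warning arises because X_train, X_dev, and X_test are slices of X; one common fix is to take explicit copies of the splits before assigning (a sketch, with behavior otherwise unchanged):
# explicit copies detach the splits from X and silence the chained-assignment warning
X_train = X_train.copy()
X_dev = X_dev.copy()
X_test = X_test.copy()
X_train[n_col] = x_scaler.fit_transform(X_train[n_col])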
In [39]:
# Use PCA on the VIX price features
vix_features = ['^VIX_Adj Close', '^VIX_Close', '^VIX_High', '^VIX_Low', '^VIX_MA5', '^VIX_Open',
                '^VIX_NormalizedPrice', '^VIX_MA10', '^VIX_MA20']
from sklearn.decomposition import PCA
pca = PCA(n_components=0.99, svd_solver='full')
vix_new_train = pca.fit_transform(X_train[vix_features])
vix_new_dev = pca.transform(X_dev[vix_features])
vix_new_test = pca.transform(X_test[vix_features])
vix_new_train = pd.DataFrame(vix_new_train, columns=['vix_new1', 'vix_new2'], index=X_train.index)
vix_new_dev = pd.DataFrame(vix_new_dev, columns=['vix_new1', 'vix_new2'], index=X_dev.index)
vix_new_test = pd.DataFrame(vix_new_test, columns=['vix_new1', 'vix_new2'], index=X_test.index)
X_train.drop(vix_features, axis=1, inplace=True)
X_dev.drop(vix_features, axis=1, inplace=True)
X_test.drop(vix_features, axis=1, inplace=True)
X_train = X_train.join(vix_new_train, how='left')
X_dev = X_dev.join(vix_new_dev, how='left')
X_test = X_test.join(vix_new_test, how='left')
F:\Installation\Anaconda3\lib\site-packages\pandas\core\frame.py:3940: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
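Since n_components=0.99 keeps however many components reach 99% of the variance, the hard-coded ['vix_new1', 'vix_new2'] column names implicitly assume exactly two components survive; a quick check (not in the original run) confirms this:
# confirm that two components explain at least 99% of the VIX-feature variance
print(pca.n_components_, pca.explained_variance_ratio_.cumsum())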
In [40]:
# find feature importance using Random Forest
import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
from sklearn.ensemble import RandomForestRegressor

def find_importance(df_X, df_y, importance):
    rf = RandomForestRegressor(n_estimators=1000, max_depth=5, min_samples_leaf=4, max_features=0.1)
    rf.fit(df_X, df_y)
    features = df_X.columns.values
    # plot feature importance
    x, y = (list(x) for x in zip(*sorted(zip(rf.feature_importances_, features), reverse=False)))
    trace2 = go.Bar(
        x=x,
        y=y,
        marker=dict(
            color=x,
            colorscale='Viridis',
            reversescale=True
        ),
        name='Random Forest Feature importance',
        orientation='h',
    )
    layout = dict(
        title='Barplot of Feature importances',
        width=900, height=1000,
        yaxis=dict(
            showgrid=False,
            showline=False,
            showticklabels=True,
            # domain=[0, 0.85],
        ),
        margin=dict(
            l=300,
        ),
    )
    fig1 = go.Figure(data=[trace2])
    fig1['layout'].update(layout)
    py.iplot(fig1, filename='plots')
    # drop every feature whose importance falls below the threshold
    for i in range(len(x)):
        if x[i] < importance:
            df_X = df_X.drop(y[i], axis=1)
    return df_X
In [41]:
X_trainnew=find_importance(X_train,y_train_log,0.005)
X_devnew = X_dev[X_trainnew.columns]
X_testnew=X_test[X_trainnew.columns]
[Figure: "Barplot of Feature importances" — horizontal Plotly bar chart of the Random Forest importance of each selected feature (DPRIME, bond broker, rf, TDRETADJ30, SPY_Volume, unemployment, DAAA, and others)]
In [42]:
corr = X_trainnew.corr()
f, ax = plt.subplots(figsize=(16, 9))
plt.title("Correlation between features", fontsize=30)
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values)
plt.savefig('corr.png')
Models
In [43]:
from sklearn.linear_model import Lasso,LinearRegression,Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
In [44]:
lr = LinearRegression()
ridge = Ridge(random_state=1, alpha=0.0005)
lasso = Lasso(random_state=1, alpha=0.0005)
rfr = RandomForestRegressor(n_estimators=700, max_features='auto', oob_score=True)
# note: min_samples_split, min_samples_leaf and max_features are sklearn GBM parameter names;
# XGBoost warns below that it may not use them
xgboost = XGBRegressor(learning_rate=0.01, n_estimators=1000, max_depth=3, min_samples_split=5,
                       min_samples_leaf=5, max_features=7, subsample=0.7, random_state=10)
In [45]:
labels = ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Random Forest Regression', 'XGBRegressor']
re_list = [lr, lasso, ridge, rfr, xgboost]
for re, name in zip(re_list, labels):
    re.fit(X_trainnew, y_train_log)
    y_pred = re.predict(X_testnew)
    plot_result(np.exp(y_pred) - 1, y_test, name)  # invert the log(1+y) transform before scoring
Linear Regression MSE:0.002075305310968708
Lasso Regression MSE:0.001044632151202771
Ridge Regression MSE:0.002074849782881088
Random Forest Regression MSE:0.0027098063981319553
[01:41:30] WARNING: C:\Users\Administrator\workspace\xgboost-win64_release_1.1.0\src\learner.cc:480:
Parameters: { max_features, min_samples_leaf, min_samples_split } might not be used.
  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core. Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.
XGBRegressor MSE:0.0009180653982498773