Spaces:

tmotagam
/

VARIMA-demo

Sleeping

File size: 24,126 Bytes

31158e5

{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "5cWJcqtEjnbd"
      },
      "source": [
        "# ETH with Vector Autoregressive (VAR) model\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "sOQKeFpujnbf"
      },
      "source": [
        "## Importing/Downloading all the libraries required\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "xjrODJyZjnbg"
      },
      "outputs": [],
      "source": [
        "import re\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer\n",
        "from statsmodels.tsa.api import VAR\n",
        "from statsmodels.tsa.stattools import adfuller\n",
        "from statsmodels.tsa.stattools import grangercausalitytests\n",
        "from statsmodels.tsa.vector_ar.vecm import coint_johansen\n",
        "from statsmodels.stats.stattools import durbin_watson"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "KjO6Yh6Mjnbh"
      },
      "source": [
        "## Data Preprocessing\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "WGkqWrFNjnbh"
      },
      "source": [
        "### Importing and summarizing the datasets\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "2LBKAyZ1jnbh"
      },
      "outputs": [],
      "source": [
        "# Load the tweets, drop the 'id' column and index the frame by 'date'.\n",
        "sentimentdf = (\n",
        "    pd.read_parquet(\"hf://datasets/tmotagam/Cryptocurrencies-sentiment-from-X/ETH-sentiment-dataset.parquet\")\n",
        "    .drop(columns='id')\n",
        "    .set_index('date')\n",
        ")\n",
        "# Load the prices: column 0 becomes the index and 'timestamp' is parsed as dates.\n",
        "ethdf = pd.read_excel('ETH-USD.xlsx', parse_dates=['timestamp'], index_col=0)\n",
        "\n",
        "# Print every summary/preview under its own banner and close with a final rule.\n",
        "_rule = '===================================================================================='\n",
        "_sections = (\n",
        "    ('ETH Sentiment Summary:', sentimentdf.describe()),\n",
        "    ('ETH Sentiment Data:', sentimentdf.tail()),\n",
        "    ('ETH Price Summary:', ethdf.describe()),\n",
        "    ('ETH Price Data:', ethdf.tail()),\n",
        ")\n",
        "for _title, _view in _sections:\n",
        "    print(_rule)\n",
        "    print(_title)\n",
        "    print(_view)\n",
        "print(_rule)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "iPGJYM6sxcRZ"
      },
      "source": [
        "### Removing duplicate rows and unwanted data points and columns from the datasets"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "e2BROPqpxjIq"
      },
      "outputs": [],
      "source": [
        "# Remove repeated tweets in a single pass. drop_duplicates() ignores the index,\n",
        "# so the date index is temporarily exposed as a column: rows only count as\n",
        "# duplicates when both the date and every column value match.\n",
        "sentimentdf = (\n",
        "    sentimentdf\n",
        "    .assign(tmpdate=sentimentdf.index)\n",
        "    .drop_duplicates()\n",
        "    .drop(columns='tmpdate')\n",
        ")\n",
        "# Drop the unused price columns; errors='ignore' keeps the cell safe to re-run\n",
        "# after the columns are already gone.\n",
        "ethdf = ethdf.drop(columns=['low', 'open', 'volume', 'close', 'high'], errors='ignore')\n",
        "# Keep only the rows from 2021-12-29 onwards.\n",
        "ethdf = ethdf.loc['2021-12-29':]\n",
        "print('====================================================================================')\n",
        "print('ETH Sentiment Summary:')\n",
        "print(sentimentdf.describe())\n",
        "print('====================================================================================')\n",
        "print('ETH Sentiment Data:')\n",
        "print(sentimentdf.head())\n",
        "print('====================================================================================')\n",
        "print('ETH Price Summary:')\n",
        "print(ethdf.describe())\n",
        "print('====================================================================================')\n",
        "print('ETH Price Data:')\n",
        "print(ethdf.head())\n",
        "print('====================================================================================')"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "N4YKJdyONEp3"
      },
      "source": [
        "### Getting sentiment scores and their averages using VADER"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "YCJQoAh5NYQ3"
      },
      "outputs": [],
      "source": [
        "analyzer = SentimentIntensityAnalyzer()\n",
        "# Text removed before scoring: @mentions, punctuation, '#' and URLs.\n",
        "# The digit range must use an ASCII hyphen (0-9); with an en dash (U+2013) the\n",
        "# class would only match the literal characters '0', the dash and '9', leaving\n",
        "# mentions that contain other digits partially in the text.\n",
        "noise_pattern = re.compile(r\"(@[A-Za-z0-9_]+)|[^\\w\\s]|#|http\\S+\")\n",
        "# Score every tweet exactly once and reuse the result for all three columns.\n",
        "polarity = [\n",
        "    analyzer.polarity_scores(noise_pattern.sub(\"\", text.replace(\"\\n\",\" \")))\n",
        "    for text in sentimentdf['content']\n",
        "]\n",
        "for label in ('neg', 'pos', 'neu'):\n",
        "    sentimentdf[label] = [scores[label] for scores in polarity]\n",
        "sentimentdf.drop(['content'], axis=1, inplace=True)\n",
        "# Average the remaining columns per calendar day of the index.\n",
        "df_grouped = sentimentdf.groupby(sentimentdf.index.date)\n",
        "df_averages = df_grouped.mean()\n",
        "print('====================================================================================')\n",
        "print('ETH Sentiment Summary:')\n",
        "print(df_averages.describe())\n",
        "print('====================================================================================')\n",
        "print('ETH Sentiment Data:')\n",
        "print(df_averages.head())\n",
        "print('====================================================================================')"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "8dcH6IsPa1Sc"
      },
      "source": [
        "### Combining the two datasets"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "PiX5pqoSa5Pe"
      },
      "outputs": [],
      "source": [
        "# Attach the three daily sentiment averages to the price frame as new columns.\n",
        "df = ethdf.assign(**{score: df_averages[score] for score in ('neg', 'pos', 'neu')})\n",
        "\n",
        "# Print the summary and a preview, each under a banner, then a closing rule.\n",
        "_rule = '===================================================================================='\n",
        "for _title, _view in (('Summary:', df.describe()), ('Data:', df.head())):\n",
        "    print(_rule)\n",
        "    print(_title)\n",
        "    print(_view)\n",
        "print(_rule)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "UNvMzkrFjnbi"
      },
      "source": [
        "### Plotting the dataset\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "BLQK4ZLkjnbi"
      },
      "outputs": [],
      "source": [
        "# One panel per column of df, so the figure follows the frame instead of a\n",
        "# fixed panel count. squeeze=False always yields a 2-D axes array, which keeps\n",
        "# .flatten() valid even when df has a single column.\n",
        "fig, axes = plt.subplots(nrows=len(df.columns), ncols=1, dpi=120, figsize=(10,6), squeeze=False)\n",
        "for column, ax in zip(df.columns, axes.flatten()):\n",
        "    ax.plot(df[column], color='red', linewidth=1)\n",
        "    ax.set_title(column)\n",
        "    # Hide tick marks and the top spine to reduce clutter between stacked panels.\n",
        "    ax.xaxis.set_ticks_position('none')\n",
        "    ax.yaxis.set_ticks_position('none')\n",
        "    ax.spines[\"top\"].set_alpha(0)\n",
        "    ax.tick_params(labelsize=6)\n",
        "\n",
        "plt.tight_layout()\n",
        "plt.show()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "tzrqJ76vjnbi"
      },
      "source": [
        "### Granger Causality Test\n",
        "\n",
        "The Granger causality test is run on all possible pairs of the time series.\n",
        "The rows are the response variables and the columns are the predictors. The values in the table\n",
        "are the p-values. A p-value less than the significance level (0.05) implies that\n",
        "the null hypothesis, that the coefficients of the corresponding past values are\n",
        "zero (that is, X does not Granger-cause Y), can be rejected.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "9pEuAjX_jnbi"
      },
      "outputs": [],
      "source": [
        "maxlag=12\n",
        "test = 'ssr_chi2test'\n",
        "def grangers_causation_matrix(data, variables, test='ssr_chi2test', verbose=False, lags=None):\n",
        "    \"\"\"Return a matrix of the smallest Granger-causality p-value for each variable pair.\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    data : DataFrame holding one column per entry of `variables`.\n",
        "    variables : column labels to test against each other.\n",
        "    test : key of the test whose p-value is read from each lag's result.\n",
        "    verbose : when True, print the per-lag p-values of every pair.\n",
        "    lags : highest lag to test; defaults to the module-level `maxlag`.\n",
        "\n",
        "    Returns\n",
        "    -------\n",
        "    DataFrame whose rows are labelled '<var>_y' and columns '<var>_x'; each cell\n",
        "    is the minimum p-value (rounded to 4 decimals) over lags 1..lags for the\n",
        "    pair data[[row, column]].\n",
        "    \"\"\"\n",
        "    n_lags = maxlag if lags is None else lags\n",
        "    # Named `matrix` so the notebook-level `df` is not shadowed inside the function.\n",
        "    matrix = pd.DataFrame(np.zeros((len(variables), len(variables))), columns=variables, index=variables)\n",
        "    for c in matrix.columns:\n",
        "        for r in matrix.index:\n",
        "            test_result = grangercausalitytests(data[[r, c]], maxlag=n_lags)\n",
        "            # test_result is keyed by lag (1-based); [0][test][1] is that test's p-value.\n",
        "            p_values = [round(test_result[lag][0][test][1],4) for lag in range(1, n_lags + 1)]\n",
        "            if verbose:\n",
        "                print(f'Y = {r}, X = {c}, P Values = {p_values}')\n",
        "            matrix.loc[r, c] = np.min(p_values)\n",
        "    matrix.columns = [var + '_x' for var in variables]\n",
        "    matrix.index = [var + '_y' for var in variables]\n",
        "    return matrix\n",
        "\n",
        "grangers_causation_matrix(df, variables = df.columns)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "L-bLcFUmjnbj"
      },
      "source": [
        "### Johansen's Cointegration Test\n",
        "\n",
        "The Johansen test, named after Søren Johansen, is a procedure for testing cointegration of several, say k, I(1) time series.\n",
        "This test permits more than one cointegrating relationship so is more generally applicable than the Engle–Granger test which is based on the Dickey–Fuller (or the augmented) test for unit roots in the residuals from a single (estimated) cointegrating relationship.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "sfheMT7Rjnbr"
      },
      "outputs": [],
      "source": [
        "def cointegration_test(df, alpha=0.05):\n",
        "    \"\"\"Run Johansen's cointegration test on `df` and print a trace-statistic report.\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    df : DataFrame with one column per series.\n",
        "    alpha : significance level; must be 0.10, 0.05 or 0.01, matching the three\n",
        "        critical-value columns. Any other value raises KeyError.\n",
        "\n",
        "    Prints one line per column comparing the trace statistic with the critical\n",
        "    value at the chosen level. Nothing is returned.\n",
        "    \"\"\"\n",
        "    out = coint_johansen(df,-1,5)\n",
        "    # Look the column up by integer percentage: str(1 - 0.1) is '0.9', not '0.90',\n",
        "    # so a string key built from the float fails for alpha=0.1.\n",
        "    confidence = int(round((1 - alpha) * 100))\n",
        "    d = {90: 0, 95: 1, 99: 2}\n",
        "    traces = out.lr1\n",
        "    cvts = out.cvt[:, d[confidence]]\n",
        "    def adjust(val, length= 6): return str(val).ljust(length)\n",
        "\n",
        "    # Summary\n",
        "    print(f'Name   ::  Test Stat > C({confidence}%)    =>   Signif  \\n', '--'*20)\n",
        "    for col, trace, cvt in zip(df.columns, traces, cvts):\n",
        "        print(adjust(col), ':: ', adjust(round(trace,2), 9), \">\", adjust(cvt, 8), ' =>  ' , trace > cvt)\n",
        "\n",
        "cointegration_test(df)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "HnUIdTkNjnbr"
      },
      "source": [
        "### Train and Test Split\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "7i6WyC9ejnbs"
      },
      "outputs": [],
      "source": [
        "nobs = 10 # number of observations to be forecasted\n",
        "# Hold out the final `nobs` rows for testing; everything before them is training data.\n",
        "df_train = df.iloc[:-nobs]\n",
        "df_test = df.iloc[-nobs:]\n",
        "\n",
        "for _part in (df_train, df_test):\n",
        "    print(_part.shape)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "h41rpvQajnbs"
      },
      "source": [
        "### ADFuller to test for Stationarity of given series\n",
        "\n",
        "An augmented Dickey–Fuller test (ADF) tests the null hypothesis that a unit root is present in a time series sample.\n",
        "The alternative hypothesis is different depending on which version of the test is used, but is usually stationarity or trend-stationarity.\n",
        "It is an augmented version of the Dickey–Fuller test for a larger and more complicated set of time series models.\n",
        "\n",
        "The augmented Dickey–Fuller (ADF) statistic, used in the test, is a negative number.\n",
        "The more negative it is, the stronger the rejection of the hypothesis that there is a unit root at some level of confidence.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "wRCwciNGjnbs"
      },
      "outputs": [],
      "source": [
        "def adfuller_test(series,name, signif=0.05, verbose=False):\n",
        "    \"\"\"Run an augmented Dickey-Fuller test on `series` and print a report.\n",
        "\n",
        "    `name` labels the report and `signif` is the p-value threshold at or below\n",
        "    which the unit-root null hypothesis is rejected. `verbose` is accepted but\n",
        "    not used. Nothing is returned.\n",
        "    \"\"\"\n",
        "    statistic, raw_p, used_lags, _n_obs, critical_values = adfuller(series, autolag='AIC')[:5]\n",
        "    p_value = round(raw_p, 4)\n",
        "\n",
        "    print(f'    Augmented Dickey-Fuller Test on \"{name}\"', \"\\n   \", '-'*47)\n",
        "    print(' Null Hypothesis: Data has unit root. Non-Stationary.')\n",
        "    print(f' Significance Level    = {signif}')\n",
        "    print(f' Test Statistic        = {round(statistic, 4)}')\n",
        "    print(f' No. Lags Chosen       = {round(used_lags, 4)}')\n",
        "\n",
        "    for level, threshold in critical_values.items():\n",
        "        # Pad the level label to six characters so the '=' signs line up.\n",
        "        print(f' Critical value {str(level).ljust(6)} = {round(threshold, 3)}')\n",
        "\n",
        "    reject = p_value <= signif\n",
        "    outcome = \"Rejecting Null Hypothesis.\" if reject else \"Weak evidence to reject the Null Hypothesis.\"\n",
        "    conclusion = \"Stationary\" if reject else \"Non-Stationary\"\n",
        "    print(f\" => P-Value = {p_value}. {outcome}\")\n",
        "    print(f\" => Series is {conclusion}.\")\n",
        "\n",
        "# Test every training column, leaving blank lines between reports.\n",
        "for name in df_train.columns:\n",
        "    adfuller_test(df_train[name], name=name)\n",
        "    print('\\n')"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "I_fi-I9Pjnbs"
      },
      "source": [
        "### Since the series is non-stationary, we will perform differencing and run the ADF test again\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "2-xfJUfmjnbs"
      },
      "outputs": [],
      "source": [
        "# First-difference every training column; dropna() then removes rows containing\n",
        "# NaN, such as the first row, which has no predecessor to subtract.\n",
        "df_differenced = df_train.diff().dropna()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "274M3lJAjnbt"
      },
      "outputs": [],
      "source": [
        "# Repeat the ADF report for each differenced column.\n",
        "for name in df_differenced.columns:\n",
        "    adfuller_test(df_differenced[name], name=name)\n",
        "    print('\\n')"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "lUtlnD9jjnbt"
      },
      "source": [
        "### Selecting Lag Order (p) for VAR model\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "J1nq05G0jnbt"
      },
      "outputs": [],
      "source": [
        "model = VAR(df_differenced)\n",
        "# Fit lag orders 1 through 9 and print the information criteria of each fit.\n",
        "for i in range(1, 10):\n",
        "    result = model.fit(i)\n",
        "    print('Lag Order =', i)\n",
        "    for label, criterion in (('AIC : ', result.aic), ('BIC : ', result.bic), ('FPE : ', result.fpe)):\n",
        "        print(label, criterion)\n",
        "    print('HQIC: ', result.hqic, '\\n')\n",
        "\n",
        "# Summary table of the order selection over lags up to 12.\n",
        "x = model.select_order(maxlags=12)\n",
        "x.summary()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "SolEp3_sjnbt"
      },
      "source": [
        "## Model Training\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "5nlbY2WAjnbt"
      },
      "outputs": [],
      "source": [
        "# Fit the VAR with five lags and display the estimation summary.\n",
        "model_fitted = model.fit(maxlags=5)\n",
        "model_fitted.summary()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ecIylS0ijnbu"
      },
      "source": [
        "## Durbin Watson Test\n",
        "\n",
        "The Durbin–Watson statistic is a test statistic used to detect the presence of autocorrelation at lag 1 in the residuals (prediction errors) from a regression analysis.\n",
        "It is named after James Durbin and Geoffrey Watson.\n",
        "The small sample distribution of this ratio was derived by John von Neumann (von Neumann, 1941).\n",
        "Durbin and Watson (1950, 1951) applied this statistic to the residuals from least squares regressions, and developed bounds tests for the null hypothesis that the errors are serially uncorrelated against the alternative that they follow a first order autoregressive process.\n",
        "Note that the distribution of this test statistic does not depend on the estimated regression coefficients and the variance of the errors.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "OjrrFXu_jnbu"
      },
      "outputs": [],
      "source": [
        "# Durbin-Watson statistic of the fitted model's residuals, reported per column.\n",
        "out = durbin_watson(model_fitted.resid)\n",
        "\n",
        "for col, val in zip(df.columns, out):\n",
        "    statistic = round(val, 2)\n",
        "    print(col, ':', statistic)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "1yoCIdbBjnbu"
      },
      "source": [
        "### Forecasting\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "LPQNznqtjnbu"
      },
      "outputs": [],
      "source": [
        "# Lag order used by the fitted model\n",
        "lag_order = model_fitted.k_ar\n",
        "print(lag_order)\n",
        "\n",
        "# The last `lag_order` differenced rows seed the forecast\n",
        "forecast_input = df_differenced.iloc[-lag_order:].values\n",
        "print(forecast_input)\n",
        "\n",
        "# Forecast `nobs` steps and label them with the last `nobs` dates of df;\n",
        "# the '_1d' suffix marks the values as first differences.\n",
        "fc = model_fitted.forecast(y=forecast_input, steps=nobs)\n",
        "forecast_index = df.index[-nobs:]\n",
        "forecast_columns = df.columns + '_1d'\n",
        "df_forecast = pd.DataFrame(fc, index=forecast_index, columns=forecast_columns)\n",
        "df_forecast"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Em5XqOHajnbu"
      },
      "source": [
        "## Inversion of differencing\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "0F1Kit38jnbu"
      },
      "outputs": [],
      "source": [
        "def invert_transformation(df_train, df_forecast, second_diff=False):\n",
        "    df_fc = df_forecast.copy()\n",
        "    columns = df_train.columns\n",
        "    for col in columns:\n",
        "        # Roll back 2nd Diff\n",
        "        if second_diff:\n",
        "            df_fc[str(col)+'_1d'] = (df_train[col].iloc[-1]-df_train[col].iloc[-2]) + df_fc[str(col)+'_2d'].cumsum()\n",
        "        # Roll back 1st Diff\n",
        "        df_fc[str(col)+'_forecast'] = df_train[col].iloc[-1] + df_fc[str(col)+'_1d'].cumsum()\n",
        "    return df_fc\n",
        "\n",
        "# Rebuild level forecasts from the first differences and show only those columns.\n",
        "df_results = invert_transformation(df_train, df_forecast, second_diff=False)\n",
        "df_results[['adjclose_forecast', 'neg_forecast', 'pos_forecast', 'neu_forecast']]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "dlYPfmnPjnbu"
      },
      "source": [
        "## Plot Forecast\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "UemTi70xjnbv"
      },
      "outputs": [],
      "source": [
        "# One panel per column: forecast curve drawn first, held-out actuals on top.\n",
        "fig, axes = plt.subplots(nrows=len(df.columns), ncols=1, dpi=150, figsize=(10,10))\n",
        "for col, ax in zip(df.columns, axes.flatten()):\n",
        "    predicted = df_results[col+'_forecast']\n",
        "    observed = df_test[col].iloc[-nobs:]\n",
        "    forecast_ax = predicted.plot(legend=True, ax=ax)\n",
        "    forecast_ax.autoscale(axis='x',tight=True)\n",
        "    observed.plot(legend=True, ax=ax)\n",
        "    ax.set_title(col + \": Forecast vs Actuals\")\n",
        "    for axis in (ax.xaxis, ax.yaxis):\n",
        "        axis.set_ticks_position('none')\n",
        "    ax.spines[\"top\"].set_alpha(0)\n",
        "    ax.tick_params(labelsize=6)\n",
        "\n",
        "plt.tight_layout()\n",
        "plt.show()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "xTUG-kBsjnbv"
      },
      "source": [
        "## Error of Forecast\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "VrOa5_hPjnbv"
      },
      "outputs": [],
      "source": [
        "def forecast_accuracy(forecast, actual):\n",
        "    mape = np.mean(np.abs(forecast - actual)/np.abs(actual))  # MAPE\n",
        "    me = np.mean(forecast - actual)             # ME\n",
        "    mae = np.mean(np.abs(forecast - actual))    # MAE\n",
        "    mpe = np.mean((forecast - actual)/actual)   # MPE\n",
        "    rmse = np.mean((forecast - actual)**2)**.5  # RMSE\n",
        "    corr = np.corrcoef(forecast, actual)[0,1]   # corr\n",
        "    mins = np.amin(np.hstack([forecast[:,None],\n",
        "                              actual[:,None]]), axis=1)\n",
        "    maxs = np.amax(np.hstack([forecast[:,None],\n",
        "                              actual[:,None]]), axis=1)\n",
        "    minmax = 1 - np.mean(mins/maxs)             # minmax\n",
        "    return({'mape':mape, 'me':me, 'mae': mae,\n",
        "            'mpe': mpe, 'rmse':rmse, 'corr':corr, 'minmax':minmax})\n",
        "\n",
        "# Report the metrics for each series; every header after the first is preceded\n",
        "# by a blank line.\n",
        "for position, series_name in enumerate(('adjclose', 'pos', 'neg', 'neu')):\n",
        "    lead = '' if position == 0 else '\\n'\n",
        "    print(lead + 'Forecast Accuracy of: ' + series_name)\n",
        "    accuracy_prod = forecast_accuracy(df_results[series_name + '_forecast'].values, df_test[series_name].values)\n",
        "    for k, v in accuracy_prod.items():\n",
        "        print(k, ': ', round(v,4))"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.11.8"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}