# import the module for simulating data
from statsmodels.tsa.arima_process import ArmaProcess
# Plot 1: MA parameter = -0.9
plt.subplot(2,1,1)
ar1 = np.array([1])
ma1 = np.array([1, -0.9])
MA_object1 = ArmaProcess(ar1, ma1)
simulated_data_1 = MA_object1.generate_sample(nsample=1000)
plt.plot(simulated_data_1)
# Plot 2: MA parameter = +0.9
plt.subplot(2,1,2)
ar2 = np.array([1])
ma2 = np.array([1, 0.9])
MA_object2 = ArmaProcess(ar2, ma2)
simulated_data_2 = MA_object2.generate_sample(nsample=1000)
plt.plot(simulated_data_2)
plt.show()
# Plot 3: MA parameter = -0.3
plot_acf(simulated_data_3, lags=20)
plt.show()
# Import the ARMA module from statsmodels
from statsmodels.tsa.arima_model import ARMA
# Fit an MA(1) model to the first simulated data
mod = ARMA(simulated_data_1, order=(0,1))
res = mod.fit()
# Print out summary information on the fit
print(res.summary())
# Print out the estimate for the constant and for theta
print("When the true theta=-0.9, the estimate of theta (and the constant) are:")
print(res.params)
# Import the ARMA module from statsmodels
from statsmodels.tsa.arima_model import ARMA
# Forecast the first MA(1) model
mod = ARMA(simulated_data_1, order=(0,1))
res = mod.fit()
res.plot_predict(start=990, end=1010)
plt.show()
# import datetime module
import datetime
# Change the first date to zero
intraday.iloc[0,0] = 0
# Change the column headers to 'DATE' and 'CLOSE'
intraday.columns = ['DATE', 'CLOSE']
# Examine the data types for each column
print(intraday.dtypes)
# Convert DATE column to numeric
intraday['DATE'] = pd.to_numeric(intraday['DATE'])
# Make the `DATE` column the new index
intraday = intraday.set_index('DATE')
# Notice that some rows are missing
print("If there were no missing rows, there would be 391 rows of minute data")
print("The actual length of the DataFrame is:", len(intraday))
# Everything
set_everything = set(range(391))
# The intraday index as a set
set_intraday = set(intraday.index)
# Calculate the difference
set_missing = set_everything - set_intraday
# Print the difference
print("Missing rows: ", set_missing)
# Fill in the missing rows
intraday = intraday.reindex(range(391), method='ffill')
# From previous step
intraday = intraday.reindex(range(391), method='ffill')
# Change the index to the intraday times
intraday.index = pd.date_range(start='2017-09-01 9:30', end='2017-09-01-16:00', freq='1min')
# Plot the intraday time series
intraday.plot(grid=True)
plt.show()
# Import plot_acf and ARMA modules from statsmodels
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.arima_model import ARMA
# Compute returns from prices and drop the NaN
returns = intraday.pct_change()
returns = returns.dropna()
# Plot ACF of returns with lags up to 60 minutes
plot_acf(returns, lags = 60)
plt.show()
# Fit the data to an MA(1) model
mod = ARMA(returns, order=(0,1))
res = mod.fit()
print(res.params)
# Import the adfuller module from statsmodels
from statsmodels.tsa.stattools import adfuller
# Compute the ADF for HO and NG
result_HO = adfuller(HO['Close'])
print("The p-value for the ADF test on HO is ", result_HO[1])
result_NG = adfuller(NG['Close'])
print("The p-value for the ADF test on NG is ", result_NG[1])
# Compute the ADF of the spread
result_spread = adfuller(7.25 * HO['Close'] - NG['Close'])
print("The p-value for the ADF test on the spread is ", result_spread[1])
# Import the statsmodels module for regression and the adfuller function
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
# Regress BTC on ETH
ETH = sm.add_constant(ETH)
result = sm.OLS(BTC,ETH).fit()
# Compute ADF
b = result.params[1]
adf_stats = adfuller(BTC['Price'] - b*ETH['Price'])
print("The p-value for the ADF test is ", adf_stats[1])
# Import the adfuller function from the statsmodels module
from statsmodels.tsa.stattools import adfuller
# Convert the index to a datetime object
temp_NY.index = pd.to_datetime(temp_NY.index, format='%Y')
# Plot average temperatures
temp_NY.plot()
plt.show()
# Compute and print ADF p-value
result = adfuller(temp_NY['TAVG'])
print("The p-value for the ADF test is ", result[1])
# Import the modules for plotting the sample ACF and PACF
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# Take first difference of the temperature Series
chg_temp = temp_NY.diff()
chg_temp = chg_temp.dropna()
# Plot the ACF and PACF on the same page
fig, axes = plt.subplots(2,1)
# Plot the ACF
plot_acf(chg_temp, lags=20, ax=axes[0])
# Plot the PACF
plot_pacf(chg_temp, lags=20, ax=axes[1])
plt.show()
# Import the module for estimating an ARMA model
from statsmodels.tsa.arima_model import ARMA
# Fit the data to an AR(1) model and print AIC:
mod_ar1 = ARMA(chg_temp, order=(1, 0))
res_ar1 = mod_ar1.fit()
print("The AIC for an AR(1) is: ", res_ar1.aic)
# Fit the data to an AR(2) model and print AIC:
mod_ar2 = ARMA(chg_temp, order=(0, 1))
res_ar2 = mod_ar2.fit()
print("The AIC for an AR(2) is: ", res_ar2.aic)
# Fit the data to an ARMA(1,1) model and print AIC:
mod_arma11 = ARMA(chg_temp, order=(1, 1))
res_arma11 = mod_arma11.fit()
print("The AIC for an ARMA(1,1) is: ", res_arma11.aic)
# Import the ARIMA module from statsmodels
from statsmodels.tsa.arima_model import ARIMA
# Forecast temperatures using an ARIMA(1,1,1) model
mod = ARIMA(temp_NY, order=(1,1,1))
res = mod.fit()
# Plot the original series and the forecasted series
res.plot_predict(start='1872-01-01', end='2046-01-01')
plt.show()
# Split the data into a train and test set
candy_train = candy.loc[:'2006']
candy_test = candy.loc['2007':]
# Create an axis
fig, ax = plt.subplots()
# Plot the train and test sets on the axis ax
candy_train.plot(ax=ax)
candy_test.plot(ax=ax)
plt.show()
| AR(p) | MA(q) | ARMA(p,q) | |
|---|---|---|---|
| ACF | Tails off | Cuts off after lag q | Tails off |
| PACF | Cuts off after lag p |
Tails off | Tails off |
# Import
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# Create figure
fig, (ax1, ax2) = plt.subplots(2,1, figsize=(12,8))
# Plot the ACF of df
plot_acf(df, lags=10, zero=False, ax=ax1)
# Plot the PACF of df
plot_pacf(df, lags=10, zero=False, ax=ax2)
plt.show()
# Instantiate model
model = SARIMAX(earthquake, order=(1,0,0))
# Train model
results = model.fit()
# Create empty list to store search results
order_aic_bic=[]
# Loop over p values from 0-2
for p in range(3):
# Loop over q values from 0-2
for q in range(3):
# create and fit ARMA(p,q) model
model = SARIMAX(df, order=(p,0,q))
results = model.fit()
# Append order and results tuple
order_aic_bic.append((p,q,results.aic, results.bic))
# Construct DataFrame from order_aic_bic
order_df = pd.DataFrame(order_aic_bic,
columns=['p', 'q', 'AIC', 'BIC'])
# Print order_df in order of increasing AIC
print(order_df.sort_values('AIC'))
# Print order_df in order of increasing BIC
print(order_df.sort_values('BIC'))
# Loop over p values from 0-2
for p in range(3):
# Loop over q values from 0-2
for q in range(3):
try:
# create and fit ARMA(p,q) model
model = SARIMAX(earthquake, order=(p,0,q))
results = model.fit()
# Print order and results
print(p, q, results.aic, results.bic)
except:
print(p, q, None, None)
# Fit model
model = SARIMAX(earthquake, order=(1,0,1))
results = model.fit()
# Calculate the mean absolute error from residuals
mae = np.mean(np.abs(results.resid))
# Print mean absolute error
print(mae)
# Make plot of time series for comparison
earthquake.plot()
plt.show()
Diagnostic summary statistics:
| Test | Null hypothesis | P-value name |
|---|---|---|
| Ljung-Box | There are no correlations in the residual |
Prob(Q) |
| Jarque-Bera | The residuals are normally distributed | Prob(JB) |
# Create and fit model
model1 = SARIMAX(df, order=(3,0,1))
results1 = model1.fit()
# Print summary
print(results1.summary())
Plot Diagnostic:
# Create the 4 diagostics plots
results.plot_diagnostics()
plt.show()
| Test | Good fit |
|---|---|
| Standardized residual | There are no obvious patterns in the residuals |
| Histogram plus kde estimate | The KDE curve should be very similar to the normal distribution |
| Normal Q-Q | Most of the data points should lie on the straight line |
| Correlogram | 95% of correlations for lag greater than zero should not be significant |
# Plot time series
savings.plot()
plt.show()
# Run Dicky-Fuller test
result = adfuller(savings['savings'])
# Print test statistic
print(result[0])
# Print p-value
print(result[1])
# Create figure
fig, (ax1, ax2) = plt.subplots(2,1, figsize=(12,8))
# Plot the ACF of savings on ax1
plot_acf(savings, lags=10, zero=False, ax=ax1)
# Plot the PACF of savings on ax2
plot_pacf(savings, lags=10, zero=False, ax=ax2)
plt.show()
# Loop over p values from 0-3
for p in range(4):
# Loop over q values from 0-3
for q in range(4):
try:
# Create and fit ARMA(p,q) model
model = SARIMAX(savings, order=(p,0,q), trend='c')
results = model.fit()
# Print p, q, AIC, BIC
print(p,q, results.aic, results.bic)
except:
print(p, q, None, None)
def interpolate_and_plot(prices, interpolation):
# Create a boolean mask for missing values
missing_values = prices.isna()
# Interpolate the missing values
prices_interp = prices.interpolate(interpolation)
# Plot the results, highlighting the interpolated values in black
fig, ax = plt.subplots(figsize=(10, 5))
prices_interp.plot(color='k', alpha=.6, ax=ax, legend=False)
# Now plot the interpolated values on top in red
prices_interp[missing_values].plot(ax=ax, color='r', lw=3, legend=False)
plt.show()
interpolation_type = 'quadratic' or could be ‘zero’ or ‘linear’
interpolate_and_plot(prices, interpolation_type)
# Your custom function
def percent_change(series):
# Collect all *but* the last value of this window, then the final value
previous_values = series[:-1]
last_value = series[-1]
# Calculate the % difference between the last value and the mean of earlier values
percent_change = (last_value - np.mean(previous_values)) / np.mean(previous_values)
return percent_change
# Apply your custom function and plot
prices_perc = prices.rolling(20).apply(percent_change)
prices_perc.loc["2014":"2015"].plot()
plt.show()
def replace_outliers(series):
# Calculate the absolute difference of each timepoint from the series mean
absolute_differences_from_mean = np.abs(series - np.mean(series))
# Calculate a mask for the differences that are > 3 standard deviations from zero
this_mask = absolute_differences_from_mean > (np.std(series) * 3)
# Replace these values with the median accross the data
series[this_mask] = np.nanmedian(series)
return series
# Apply your preprocessing function to the timeseries and plot the results
prices_perc = prices_perc.apply(replace_outliers)
prices_perc.loc["2014":"2015"].plot()
plt.show()
# Define a rolling window with Pandas, excluding the right-most datapoint of the window
prices_perc_rolling = prices_perc.rolling(20, min_periods=5, closed='right')
# Define the features you'll calculate for each window
features_to_calculate = [np.min, np.max, np.mean, np.std]
# Calculate these features for your rolling window object
features = prices_perc_rolling.aggregate(features_to_calculate)
# Plot the results
ax = features.loc[:"2011-01"].plot()
prices_perc.loc[:"2011-01"].plot(ax=ax, color='k', alpha=.2, lw=3)
ax.legend(loc=(1.01, .6))
plt.show()
# Import partial from functools
from functools import partial
percentiles = [1, 10, 25, 50, 75, 90, 99]
# Use a list comprehension to create a partial function for each quantile
percentile_functions = [partial(np.percentile, q=percentile) for percentile in percentiles]
# Calculate each of these quantiles on the data using a rolling window
prices_perc_rolling = prices_perc.rolling(20, min_periods=5, closed='right')
features_percentiles = prices_perc_rolling.aggregate(percentile_functions)
# Plot a subset of the result
ax = features_percentiles.loc[:"2011-01"].plot(cmap=plt.cm.viridis)
ax.legend(percentiles, loc=(1.01, .5))
plt.show()
# Extract date features from the data, add them as columns
prices_perc['day_of_week'] = prices_perc.index.dayofweek
prices_perc['week_of_year'] = prices_perc.index.weekofyear
prices_perc['month_of_year'] = prices_perc.index.month
# Print prices_perc
print(prices_perc)
# These are the "time lags"
shifts = np.arange(1, 11).astype(int)
# Use a dictionary comprehension to create name: value pairs, one pair per shift
shifted_data = {"lag_{}_day".format(day_shift): prices_perc.shift(day_shift) for day_shift in shifts}
# Convert into a DataFrame for subsequent use
prices_perc_shifted = pd.DataFrame(shifted_data)
# Plot the first 100 samples of each
ax = prices_perc_shifted.iloc[:100].plot(cmap=plt.cm.viridis)
prices_perc.iloc[:100].plot(color='r', lw=2)
ax.legend(loc='best')
plt.show()
# Replace missing values with the median for each column
X = prices_perc_shifted.fillna(np.nanmedian(prices_perc_shifted))
y = prices_perc.fillna(np.nanmedian(prices_perc))
# Fit the model
model = Ridge()
model.fit(X, y)
def visualize_coefficients(coefs, names, ax):
# Make a bar plot for the coefficients, including their names on the x-axis
ax.bar(names, coefs)
ax.set(xlabel='Coefficient name', ylabel='Coefficient value')
# Set formatting so it looks nice
plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
return ax
# Visualize the output data up to "2011-01"
fig, axs = plt.subplots(2, 1, figsize=(10, 5))
y.loc[:'2011-01'].plot(ax=axs[0])
# Run the function to visualize model's coefficients
visualize_coefficients(model.coef_, prices_perc_shifted.columns, ax=axs[1])
plt.show()
# Visualize the output data up to "2011-01"
fig, axs = plt.subplots(2, 1, figsize=(10, 5))
y.loc[:'2011-01'].plot(ax=axs[0])
# Run the function to visualize model's coefficients
visualize_coefficients(model.coef_, prices_perc_shifted.columns, ax=axs[1])
plt.show()
# Create KFold cross-validation object
from sklearn.model_selection import KFold
cv = KFold(n_splits=10, shuffle=False, random_state=1)
# Import TimeSeriesSplit
from sklearn.model_selection import TimeSeriesSplit
# Create time-series cross-validation object
cv = TimeSeriesSplit(n_splits=10)
# Iterate through CV splits
fig, ax = plt.subplots()
for ii, (tr, tt) in enumerate(cv.split(X, y)):
# Plot the training data on each iteration, to see the behavior of the CV
ax.plot(tr, ii + y[tr])
ax.set(title='Training data on each CV iteration', ylabel='CV iteration')
plt.show()
from sklearn.utils import resample
def bootstrap_interval(data, percentiles=(2.5, 97.5), n_boots=100):
"""Bootstrap a confidence interval for the mean of columns of a 2-D dataset."""
# Create our empty array to fill the results
bootstrap_means = np.zeros([n_boots, data.shape[-1]])
for ii in range(n_boots):
# Generate random indices for our data *with* replacement, then take the sample mean
random_sample = resample(data)
bootstrap_means[ii] = random_sample.mean(axis=0)
# Compute the percentiles of choice for the bootstrapped means
percentiles = np.percentile(bootstrap_means, percentiles, axis=0)
return percentiles
# Iterate through CV splits
n_splits = 100
cv = TimeSeriesSplit(n_splits=n_splits)
# Create empty array to collect coefficients
coefficients = np.zeros([n_splits, X.shape[1]])
for ii, (tr, tt) in enumerate(cv.split(X, y)):
# Fit the model on training data and collect the coefficients
model.fit(X[tr], y[tr])
coefficients[ii] = model.coef_
# Calculate a confidence interval around each coefficient
bootstrapped_interval = bootstrap_interval(coefficients, percentiles=(2.5, 97.5))
# Plot it
fig, ax = plt.subplots()
ax.scatter(feature_names, bootstrapped_interval[0], marker='_', lw=3)
ax.scatter(feature_names, bootstrapåped_interval[1], marker='_', lw=3)
ax.set(title='95% confidence interval for model coefficients')
plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.show()
# Pre-initialize window sizes
window_sizes = [25, 50, 75, 100]
# Create an empty DataFrame to collect the stores
all_scores = pd.DataFrame(index=times_scores)
# Generate scores for each split to see how the model performs over time
for window in window_sizes:
# Create cross-validation object using a limited lookback window
cv = TimeSeriesSplit(n_splits=100, max_train_size=window)
# Calculate scores across all CV splits and collect them in a DataFrame
this_scores = cross_val_score(model, X, y, cv=cv, scoring=my_pearsonr)
all_scores['Length {}'.format(window)] = this_scores
# Visualize the scores
ax = all_scores.rolling(10).mean().plot(cmap=plt.cm.coolwarm)
ax.set(title='Scores for multiple windows', ylabel='Correlation (r)')
plt.show()
没有评论:
发表评论