2022-05-03 08:14:17 +00:00
|
|
|
import copy
|
2022-05-04 15:42:34 +00:00
|
|
|
import datetime
|
|
|
|
import json
|
2022-05-04 15:53:40 +00:00
|
|
|
import logging
|
2022-05-04 15:42:34 +00:00
|
|
|
import pickle as pk
|
2022-05-05 13:35:51 +00:00
|
|
|
import shutil
|
2022-05-04 15:42:34 +00:00
|
|
|
from pathlib import Path
|
|
|
|
from typing import Any, Dict, List, Tuple
|
|
|
|
|
2022-05-03 08:14:17 +00:00
|
|
|
import numpy as np
|
|
|
|
import pandas as pd
|
2022-05-04 15:42:34 +00:00
|
|
|
from joblib import dump, load
|
2022-05-03 08:14:17 +00:00
|
|
|
from pandas import DataFrame
|
|
|
|
from sklearn.metrics.pairwise import pairwise_distances
|
2022-05-04 15:42:34 +00:00
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
|
2022-05-03 08:14:17 +00:00
|
|
|
from freqtrade.configuration import TimeRange
|
|
|
|
|
2022-05-04 15:42:34 +00:00
|
|
|
|
2022-05-03 08:14:17 +00:00
|
|
|
# Number of seconds in one day (24 * 60 * 60); multiplied with day counts below to get
# offsets that are added to / subtracted from the timerange start and stop timestamps.
SECONDS_IN_DAY = 86400

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
2022-05-04 15:42:34 +00:00
|
|
|
|
2022-05-06 10:54:49 +00:00
|
|
|
class FreqaiDataKitchen:
    """
    Class designed to handle all the data for the IFreqaiModel class model.
    Functionalities include holding, saving, loading, and analyzing the data:
    splitting a timerange into training/backtesting windows, filtering NaNs,
    train/test splitting, standardization, PCA, outlier removal and collecting
    the predictions of consecutive backtest periods.
    author: Robert Caulk, rob.caulk@gmail.com
    """
|
|
|
|
|
2022-05-04 15:42:34 +00:00
|
|
|
def __init__(self, config: Dict[str, Any], dataframe: DataFrame):
    """
    Create the empty data containers and derive the training/backtesting windows.
    :params:
    :config: configuration dictionary; "timerange" and the "freqai" section with
    "train_period" and "backtest_period" are read here.
    :dataframe: dataframe stored unchanged as self.full_dataframe.
    """
    self.config = config
    self.freqai_config = config["freqai"]
    self.full_dataframe = dataframe

    # metadata (self.data) and the train/test split (self.data_dictionary)
    self.data: Dict[Any, Any] = {}
    self.data_dictionary: Dict[Any, Any] = {}

    # location of the model files, both start out empty
    self.model_path = Path()
    self.model_filename = ""

    self.predictions = np.array([])
    self.do_predict = np.array([])
    self.target_mean = np.array([])
    self.target_std = np.array([])

    # arrays that append_predictions()/fill_predictions() grow over the backtest periods
    self.full_predictions = np.array([])
    self.full_do_predict = np.array([])
    self.full_target_mean = np.array([])
    self.full_target_std = np.array([])

    train_period = self.freqai_config["train_period"]

    # the configured timerange, with its start moved back by one training period
    self.full_timerange = self.create_fulltimerange(self.config["timerange"], train_period)

    self.training_timeranges, self.backtesting_timeranges = self.split_timerange(
        self.full_timerange,
        train_period,
        self.freqai_config["backtest_period"],
    )
|
|
|
|
|
2022-05-03 08:14:17 +00:00
|
|
|
def save_data(self, model: Any) -> None:
    """
    Saves all data associated with a model for a single sub-train time range
    :params:
    :model: User trained model which can be reused for inferencing to generate
    predictions

    Files written into self.model_path (all prefixed with self.model_filename):
    "_model.joblib", "_metadata.json" and "_trained_df.pkl".
    """
    save_path = Path(self.model_path)
    # exist_ok=True already tolerates an existing folder, no separate is_dir() check needed
    save_path.mkdir(parents=True, exist_ok=True)

    # Save the trained model
    dump(model, save_path / str(self.model_filename + "_model.joblib"))

    # The path is stored as a string: json.dump() passes objects it cannot serialize to
    # np_encoder, which only converts numpy scalars, so a Path object would be written
    # as null and could not be restored by load_data().
    self.data["model_path"] = str(self.model_path)
    self.data["model_filename"] = self.model_filename
    self.data["training_features_list"] = list(self.data_dictionary["train_features"].columns)

    # store the metadata
    with open(save_path / str(self.model_filename + "_metadata.json"), "w") as fp:
        json.dump(self.data, fp, default=self.np_encoder)

    # save the train data to file so we can check preds for area of applicability later
    self.data_dictionary["train_features"].to_pickle(
        save_path / str(self.model_filename + "_trained_df.pkl")
    )

    return
|
|
|
|
|
|
|
|
def load_data(self) -> Any:
    """
    loads all data required to make a prediction on a sub-train time range
    :returns:
    :model: User trained model which can be inferenced for new predictions

    Besides returning the model this restores self.data (metadata),
    self.training_features_list, self.data_dictionary["train_features"] and, when
    principal component analysis is enabled in the configuration, self.pca.
    """
    model = load(self.model_path / str(self.model_filename + "_model.joblib"))

    with open(self.model_path / str(self.model_filename + "_metadata.json"), "r") as fp:
        self.data = json.load(fp)
    self.training_features_list = self.data["training_features_list"]

    self.data_dictionary["train_features"] = pd.read_pickle(
        self.model_path / str(self.model_filename + "_trained_df.pkl")
    )

    # JSON can only hold the path as a string (or null, if it was not serializable when
    # the metadata was written). Convert it back to a Path so the "/" joins below work,
    # and keep the current path - the one the metadata was just read from - when no
    # usable value is stored.
    stored_path = self.data.get("model_path")
    if stored_path is not None:
        self.model_path = Path(stored_path)
    self.model_filename = self.data["model_filename"]

    if self.config["freqai"]["feature_parameters"]["principal_component_analysis"]:
        pca_file = self.model_path / str(self.model_filename + "_pca_object.pkl")
        # NOTE(review): pickle.load() can execute arbitrary code, only load files that
        # were written by this class.
        with open(pca_file, "rb") as fp:
            self.pca = pk.load(fp)

    return model
|
|
|
|
|
2022-05-04 15:42:34 +00:00
|
|
|
def make_train_test_datasets(
    self, filtered_dataframe: DataFrame, labels: DataFrame
) -> Dict[Any, Any]:
    """
    Split the cleaned features, labels and per-row weights into training and test
    sets, using the "data_split_parameters" of the freqai configuration as keyword
    arguments for train_test_split.
    :params:
    :filtered_dataframe: cleaned dataframe ready to be split.
    :labels: cleaned labels ready to be split.
    :returns:
    :data_dictionary: the dictionary created by self.build_data_dictionary()
    """
    freqai_cfg = self.config["freqai"]
    n_rows = len(filtered_dataframe)

    # recency weighting is only applied for a positive weight factor
    weights = (
        self.set_weights_higher_recent(n_rows)
        if freqai_cfg["feature_parameters"]["weight_factor"] > 0
        else np.ones(n_rows)
    )

    features = filtered_dataframe[: filtered_dataframe.shape[0]]
    split = train_test_split(features, labels, weights, **freqai_cfg["data_split_parameters"])

    # train_test_split returns a train/test pair for every array that was passed in
    train_features, test_features, train_labels, test_labels, train_weights, test_weights = split

    return self.build_data_dictionary(
        train_features, test_features, train_labels, test_labels, train_weights, test_weights
    )
|
2022-05-03 08:14:17 +00:00
|
|
|
|
2022-05-04 15:42:34 +00:00
|
|
|
def filter_features(
    self,
    unfiltered_dataframe: DataFrame,
    training_feature_list: List,
    labels: DataFrame = pd.DataFrame(),
    training_filter: bool = True,
) -> Tuple[DataFrame, DataFrame]:
    """
    Filter the unfiltered dataframe to extract the user requested features and properly
    remove all NaNs. Any row with a NaN is removed from training dataset or replaced with
    0s in the prediction dataset. However, prediction dataset do_predict will reflect any
    row that had a NaN and will shield user from that prediction.
    :params:
    :unfiltered_dataframe: the full dataframe for the present training period
    :training_feature_list: list, the training feature list constructed by
    self.build_feature_list() according to user specified parameters in the configuration file.
    :labels: the labels for the dataset (Series, or DataFrame with one row per sample)
    :training_filter: boolean which lets the function know if it is training data or
    prediction data to be filtered.
    :returns:
    :filtered_dataframe: dataframe cleaned of NaNs and only containing the user
    requested feature set.
    :labels: labels cleaned of NaNs.

    Side effects: stores the NaN row indicator in self.data
    ("filter_drop_index_training" or "filter_drop_index_prediction") and, for prediction
    data, sets self.do_predict (1 = row is usable, 0 = row contained a NaN).
    """
    filtered_dataframe = unfiltered_dataframe.filter(training_feature_list, axis=1)

    # rows that have at least one NaN. The axis has to be given by keyword,
    # pandas >= 2.0 does not accept it positionally anymore.
    nan_rows = filtered_dataframe.isnull().any(axis=1)

    if training_filter:
        # we don't care about total row number (total no. datapoints) in training, we only
        # care about removing any row with NaNs
        nan_labels = pd.isnull(labels)
        if isinstance(nan_labels, pd.DataFrame):
            # reduce to one flag per row so the mask lines up with nan_rows
            nan_labels = nan_labels.any(axis=1)

        keep = ~nan_rows & ~nan_labels
        filtered_dataframe = filtered_dataframe[keep]  # dropping values
        labels = labels[keep]  # assuming the labels depend entirely on the dataframe here.
        logger.info(
            "dropped %s training points due to NaNs, ensure all historical data downloaded",
            len(unfiltered_dataframe) - len(filtered_dataframe),
        )
        # stored as 1 (row had a NaN feature) / 0, as before
        self.data["filter_drop_index_training"] = nan_rows.astype(int)

    else:
        # we are backtesting so we need to preserve row number to send back to strategy,
        # so now we use do_predict to avoid any prediction based on a NaN
        self.data["filter_drop_index_prediction"] = nan_rows
        # replacing all NaNs with zeros to avoid issues in 'prediction', but any prediction
        # that was based on a single NaN is ultimately protected from buys with do_predict
        filtered_dataframe = filtered_dataframe.fillna(0)
        self.do_predict = np.array((~nan_rows).astype(int))
        logger.info(
            "dropped %s of %s prediction data points due to NaNs.",
            len(self.do_predict) - self.do_predict.sum(),
            len(filtered_dataframe),
        )

    return filtered_dataframe, labels
|
|
|
|
|
2022-05-04 15:42:34 +00:00
|
|
|
def build_data_dictionary(
    self,
    train_df: DataFrame,
    test_df: DataFrame,
    train_labels: DataFrame,
    test_labels: DataFrame,
    train_weights: Any,
    test_weights: Any,
) -> Dict:
    """
    Bundle the six pieces of a train/test split into one dictionary.
    The dictionary replaces self.data_dictionary and is also returned.
    :params:
    :train_df: / :test_df: features, stored as "train_features" / "test_features"
    :train_labels: / :test_labels: labels, stored under the same names
    :train_weights: / :test_weights: weights, stored under the same names
    :returns:
    :data_dictionary: the newly created dictionary
    """
    bundle = dict(
        train_features=train_df,
        test_features=test_df,
        train_labels=train_labels,
        test_labels=test_labels,
        train_weights=train_weights,
        test_weights=test_weights,
    )
    self.data_dictionary = bundle
    return self.data_dictionary
|
|
|
|
|
2022-05-04 15:42:34 +00:00
|
|
|
def standardize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
    """
    Shift and scale features and labels with the mean / standard deviation of the
    training part, and remember those statistics in self.data
    ("<column>_mean", "<column>_std", "labels_mean", "labels_std").
    :params:
    :data_dictionary: dictionary containing the cleaned and split training/test data/labels
    :returns:
    :data_dictionary: the same dictionary, its entries replaced by standardized values.
    """
    # statistics come from the training data only and are applied to train and test
    feature_mean = data_dictionary["train_features"].mean()
    feature_std = data_dictionary["train_features"].std()
    label_std = data_dictionary["train_labels"].std()
    label_mean = data_dictionary["train_labels"].mean()

    for features_key, labels_key in (
        ("train_features", "train_labels"),
        ("test_features", "test_labels"),
    ):
        centered_features = data_dictionary[features_key] - feature_mean
        data_dictionary[features_key] = centered_features / feature_std
        centered_labels = data_dictionary[labels_key] - label_mean
        data_dictionary[labels_key] = centered_labels / label_std

    # keep the per-column statistics so other data can be standardized the same way
    for column, std in feature_std.items():
        self.data[column + "_std"] = std
        self.data[column + "_mean"] = feature_mean[column]

    self.data["labels_std"] = label_std
    self.data["labels_mean"] = label_mean

    return data_dictionary
|
|
|
|
|
|
|
|
def standardize_data_from_metadata(self, df: DataFrame) -> DataFrame:
    """
    Apply the "<column>_mean" / "<column>_std" values held in self.data to every
    column of df. The dataframe is modified in place and returned.
    :params:
    :df: Dataframe to be standardized
    :returns:
    :df: the same dataframe object with standardized columns
    """
    stats = self.data
    for column in df.columns:
        centered = df[column] - stats[column + "_mean"]
        df[column] = centered / stats[column + "_std"]

    return df
|
|
|
|
|
2022-05-04 15:42:34 +00:00
|
|
|
def split_timerange(
    self, tr: str, train_split: int = 28, bt_split: int = 7
) -> Tuple[list, list]:
    """
    Function which takes a single time range (tr) and splits it
    into sub timeranges to train and backtest on based on user input
    :params:
    :tr: str, full timerange to train on
    :train_split: the period length for the each training (days). Specified in user
    configuration file
    :bt_split: the backtesting length (days). Specified in user configuration file
    :returns:
    :tr_training_list: list of "YYYYMMDD-YYYYMMDD" strings, one per training window
    :tr_backtesting_list: list of "YYYYMMDD-YYYYMMDD" strings, the backtest window
    that directly follows the training window with the same list index
    :raises:
    ValueError if bt_split is not positive (the windows would never advance).
    """
    if bt_split <= 0:
        raise ValueError("bt_split must be a positive number of days")

    train_period = train_split * SECONDS_IN_DAY
    bt_period = bt_split * SECONDS_IN_DAY

    full_timerange = TimeRange.parse_timerange(tr)
    config_timerange = TimeRange.parse_timerange(self.config["timerange"])
    timerange_train = copy.deepcopy(full_timerange)
    timerange_backtest = copy.deepcopy(full_timerange)

    def as_timerange_string(startts: int, stopts: int) -> str:
        # timestamps are interpreted as UTC, same as in slice_dataframe()
        utc = datetime.timezone.utc
        start = datetime.datetime.fromtimestamp(startts, tz=utc)
        stop = datetime.datetime.fromtimestamp(stopts, tz=utc)
        return start.strftime("%Y%m%d") + "-" + stop.strftime("%Y%m%d")

    tr_training_list = []
    tr_backtesting_list = []
    first = True
    while True:
        # every window after the first one is moved forward by one backtest period
        if not first:
            timerange_train.startts = timerange_train.startts + bt_period
        first = False
        timerange_train.stopts = timerange_train.startts + train_period
        tr_training_list.append(
            as_timerange_string(timerange_train.startts, timerange_train.stopts)
        )

        # associated backtest period, cut off at the end of the configured --timerange
        timerange_backtest.startts = timerange_train.stopts
        timerange_backtest.stopts = min(
            timerange_backtest.startts + bt_period, config_timerange.stopts
        )
        tr_backtesting_list.append(
            as_timerange_string(timerange_backtest.startts, timerange_backtest.stopts)
        )

        # ensure we are predicting on exactly same amount of data as requested by user defined
        # --timerange
        if timerange_backtest.stopts == config_timerange.stopts:
            break

    logger.debug(
        "training timeranges %s, backtesting timeranges %s",
        tr_training_list,
        tr_backtesting_list,
    )
    return tr_training_list, tr_backtesting_list
|
|
|
|
|
|
|
|
def slice_dataframe(self, tr: str, df: DataFrame) -> DataFrame:
    """
    Cut the rows whose "date" lies inside a timerange (both ends included) out of df.
    :params:
    :tr: timerange string that we wish to extract from df
    :df: Dataframe containing all candles to run the entire backtest. Here
    it is sliced down to just the present training period.
    :returns:
    :df: the rows of df with start <= date <= stop
    """
    window = TimeRange.parse_timerange(tr)
    utc = datetime.timezone.utc
    window_start = datetime.datetime.fromtimestamp(window.startts, tz=utc)
    window_stop = datetime.datetime.fromtimestamp(window.stopts, tz=utc)

    inside_window = (df["date"] >= window_start) & (df["date"] <= window_stop)
    return df.loc[inside_window, :]
|
|
|
|
|
|
|
|
def principal_component_analysis(self) -> None:
    """
    Performs Principal Component Analysis on the data for dimensionality reduction
    and outlier detection (see self.remove_outliers())
    No parameters or returns, it acts on the data_dictionary held by the DataHandler.

    The number of components is the smallest one whose cumulative explained variance
    ratio reaches 0.999. "train_features" and "test_features" are replaced by their
    projections (columns "PC0", "PC1", ...), the fitted PCA object is kept in self.pca
    and pickled to "<model_filename>_pca_object.pkl" inside self.model_path.
    """

    from sklearn.decomposition import PCA  # avoid importing if we dont need it

    train_features = self.data_dictionary["train_features"]
    test_features = self.data_dictionary["test_features"]

    n_components = train_features.shape[1]
    pca = PCA(n_components=n_components).fit(train_features)

    # searchsorted gives the index of the first component at which the cumulative ratio
    # is >= 0.999, so the number of components to keep is that index + 1. Taking the
    # bare index would keep one component too few (possibly none at all).
    cumulative_variance = pca.explained_variance_ratio_.cumsum()
    n_keep_components = min(
        int(np.searchsorted(cumulative_variance, 0.999)) + 1, len(cumulative_variance)
    )

    pca2 = PCA(n_components=n_keep_components).fit(train_features)
    logger.info("reduced feature dimension by %s", n_components - n_keep_components)
    logger.info("explained variance %f", np.sum(pca2.explained_variance_ratio_))

    component_names = ["PC" + str(i) for i in range(0, n_keep_components)]
    self.data_dictionary["train_features"] = pd.DataFrame(
        data=pca2.transform(train_features),
        columns=component_names,
        index=train_features.index,
    )
    self.data_dictionary["test_features"] = pd.DataFrame(
        data=pca2.transform(test_features),
        columns=component_names,
        index=test_features.index,
    )

    self.data["n_kept_components"] = n_keep_components
    self.pca = pca2

    self.model_path.mkdir(parents=True, exist_ok=True)
    pca_file = self.model_path / str(self.model_filename + "_pca_object.pkl")
    # context manager so the file handle is closed (and flushed) right away
    with open(pca_file, "wb") as fp:
        pk.dump(pca2, fp)

    return None
|
|
|
|
|
|
|
|
def compute_distances(self) -> float:
    """
    Compute the pairwise distances between all rows of the training features and
    average them: first per row, then over all rows.
    :returns:
    :avg_mean_dist: mean of the per-row mean distances
    """
    logger.info("computing average mean distance for all training points")

    train_features = self.data_dictionary["train_features"]
    distance_matrix = pairwise_distances(train_features, n_jobs=-1)
    mean_distance_per_point = distance_matrix.mean(axis=1)
    avg_mean_dist = mean_distance_per_point.mean()

    logger.info("avg_mean_dist %s", avg_mean_dist)

    return avg_mean_dist
|
|
|
|
|
2022-05-04 15:42:34 +00:00
|
|
|
def remove_outliers(self, predict: bool) -> None:
    """
    Remove data that looks like an outlier based on the distribution of each
    variable. A row is an outlier when any of its values is not strictly inside
    the 0.001 / 0.999 quantiles of the training features.
    :params:
    :predict: boolean which tells the function if this is prediction data or
    training data coming in.

    With predict=True no row is removed: NaNs in "prediction_features" are filled with
    0 and self.do_predict is lowered by one for every outlier row. With predict=False
    the outlier rows are dropped from the train and test features, labels and weights.
    """

    lower_quantile = self.data_dictionary["train_features"].quantile(0.001)
    upper_quantile = self.data_dictionary["train_features"].quantile(0.999)

    def outlier_rows(features: DataFrame) -> Any:
        # values outside the quantiles become NaN, a row with any NaN is an outlier.
        # The axis has to be given by keyword, pandas >= 2.0 rejects a positional axis.
        inside = features[(features < upper_quantile) & (features > lower_quantile)]
        return pd.isnull(inside).any(axis=1)

    if predict:
        is_outlier = outlier_rows(self.data_dictionary["prediction_features"])
        self.data_dictionary["prediction_features"].fillna(0, inplace=True)
        do_predict = np.array((~is_outlier).astype(int))

        logger.info(
            "remove_outliers() tossed %s predictions",
            len(do_predict) - do_predict.sum(),
        )
        # adding the 0/1 flags and subtracting 1 leaves a row unchanged when it passed
        # this check and lowers it by one when it did not
        self.do_predict += do_predict
        self.do_predict -= 1

    else:
        # same procedure for the train and the test data, both measured against the
        # quantiles of the (unfiltered) training features computed above
        for features_key, labels_key, weights_key in (
            ("train_features", "train_labels", "train_weights"),
            ("test_features", "test_labels", "test_weights"),
        ):
            keep = ~outlier_rows(self.data_dictionary[features_key])
            for key in (features_key, labels_key, weights_key):
                self.data_dictionary[key] = self.data_dictionary[key][keep]

    return
|
|
|
|
|
2022-05-04 15:42:34 +00:00
|
|
|
def build_feature_list(self, config: dict) -> list:
    """
    Assemble the feature (column) names used to filter the full dataframe.
    For every timeframe, base feature and shift 0..shift the name
    "<feature>[_shift-<n>]_<timeframe>" is added, followed by the same name prefixed
    with "<base>-" for every pair "<base>/<quote>" of the correlated pairlist.
    :params:
    :config: Canonical freqtrade config file containing all
    user defined input in config['freqai] dictionary.
    :returns:
    :features: list of feature names
    """
    freqai_cfg = config["freqai"]
    features = []
    for tf in freqai_cfg["timeframes"]:
        for ft in freqai_cfg["base_features"]:
            for n in range(freqai_cfg["feature_parameters"]["shift"] + 1):
                # unshifted features (n == 0) carry no shift marker
                shift = "_shift-" + str(n) if n > 0 else ""
                name = ft + shift + "_" + tf
                features.append(name)
                features.extend(
                    p.split("/")[0] + "-" + name for p in freqai_cfg["corr_pairlist"]
                )

    logger.info("number of features %s", len(features))
    return features
|
|
|
|
|
|
|
|
def check_if_pred_in_training_spaces(self) -> None:
    """
    Measure how far every prediction point is from its nearest training point,
    relative to self.data["avg_mean_dist"] (the Dissimilarity Index, DI). Points
    whose DI is not below the configured "DI_threshold" get their entry in
    self.do_predict lowered by one, all other entries stay as they are.
    """
    train_features = self.data_dictionary["train_features"]
    prediction_features = self.data_dictionary["prediction_features"]

    # rows: training points, columns: prediction points
    distance = pairwise_distances(train_features, prediction_features, n_jobs=-1)

    dissimilarity_index = distance.min(axis=0) / self.data["avg_mean_dist"]
    threshold = self.config["freqai"]["feature_parameters"]["DI_threshold"]
    do_predict = np.where(dissimilarity_index < threshold, 1, 0)

    n_tossed = len(do_predict) - do_predict.sum()
    logger.info(
        "Distance checker tossed %s predictions for being too far from training data",
        n_tossed,
    )

    # +flag -1: unchanged for accepted points, one lower for rejected points
    self.do_predict += do_predict
    self.do_predict -= 1
|
2022-05-04 15:42:34 +00:00
|
|
|
|
2022-05-03 08:14:17 +00:00
|
|
|
def set_weights_higher_recent(self, num_weights: int) -> np.ndarray:
    """
    Set weights so that recent data is more heavily weighted during
    training than older data.
    :params:
    :num_weights: number of weights to create (one per data point, oldest first)
    :returns:
    :weights: array of length num_weights. The data point that lies i positions before
    the end gets exp(-i / (weight_factor * num_weights)), counting the last one as i=1.
    """
    weight_factor = self.config["freqai"]["feature_parameters"]["weight_factor"]

    # distance of every point from the end: num_weights for the oldest, 1 for the newest.
    # Covering the full range also weights the oldest point, a loop over
    # range(1, num_weights) never reaches index 0 and leaves its weight at zero.
    steps_from_end = np.arange(num_weights, 0, -1)
    weights = np.exp(-steps_from_end / (weight_factor * num_weights))
    return weights
|
|
|
|
|
|
|
|
def append_predictions(self, predictions, do_predict, len_dataframe):
    """
    Add the results of the current backtest period to the end of the full_* arrays.
    :params:
    :predictions: values appended to self.full_predictions
    :do_predict: values appended to self.full_do_predict
    :len_dataframe: how many times self.data["s_mean"] / self.data["s_std"] are
    repeated before being appended to self.full_target_mean / self.full_target_std
    """
    ones = np.ones(len_dataframe)
    additions = {
        "full_predictions": predictions,
        "full_do_predict": do_predict,
        "full_target_mean": ones * self.data["s_mean"],
        "full_target_std": ones * self.data["s_std"],
    }

    for attribute, new_values in additions.items():
        setattr(self, attribute, np.append(getattr(self, attribute), new_values))

    return
|
|
|
|
|
|
|
|
def fill_predictions(self, len_dataframe):
    """
    Put zeros in front of the full_* arrays until they are len_dataframe long, so that
    they match the size of the dataframe that goes back to the strategy. The padded
    rows lie before the backtesting range and are not included in the backtest.
    :params:
    :len_dataframe: target length; the padding is computed from self.full_predictions
    """
    # startup_candle_count
    filler = np.zeros(len_dataframe - len(self.full_predictions))

    for attribute in (
        "full_predictions",
        "full_do_predict",
        "full_target_mean",
        "full_target_std",
    ):
        setattr(self, attribute, np.append(filler, getattr(self, attribute)))

    return
|
2022-05-04 15:42:34 +00:00
|
|
|
|
2022-05-05 13:35:51 +00:00
|
|
|
def create_fulltimerange(self, backtest_tr: str, backtest_period: int) -> str:
    """
    Move the start of a timerange back in time and prepare the folder for its models.
    :params:
    :backtest_tr: timerange string whose start is moved back
    :backtest_period: number of days the start is moved back by (__init__ passes the
    training period here)
    :returns:
    :full_timerange: the extended timerange as "YYYYMMDD-YYYYMMDD"

    Side effects: sets self.full_path to
    <user_data_dir>/models/<full_timerange><identifier>. When that folder does not
    exist yet it is created and the first configuration file is copied into it.
    """
    backtest_timerange = TimeRange.parse_timerange(backtest_tr)
    backtest_timerange.startts = backtest_timerange.startts - backtest_period * SECONDS_IN_DAY

    # timestamps are interpreted as UTC, same as in slice_dataframe()
    utc = datetime.timezone.utc
    start = datetime.datetime.fromtimestamp(backtest_timerange.startts, tz=utc)
    stop = datetime.datetime.fromtimestamp(backtest_timerange.stopts, tz=utc)
    full_timerange = start.strftime("%Y%m%d") + "-" + stop.strftime("%Y%m%d")

    self.full_path = (
        Path(self.config["user_data_dir"])
        / "models"
        / str(full_timerange + self.freqai_config["identifier"])
    )

    if not self.full_path.is_dir():
        self.full_path.mkdir(parents=True, exist_ok=True)
        # Source is the configuration file at its given path, destination is the model
        # folder plus the bare file name. Using the bare name as source and the whole
        # path as destination only works for a file in the current working directory.
        config_file = Path(self.config["config_files"][0])
        shutil.copy(config_file, self.full_path / config_file.name)

    return full_timerange
|
|
|
|
|
2022-05-03 08:14:17 +00:00
|
|
|
def np_encoder(self, object):
    """
    Fallback passed to json.dump() as "default" in save_data().
    :params:
    :object: the value that has to be converted
    :returns:
    the plain Python value of a numpy scalar (np.generic), None for anything else
    """
    return object.item() if isinstance(object, np.generic) else None
|