freqtrade_origin/freqtrade/freqai/base_models/BasePyTorchClassifier.py

215 lines
7.8 KiB
Python
Raw Normal View History

2023-03-05 14:59:24 +00:00
import logging
from time import time
from typing import Any, Dict, List, Tuple
2023-03-05 14:59:24 +00:00
import numpy as np
2023-03-08 14:03:36 +00:00
import numpy.typing as npt
2023-03-05 14:59:24 +00:00
import pandas as pd
import torch
from pandas import DataFrame
from torch.nn import functional as F
2023-03-08 14:11:51 +00:00
from freqtrade.exceptions import OperationalException
from freqtrade.freqai.base_models.BasePyTorchModel import BasePyTorchModel
2023-03-08 14:03:36 +00:00
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
2023-03-05 14:59:24 +00:00
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
2023-03-22 15:50:00 +00:00
class BasePyTorchClassifier(BasePyTorchModel):
    """
    A PyTorch implementation of a classifier.
    User must implement fit method

    Important!

    - User must declare the target class names in the strategy,
      under IStrategy.set_freqai_targets method.

    for example, in your strategy:
    ```
        def set_freqai_targets(self, dataframe: DataFrame, metadata: Dict, **kwargs):
            self.freqai.class_names = ["down", "up"]
            dataframe['&s-up_or_down'] = np.where(dataframe["close"].shift(-100) >
                                                  dataframe["close"], 'up', 'down')

            return dataframe
    ```
    """

    def __init__(self, **kwargs):
        """
        Forward all keyword arguments to the parent class and reset the label mappings.
        """
        super().__init__(**kwargs)
        # Both mappings stay None until init_class_names_to_index_mapping() is called
        # (from predict() or convert_label_column_to_int()).
        self.class_name_to_index = None  # class name (str) -> index (int)
        self.index_to_class_name = None  # index (int) -> class name (str)

    def predict(
        self, unfiltered_df: DataFrame, dk: FreqaiDataKitchen, **kwargs
    ) -> Tuple[DataFrame, npt.NDArray[np.int_]]:
        """
        Filter the prediction features data and predict with it.
        :param unfiltered_df: Full dataframe for the current backtest period.
        :param dk: The datakitchen object
        :return:
        :pred_df: dataframe containing the predictions
        :do_predict: np.array of 1s and 0s to indicate places where freqai needed to remove
        data (NaNs) or felt uncertain about data (PCA and DI index)
        :raises ValueError: if 'class_names' doesn't exist in model meta_data.
        """

        class_names = self.model.model_meta_data.get("class_names", None)
        if not class_names:
            raise ValueError(
                "Missing class names. self.model.model_meta_data['class_names'] is None."
            )

        # Build the name <-> index lookup lazily, the first time a prediction is requested.
        if not self.class_name_to_index:
            self.init_class_names_to_index_mapping(class_names)

        dk.find_features(unfiltered_df)
        filtered_df, _ = dk.filter_features(
            unfiltered_df, dk.training_features_list, training_filter=False
        )

        dk.data_dictionary["prediction_features"] = filtered_df

        dk.data_dictionary["prediction_features"], outliers, _ = dk.feature_pipeline.transform(
            dk.data_dictionary["prediction_features"], outlier_check=True
        )

        x = self.data_convertor.convert_x(
            dk.data_dictionary["prediction_features"], device=self.device
        )
        self.model.model.eval()
        # Inference only: disable autograd so no computation graph is built or retained.
        with torch.no_grad():
            logits = self.model.model(x)
            probs = F.softmax(logits, dim=-1)
            predicted_classes = torch.argmax(probs, dim=-1)
        predicted_classes_str = self.decode_class_names(predicted_classes)
        # used .tolist to convert probs into an iterable, in this way Tensors
        # are automatically moved to the CPU first if necessary.
        pred_df_prob = DataFrame(probs.tolist(), columns=class_names)
        pred_df = DataFrame(predicted_classes_str, columns=[dk.label_list[0]])
        # Final frame: predicted label column followed by one probability column per class.
        pred_df = pd.concat([pred_df, pred_df_prob], axis=1)

        if dk.feature_pipeline["di"]:
            dk.DI_values = dk.feature_pipeline["di"].di_values
        else:
            # No "di" step available in the pipeline: fall back to zeros, one per outlier entry.
            dk.DI_values = np.zeros(outliers.shape[0])
        dk.do_predict = outliers

        return (pred_df, dk.do_predict)

    def encode_class_names(
        self,
        data_dictionary: Dict[str, pd.DataFrame],
        dk: FreqaiDataKitchen,
        class_names: List[str],
    ) -> None:
        """
        encode class name, str -> int
        assuming first column of *_labels data frame to be the target column
        containing the class names

        The label frames in ``data_dictionary`` are modified in place, for every
        split listed in ``self.splits``.
        :param data_dictionary: dictionary holding a "<split>_labels" frame per split.
        :param dk: The datakitchen object (supplies the target column name).
        :param class_names: the class names that are allowed to appear as labels.
        :raises OperationalException: if a label is not part of class_names.
        """
        target_column_name = dk.label_list[0]
        for split in self.splits:
            label_df = data_dictionary[f"{split}_labels"]
            # Validate first so an unknown label gives a clear error, not a KeyError below.
            self.assert_valid_class_names(label_df[target_column_name], class_names)
            label_df[target_column_name] = [
                self.class_name_to_index[name] for name in label_df[target_column_name]
            ]

    @staticmethod
    def assert_valid_class_names(target_column: pd.Series, class_names: List[str]) -> None:
        """
        Verify that every value of the target column is one of the declared class names.
        :param target_column: the label column to check.
        :param class_names: the class names that are allowed to appear.
        :raises OperationalException: if the column contains a value missing from class_names.
        """
        non_defined_labels = set(target_column) - set(class_names)
        if non_defined_labels:
            # Build ONE message string. Passing the two parts as separate positional
            # arguments would make the exception render as a tuple.
            raise OperationalException(
                f"Found non defined labels: {non_defined_labels}, "
                f"expecting labels: {class_names}"
            )

    def decode_class_names(self, class_ints: torch.Tensor) -> List[str]:
        """
        decode class name, int -> str
        :param class_ints: tensor of class indices; each element is looked up individually.
        :return: the class name for every index, in the same order.
        """
        return [self.index_to_class_name[class_int.item()] for class_int in class_ints]

    def init_class_names_to_index_mapping(self, class_names: List[str]) -> None:
        """
        Build both lookup tables (name -> index and index -> name) from class_names.
        A class's index is its position in class_names.
        """
        self.class_name_to_index = {s: i for i, s in enumerate(class_names)}
        self.index_to_class_name = {i: s for i, s in enumerate(class_names)}
        logger.info(f"encoded class name to index: {self.class_name_to_index}")

    def convert_label_column_to_int(
        self,
        data_dictionary: Dict[str, pd.DataFrame],
        dk: FreqaiDataKitchen,
        class_names: List[str],
    ) -> None:
        """
        (Re)build the class name mappings, then encode the label columns of
        data_dictionary to integers in place.
        """
        self.init_class_names_to_index_mapping(class_names)
        self.encode_class_names(data_dictionary, dk, class_names)

    def get_class_names(self) -> List[str]:
        """
        Return the class names declared by the user.

        NOTE(review): ``self.class_names`` is not assigned anywhere in this class; the
        error message below indicates it is set through the strategy.
        :raises ValueError: if self.class_names is empty.
        """
        if not self.class_names:
            raise ValueError(
                "self.class_names is empty, "
                "set self.freqai.class_names = ['class a', 'class b', 'class c'] "
                "inside IStrategy.set_freqai_targets method."
            )

        return self.class_names

    def train(self, unfiltered_df: DataFrame, pair: str, dk: FreqaiDataKitchen, **kwargs) -> Any:
        """
        Filter the training data and train a model to it. Train makes heavy use of the datakitchen
        for storing, saving, loading, and analyzing the data.
        :param unfiltered_df: Full dataframe for the current training period
        :param pair: pair being trained, used in the log messages.
        :param dk: The datakitchen object
        :return:
        :model: Trained model which can be used to inference (self.predict)
        """

        logger.info(f"-------------------- Starting training {pair} --------------------")

        start_time = time()

        features_filtered, labels_filtered = dk.filter_features(
            unfiltered_df,
            dk.training_features_list,
            dk.label_list,
            training_filter=True,
        )

        # split data into train/test data.
        dd = dk.make_train_test_datasets(features_filtered, labels_filtered)
        # fit_labels() is skipped only when running live with
        # "fit_live_predictions_candles" configured.
        if not self.freqai_info.get("fit_live_predictions_candles", 0) or not self.live:
            dk.fit_labels()

        dk.feature_pipeline = self.define_data_pipeline(threads=dk.thread_count)

        # The pipeline is fitted on the training split only...
        (dd["train_features"], dd["train_labels"], dd["train_weights"]) = (
            dk.feature_pipeline.fit_transform(
                dd["train_features"], dd["train_labels"], dd["train_weights"]
            )
        )

        # ...and merely applied to the test split, when a test split is configured.
        if self.freqai_info.get("data_split_parameters", {}).get("test_size", 0.1) != 0:
            (dd["test_features"], dd["test_labels"], dd["test_weights"]) = (
                dk.feature_pipeline.transform(
                    dd["test_features"], dd["test_labels"], dd["test_weights"]
                )
            )

        logger.info(
            f"Training model on {len(dk.data_dictionary['train_features'].columns)} features"
        )
        logger.info(f"Training model on {len(dd['train_features'])} data points")

        model = self.fit(dd, dk)
        end_time = time()

        logger.info(
            f"-------------------- Done training {pair} "
            f"({end_time - start_time:.2f} secs) --------------------"
        )

        return model