freqtrade_origin/freqtrade/freqai/base_models/BasePyTorchClassifier.py

import logging
from time import time
from typing import Any, Dict, List, Tuple

import numpy as np
import numpy.typing as npt
import pandas as pd
import torch
from pandas import DataFrame
from torch.nn import functional as F

from freqtrade.exceptions import OperationalException
from freqtrade.freqai.base_models.BasePyTorchModel import BasePyTorchModel
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen


logger = logging.getLogger(__name__)


class BasePyTorchClassifier(BasePyTorchModel):
    """
    A PyTorch implementation of a classifier.
    User must implement fit method

    Important!

    - User must declare the target class names in the strategy,
    under IStrategy.set_freqai_targets method.

    for example, in your strategy:
    ```
        def set_freqai_targets(self, dataframe: DataFrame, metadata: Dict, **kwargs):
            self.freqai.class_names = ["down", "up"]
            dataframe['&s-up_or_down'] = np.where(dataframe["close"].shift(-100) >
                                                  dataframe["close"], 'up', 'down')

            return dataframe
    ```
    """

    def __init__(self, **kwargs):
        """
        Forward all keyword arguments to the parent constructor and start with
        empty class-name lookup tables.
        """
        super().__init__(**kwargs)
        # Both lookups are populated together by init_class_names_to_index_mapping:
        # class name (str) -> index (int), and index (int) -> class name (str).
        self.class_name_to_index = None
        self.index_to_class_name = None

    def predict(
        self, unfiltered_df: DataFrame, dk: FreqaiDataKitchen, **kwargs
    ) -> Tuple[DataFrame, npt.NDArray[np.int_]]:
        """
        Filter the prediction features data and predict with it.
        :param unfiltered_df: Full dataframe for the current backtest period.
        :param dk: The datakitchen object
        :return:
        :pred_df: dataframe containing the predictions. The first column (named after
        dk.label_list[0]) holds the predicted class name, followed by one probability
        column per class name.
        :do_predict: np.array of 1s and 0s to indicate places where freqai needed to remove
        data (NaNs) or felt uncertain about data (PCA and DI index)
        :raises ValueError: if 'class_names' doesn't exist in model meta_data.
        """

        class_names = self.model.model_meta_data.get("class_names", None)
        if not class_names:
            raise ValueError(
                "Missing class names. "
                "self.model.model_meta_data['class_names'] is None."
            )

        # Build the lookups lazily: they are still unset when predict is called
        # before convert_label_column_to_int has run on this instance.
        if not self.class_name_to_index:
            self.init_class_names_to_index_mapping(class_names)

        dk.find_features(unfiltered_df)
        filtered_df, _ = dk.filter_features(
            unfiltered_df, dk.training_features_list, training_filter=False
        )

        dk.data_dictionary["prediction_features"] = filtered_df

        dk.data_dictionary["prediction_features"], outliers, _ = dk.feature_pipeline.transform(
            dk.data_dictionary["prediction_features"], outlier_check=True)

        x = self.data_convertor.convert_x(
            dk.data_dictionary["prediction_features"],
            device=self.device
        )
        self.model.model.eval()
        # Inference only: no gradients are needed, so skip building the autograd graph.
        with torch.no_grad():
            logits = self.model.model(x)
            probs = F.softmax(logits, dim=-1)
            predicted_classes = torch.argmax(probs, dim=-1)
        predicted_classes_str = self.decode_class_names(predicted_classes)
        # used .tolist to convert probs into an iterable, in this way Tensors
        # are automatically moved to the CPU first if necessary.
        pred_df_prob = DataFrame(probs.tolist(), columns=class_names)
        pred_df = DataFrame(predicted_classes_str, columns=[dk.label_list[0]])
        pred_df = pd.concat([pred_df, pred_df_prob], axis=1)

        if dk.feature_pipeline["di"]:
            dk.DI_values = dk.feature_pipeline["di"].di_values
        else:
            # No "di" step available: report zeros, one per row of the outlier result.
            dk.DI_values = np.zeros(len(outliers.index))
        dk.do_predict = outliers.to_numpy()

        return (pred_df, dk.do_predict)

    def encode_class_names(
            self,
            data_dictionary: Dict[str, pd.DataFrame],
            dk: FreqaiDataKitchen,
            class_names: List[str],
    ):
        """
        encode class name, str -> int
        assuming first column of *_labels data frame to be the target column
        containing the class names

        The "<split>_labels" frames inside data_dictionary are modified in place,
        for every split listed in self.splits (defined outside this class).
        :param data_dictionary: dictionary holding a "<split>_labels" dataframe per split
        :param dk: The datakitchen object; dk.label_list[0] names the target column
        :param class_names: the allowed class names
        :raises OperationalException: if a label is not one of class_names
        """

        target_column_name = dk.label_list[0]
        for split in self.splits:
            label_df = data_dictionary[f"{split}_labels"]
            self.assert_valid_class_names(label_df[target_column_name], class_names)
            label_df[target_column_name] = [
                self.class_name_to_index[name] for name in label_df[target_column_name]
            ]

    @staticmethod
    def assert_valid_class_names(
            target_column: pd.Series,
            class_names: List[str]
    ):
        """
        Ensure every value of the target column is one of the declared class names.
        :param target_column: label column to validate
        :param class_names: the allowed class names
        :raises OperationalException: if the column contains a value not in class_names
        """
        non_defined_labels = set(target_column) - set(class_names)
        if non_defined_labels:
            # The two literals are concatenated into ONE message. A comma between
            # them would pass two arguments and the exception would render as a tuple.
            raise OperationalException(
                f"Found non defined labels: {non_defined_labels}, "
                f"expecting labels: {class_names}"
            )

    def decode_class_names(self, class_ints: torch.Tensor) -> List[str]:
        """
        decode class name, int -> str
        :param class_ints: tensor of class indices
        :return: the class name for each index, in the same order
        """

        # A single flatten().tolist() converts every element to a Python int at once
        # instead of calling .item() per element.
        return [self.index_to_class_name[idx] for idx in class_ints.flatten().tolist()]

    def init_class_names_to_index_mapping(self, class_names):
        """
        Build both lookups (name -> index and index -> name) from the position of
        each entry in class_names.
        :param class_names: ordered class names; the position becomes the index
        """
        self.class_name_to_index = {s: i for i, s in enumerate(class_names)}
        self.index_to_class_name = {i: s for i, s in enumerate(class_names)}
        logger.info(f"encoded class name to index: {self.class_name_to_index}")

    def convert_label_column_to_int(
            self,
            data_dictionary: Dict[str, pd.DataFrame],
            dk: FreqaiDataKitchen,
            class_names: List[str]
    ):
        """
        (Re)build the class-name lookups from class_names, then replace the string
        labels in data_dictionary with their integer indices (in place).
        :param data_dictionary: dictionary holding a "<split>_labels" dataframe per split
        :param dk: The datakitchen object
        :param class_names: ordered class names; the position becomes the index
        :raises OperationalException: if a label is not one of class_names
        """
        self.init_class_names_to_index_mapping(class_names)
        self.encode_class_names(data_dictionary, dk, class_names)

    def get_class_names(self) -> List[str]:
        """
        Return self.class_names (an attribute set outside this class).
        :raises ValueError: if self.class_names is empty or None
        """
        if not self.class_names:
            raise ValueError(
                "self.class_names is empty, "
                "set self.freqai.class_names = ['class a', 'class b', 'class c'] "
                "inside IStrategy.set_freqai_targets method."
            )

        return self.class_names

    def train(
        self, unfiltered_df: DataFrame, pair: str, dk: FreqaiDataKitchen, **kwargs
    ) -> Any:
        """
        Filter the training data and train a model to it. Train makes heavy use of the datakitchen
        for storing, saving, loading, and analyzing the data.
        :param unfiltered_df: Full dataframe for the current training period
        :param pair: pair name, used here only in the log messages
        :param dk: The datakitchen object
        :return:
        :model: Trained model which can be used to inference (self.predict)
        """

        logger.info(f"-------------------- Starting training {pair} --------------------")

        start_time = time()

        features_filtered, labels_filtered = dk.filter_features(
            unfiltered_df,
            dk.training_features_list,
            dk.label_list,
            training_filter=True,
        )

        # split data into train/test data.
        dd = dk.make_train_test_datasets(features_filtered, labels_filtered)
        # fit_labels is skipped only when "fit_live_predictions_candles" is set AND
        # the model is running live.
        if not self.freqai_info.get("fit_live_predictions_candles", 0) or not self.live:
            dk.fit_labels()

        dk.feature_pipeline = self.define_data_pipeline(threads=dk.thread_count)

        # The pipeline is fitted on the train split only ...
        (dd["train_features"],
         dd["train_labels"],
         dd["train_weights"]) = dk.feature_pipeline.fit_transform(dd["train_features"],
                                                                  dd["train_labels"],
                                                                  dd["train_weights"])

        # ... and merely applied to the test split, which is skipped when test_size is 0.
        if self.freqai_info.get('data_split_parameters', {}).get('test_size', 0.1) != 0:
            (dd["test_features"],
             dd["test_labels"],
             dd["test_weights"]) = dk.feature_pipeline.transform(dd["test_features"],
                                                                 dd["test_labels"],
                                                                 dd["test_weights"])

        logger.info(
            f"Training model on {len(dk.data_dictionary['train_features'].columns)} features"
        )
        logger.info(f"Training model on {len(dd['train_features'])} data points")

        # fit is provided by the user's subclass (see class docstring).
        model = self.fit(dd, dk)
        end_time = time()

        logger.info(f"-------------------- Done training {pair} "
                    f"({end_time - start_time:.2f} secs) --------------------")

        return model
initial commit 2023-03-05 14:59:24 +00:00			`import logging`
bring classifier/rl up to new paradigm. ensure tests pass. remove old code. add documentation, add new example transform 2023-05-29 11:33:29 +00:00			`from time import time`
			`from typing import Any, Dict, List, Tuple`
initial commit 2023-03-05 14:59:24 +00:00
			`import numpy as np`
sort imports 2023-03-08 14:03:36 +00:00			`import numpy.typing as npt`
initial commit 2023-03-05 14:59:24 +00:00			`import pandas as pd`
			`import torch`
			`from pandas import DataFrame`
			`from torch.nn import functional as F`

add missing import 2023-03-08 14:11:51 +00:00			`from freqtrade.exceptions import OperationalException`
use data loader, add evaluation on epoch 2023-03-06 14:16:45 +00:00			`from freqtrade.freqai.base_models.BasePyTorchModel import BasePyTorchModel`
sort imports 2023-03-08 14:03:36 +00:00			`from freqtrade.freqai.data_kitchen import FreqaiDataKitchen`
use data loader, add evaluation on epoch 2023-03-06 14:16:45 +00:00
initial commit 2023-03-05 14:59:24 +00:00
			`logger = logging.getLogger(__name__)`


rename Torch to PyTorch 2023-03-22 15:50:00 +00:00			`class BasePyTorchClassifier(BasePyTorchModel):`
add documentation 2023-03-09 09:14:54 +00:00			`"""`
create children class to PyTorchClassifier to implement the fit method where we initialize the trainer and model objects 2023-03-19 12:38:49 +00:00			`A PyTorch implementation of a classifier.`
			`User must implement fit method`
improve pytorch classifier documentation 2023-03-20 16:39:50 +00:00
			`Important!`
add class_name attribute to freqai interface 2023-03-20 18:38:43 +00:00
			`- User must declare the target class names in the strategy,`
			`under IStrategy.set_freqai_targets method.`

			`for example, in your strategy:`
improve pytorch classifier documentation 2023-03-20 16:39:50 +00:00			```
			`def set_freqai_targets(self, dataframe: DataFrame, metadata: Dict, **kwargs):`
			`self.freqai.class_names = ["down", "up"]`
			`dataframe['&s-up_or_down'] = np.where(dataframe["close"].shift(-100) >`
			`dataframe["close"], 'up', 'down')`

			`return dataframe`
add documentation 2023-03-09 09:14:54 +00:00			`"""`
update docs, improve the interaction with `define_data_pipeline` 2023-06-07 16:26:49 +00:00
initial commit 2023-03-05 14:59:24 +00:00			`def __init__(self, **kwargs):`
			`super().__init__(**kwargs)`
set class names in IStrategy.set_freqai_targets method, also save class name with model meta data 2023-03-08 16:36:44 +00:00			`self.class_name_to_index = None`
			`self.index_to_class_name = None`
initial commit 2023-03-05 14:59:24 +00:00
			`def predict(`
			`self, unfiltered_df: DataFrame, dk: FreqaiDataKitchen, **kwargs`
			`) -> Tuple[DataFrame, npt.NDArray[np.int_]]:`
			`"""`
			`Filter the prediction features data and predict with it.`
refactor(BasePyTorchClassifier.py): convert tensor to list before creating DataFrame to avoid TypeError. docs(BasePyTorchClassifier.py): add missing parameter description in predict method 2023-05-05 11:04:53 +00:00			`:param dk: dk: The datakitchen object`
initial commit 2023-03-05 14:59:24 +00:00			`:param unfiltered_df: Full dataframe for the current backtest period.`
			`:return:`
			`:pred_df: dataframe containing the predictions`
			`:do_predict: np.array of 1s and 0s to indicate places where freqai needed to remove`
			`data (NaNs) or felt uncertain about data (PCA and DI index)`
create children class to PyTorchClassifier to implement the fit method where we initialize the trainer and model objects 2023-03-19 12:38:49 +00:00			`:raises ValueError: if 'class_names' doesn't exist in model meta_data.`
initial commit 2023-03-05 14:59:24 +00:00			`"""`
improve documentation 2023-03-09 12:55:52 +00:00
set class names in IStrategy.set_freqai_targets method, also save class name with model meta data 2023-03-08 16:36:44 +00:00			`class_names = self.model.model_meta_data.get("class_names", None)`
			`if not class_names:`
			`raise ValueError(`
			`"Missing class names. "`
add class_name attribute to freqai interface 2023-03-20 18:38:43 +00:00			`"self.model.model_meta_data['class_names'] is None."`
set class names in IStrategy.set_freqai_targets method, also save class name with model meta data 2023-03-08 16:36:44 +00:00			`)`
create children class to PyTorchClassifier to implement the fit method where we initialize the trainer and model objects 2023-03-19 12:38:49 +00:00
			`if not self.class_name_to_index:`
			`self.init_class_names_to_index_mapping(class_names)`
initial commit 2023-03-05 14:59:24 +00:00
			`dk.find_features(unfiltered_df)`
			`filtered_df, _ = dk.filter_features(`
			`unfiltered_df, dk.training_features_list, training_filter=False`
			`)`
bring classifier/rl up to new paradigm. ensure tests pass. remove old code. add documentation, add new example transform 2023-05-29 11:33:29 +00:00
initial commit 2023-03-05 14:59:24 +00:00			`dk.data_dictionary["prediction_features"] = filtered_df`
bring classifier/rl up to new paradigm. ensure tests pass. remove old code. add documentation, add new example transform 2023-05-29 11:33:29 +00:00
			`dk.data_dictionary["prediction_features"], outliers, _ = dk.feature_pipeline.transform(`
			`dk.data_dictionary["prediction_features"], outlier_check=True)`

add pytorch data convertor 2023-04-03 12:19:10 +00:00			`x = self.data_convertor.convert_x(`
			`dk.data_dictionary["prediction_features"],`
			`device=self.device`
			`)`
add transformer with positional encoding, fix some odds and ends in pytorch, upgrade to PyTorch 2.0 2023-05-01 13:18:03 +00:00			`self.model.model.eval()`
create children class to PyTorchClassifier to implement the fit method where we initialize the trainer and model objects 2023-03-19 12:38:49 +00:00			`logits = self.model.model(x)`
initial commit 2023-03-05 14:59:24 +00:00			`probs = F.softmax(logits, dim=-1)`
ad multiclass target names encoder to ints 2023-03-08 12:29:38 +00:00			`predicted_classes = torch.argmax(probs, dim=-1)`
create children class to PyTorchClassifier to implement the fit method where we initialize the trainer and model objects 2023-03-19 12:38:49 +00:00			`predicted_classes_str = self.decode_class_names(predicted_classes)`
refactor(BasePyTorchClassifier.py): convert tensor to list before creating DataFrame to avoid TypeError. docs(BasePyTorchClassifier.py): add missing parameter description in predict method 2023-05-05 11:04:53 +00:00			`# used .tolist to convert probs into an iterable, in this way Tensors`
			`# are automatically moved to the CPU first if necessary.`
			`pred_df_prob = DataFrame(probs.detach().tolist(), columns=class_names)`
ad multiclass target names encoder to ints 2023-03-08 12:29:38 +00:00			`pred_df = DataFrame(predicted_classes_str, columns=[dk.label_list[0]])`
initial commit 2023-03-05 14:59:24 +00:00			`pred_df = pd.concat([pred_df, pred_df_prob], axis=1)`
bring classifier/rl up to new paradigm. ensure tests pass. remove old code. add documentation, add new example transform 2023-05-29 11:33:29 +00:00
Update BasePyTorchClassifier.py 2023-06-18 09:30:33 +00:00			`if dk.feature_pipeline["di"]:`
bring classifier/rl up to new paradigm. ensure tests pass. remove old code. add documentation, add new example transform 2023-05-29 11:33:29 +00:00			`dk.DI_values = dk.feature_pipeline["di"].di_values`
			`else:`
			`dk.DI_values = np.zeros(len(outliers.index))`
			`dk.do_predict = outliers.to_numpy()`

initial commit 2023-03-05 14:59:24 +00:00			`return (pred_df, dk.do_predict)`
ad multiclass target names encoder to ints 2023-03-08 12:29:38 +00:00
create children class to PyTorchClassifier to implement the fit method where we initialize the trainer and model objects 2023-03-19 12:38:49 +00:00			`def encode_class_names(`
			`self,`
			`data_dictionary: Dict[str, pd.DataFrame],`
			`dk: FreqaiDataKitchen,`
			`class_names: List[str],`
			`):`
ad multiclass target names encoder to ints 2023-03-08 12:29:38 +00:00			`"""`
create children class to PyTorchClassifier to implement the fit method where we initialize the trainer and model objects 2023-03-19 12:38:49 +00:00			`encode class name, str -> int`
			`assuming first column of *_labels data frame to be the target column`
			`containing the class names`
ad multiclass target names encoder to ints 2023-03-08 12:29:38 +00:00			`"""`
improve documentation 2023-03-09 12:55:52 +00:00
ad multiclass target names encoder to ints 2023-03-08 12:29:38 +00:00			`target_column_name = dk.label_list[0]`
bugfix skip test split when empty 2023-03-28 11:40:23 +00:00			`for split in self.splits:`
ad multiclass target names encoder to ints 2023-03-08 12:29:38 +00:00			`label_df = data_dictionary[f"{split}_labels"]`
create children class to PyTorchClassifier to implement the fit method where we initialize the trainer and model objects 2023-03-19 12:38:49 +00:00			`self.assert_valid_class_names(label_df[target_column_name], class_names)`
ad multiclass target names encoder to ints 2023-03-08 12:29:38 +00:00			`label_df[target_column_name] = list(`
			`map(lambda x: self.class_name_to_index[x], label_df[target_column_name])`
			`)`

create children class to PyTorchClassifier to implement the fit method where we initialize the trainer and model objects 2023-03-19 12:38:49 +00:00			`@staticmethod`
			`def assert_valid_class_names(`
			`target_column: pd.Series,`
			`class_names: List[str]`
			`):`
			`non_defined_labels = set(target_column) - set(class_names)`
ad multiclass target names encoder to ints 2023-03-08 12:29:38 +00:00			`if len(non_defined_labels) != 0:`
			`raise OperationalException(`
change documentation and small bugfix 2023-03-08 13:38:22 +00:00			`f"Found non defined labels: {non_defined_labels}, ",`
create children class to PyTorchClassifier to implement the fit method where we initialize the trainer and model objects 2023-03-19 12:38:49 +00:00			`f"expecting labels: {class_names}"`
ad multiclass target names encoder to ints 2023-03-08 12:29:38 +00:00			`)`

create children class to PyTorchClassifier to implement the fit method where we initialize the trainer and model objects 2023-03-19 12:38:49 +00:00			`def decode_class_names(self, class_ints: torch.Tensor) -> List[str]:`
change documentation and small bugfix 2023-03-08 13:38:22 +00:00			`"""`
create children class to PyTorchClassifier to implement the fit method where we initialize the trainer and model objects 2023-03-19 12:38:49 +00:00			`decode class name, int -> str`
change documentation and small bugfix 2023-03-08 13:38:22 +00:00			`"""`
improve documentation 2023-03-09 12:55:52 +00:00
create children class to PyTorchClassifier to implement the fit method where we initialize the trainer and model objects 2023-03-19 12:38:49 +00:00			`return list(map(lambda x: self.index_to_class_name[x.item()], class_ints))`
set class names in IStrategy.set_freqai_targets method, also save class name with model meta data 2023-03-08 16:36:44 +00:00
			`def init_class_names_to_index_mapping(self, class_names):`
			`self.class_name_to_index = {s: i for i, s in enumerate(class_names)}`
			`self.index_to_class_name = {i: s for i, s in enumerate(class_names)}`
create children class to PyTorchClassifier to implement the fit method where we initialize the trainer and model objects 2023-03-19 12:38:49 +00:00			`logger.info(f"encoded class name to index: {self.class_name_to_index}")`

			`def convert_label_column_to_int(`
			`self,`
			`data_dictionary: Dict[str, pd.DataFrame],`
			`dk: FreqaiDataKitchen,`
			`class_names: List[str]`
			`):`
			`self.init_class_names_to_index_mapping(class_names)`
			`self.encode_class_names(data_dictionary, dk, class_names)`

			`def get_class_names(self) -> List[str]:`
add class_name attribute to freqai interface 2023-03-20 18:38:43 +00:00			`if not self.class_names:`
create children class to PyTorchClassifier to implement the fit method where we initialize the trainer and model objects 2023-03-19 12:38:49 +00:00			`raise ValueError(`
add class_name attribute to freqai interface 2023-03-20 18:38:43 +00:00			`"self.class_names is empty, "`
improve pytorch classifier documentation 2023-03-20 16:39:50 +00:00			`"set self.freqai.class_names = ['class a', 'class b', 'class c'] "`
create children class to PyTorchClassifier to implement the fit method where we initialize the trainer and model objects 2023-03-19 12:38:49 +00:00			`"inside IStrategy.set_freqai_targets method."`
			`)`
add class_name attribute to freqai interface 2023-03-20 18:38:43 +00:00
create children class to PyTorchClassifier to implement the fit method where we initialize the trainer and model objects 2023-03-19 12:38:49 +00:00			`return self.class_names`
bring classifier/rl up to new paradigm. ensure tests pass. remove old code. add documentation, add new example transform 2023-05-29 11:33:29 +00:00
			`def train(`
			`self, unfiltered_df: DataFrame, pair: str, dk: FreqaiDataKitchen, **kwargs`
			`) -> Any:`
			`"""`
			`Filter the training data and train a model to it. Train makes heavy use of the datakitchen`
			`for storing, saving, loading, and analyzing the data.`
			`:param unfiltered_df: Full dataframe for the current training period`
			`:return:`
			`:model: Trained model which can be used to inference (self.predict)`
			`"""`

			`logger.info(f"-------------------- Starting training {pair} --------------------")`

			`start_time = time()`

			`features_filtered, labels_filtered = dk.filter_features(`
			`unfiltered_df,`
			`dk.training_features_list,`
			`dk.label_list,`
			`training_filter=True,`
			`)`

			`# split data into train/test data.`
update docs, improve the interaction with `define_data_pipeline` 2023-06-07 16:26:49 +00:00			`dd = dk.make_train_test_datasets(features_filtered, labels_filtered)`
			`if not self.freqai_info.get("fit_live_predictions_candles", 0) or not self.live:`
bring classifier/rl up to new paradigm. ensure tests pass. remove old code. add documentation, add new example transform 2023-05-29 11:33:29 +00:00			`dk.fit_labels()`

ensure data kitchen thread count is propagated to pipeline 2023-06-08 10:33:08 +00:00			`dk.feature_pipeline = self.define_data_pipeline(threads=dk.thread_count)`
bring classifier/rl up to new paradigm. ensure tests pass. remove old code. add documentation, add new example transform 2023-05-29 11:33:29 +00:00
update docs, improve the interaction with `define_data_pipeline` 2023-06-07 16:26:49 +00:00			`(dd["train_features"],`
			`dd["train_labels"],`
			`dd["train_weights"]) = dk.feature_pipeline.fit_transform(dd["train_features"],`
			`dd["train_labels"],`
			`dd["train_weights"])`
bring classifier/rl up to new paradigm. ensure tests pass. remove old code. add documentation, add new example transform 2023-05-29 11:33:29 +00:00
fix: ensure test_size=0 is still accommodated 2023-06-17 13:39:33 +00:00			`if self.freqai_info.get('data_split_parameters', {}).get('test_size', 0.1) != 0:`
			`(dd["test_features"],`
			`dd["test_labels"],`
			`dd["test_weights"]) = dk.feature_pipeline.transform(dd["test_features"],`
			`dd["test_labels"],`
			`dd["test_weights"])`
bring classifier/rl up to new paradigm. ensure tests pass. remove old code. add documentation, add new example transform 2023-05-29 11:33:29 +00:00
			`logger.info(`
			`f"Training model on {len(dk.data_dictionary['train_features'].columns)} features"`
			`)`
update docs, improve the interaction with `define_data_pipeline` 2023-06-07 16:26:49 +00:00			`logger.info(f"Training model on {len(dd['train_features'])} data points")`
bring classifier/rl up to new paradigm. ensure tests pass. remove old code. add documentation, add new example transform 2023-05-29 11:33:29 +00:00
update docs, improve the interaction with `define_data_pipeline` 2023-06-07 16:26:49 +00:00			`model = self.fit(dd, dk)`
bring classifier/rl up to new paradigm. ensure tests pass. remove old code. add documentation, add new example transform 2023-05-29 11:33:29 +00:00			`end_time = time()`

			`logger.info(f"-------------------- Done training {pair} "`
			`f"({end_time - start_time:.2f} secs) --------------------")`

			`return model`