mirror of https://github.com/freqtrade/freqtrade.git
synced 2024-11-10 10:21:59 +00:00

commit 31e19add27 (parent c23a045de4)

start transition toward outsourcing the data pipeline with objective of improving pipeline flexibility
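In broad strokes, the commit begins replacing FreqAI's hand-rolled normalization, PCA, SVM/DBSCAN outlier removal and Dissimilarity Index code with datasieve pipelines that are fitted once on the training split and then reused for the test split and at prediction time. The sketch below is illustrative only (toy data, made-up variable names); it uses the Pipeline calls that appear in the hunks that follow and is not code from the commit.

```python
# Illustrative sketch only (not part of the commit): toy data and names are invented;
# the Pipeline calls mirror the ones this commit introduces in the hunks below.
import numpy as np
import pandas as pd
from datasieve.pipeline import Pipeline
import datasieve.transforms as ds

train_X = pd.DataFrame(np.random.rand(200, 4), columns=list("abcd"))
train_y = pd.DataFrame(np.random.rand(200, 1), columns=["&-target"])
train_w = np.ones(len(train_X))

# Feature and label pipelines replace the old normalize_data()/denormalize_* helpers.
feature_pipeline = Pipeline([("scaler", ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))])
label_pipeline = Pipeline([("scaler", ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))])

# Fit on the training split only; the fitted pipelines are then reused for the test
# split and at prediction time (transform / inverse_transform, shown further down).
train_X, train_y, train_w = feature_pipeline.fit_transform(train_X, train_y, train_w)
train_y, _, _ = label_pipeline.fit_transform(train_y)
```

Because the pipelines are fitted only on the train split, test and prediction data end up scaled with the training statistics, which is what normalize_data()/normalize_data_from_metadata() previously did by hand.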
@@ -7,14 +7,15 @@ import torch
from pandas import DataFrame

from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
from freqtrade.freqai.freqai_interface import IFreqaiModel
# from freqtrade.freqai.freqai_interface import IFreqaiModel
from freqtrade.freqai.base_models import BaseRegressionModel
from freqtrade.freqai.torch.PyTorchDataConvertor import PyTorchDataConvertor


logger = logging.getLogger(__name__)


class BasePyTorchModel(IFreqaiModel, ABC):
class BasePyTorchModel(BaseRegressionModel):
    """
    Base class for PyTorch type models.
    User *must* inherit from this class and set fit() and predict() and

@@ -29,50 +30,50 @@ class BasePyTorchModel(IFreqaiModel, ABC):
        self.splits = ["train", "test"] if test_size != 0 else ["train"]
        self.window_size = self.freqai_info.get("conv_width", 1)

    def train(
        self, unfiltered_df: DataFrame, pair: str, dk: FreqaiDataKitchen, **kwargs
    ) -> Any:
        """
        Filter the training data and train a model to it. Train makes heavy use of the datakitchen
        for storing, saving, loading, and analyzing the data.
        :param unfiltered_df: Full dataframe for the current training period
        :return:
        :model: Trained model which can be used to inference (self.predict)
        """
    # def train(
    #     self, unfiltered_df: DataFrame, pair: str, dk: FreqaiDataKitchen, **kwargs
    # ) -> Any:
    #     """
    #     Filter the training data and train a model to it. Train makes heavy use of the datakitchen
    #     for storing, saving, loading, and analyzing the data.
    #     :param unfiltered_df: Full dataframe for the current training period
    #     :return:
    #     :model: Trained model which can be used to inference (self.predict)
    #     """

        logger.info(f"-------------------- Starting training {pair} --------------------")
    #     logger.info(f"-------------------- Starting training {pair} --------------------")

        start_time = time()
    #     start_time = time()

        features_filtered, labels_filtered = dk.filter_features(
            unfiltered_df,
            dk.training_features_list,
            dk.label_list,
            training_filter=True,
        )
    #     features_filtered, labels_filtered = dk.filter_features(
    #         unfiltered_df,
    #         dk.training_features_list,
    #         dk.label_list,
    #         training_filter=True,
    #     )

        # split data into train/test data.
        data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered)
        if not self.freqai_info.get("fit_live_predictions", 0) or not self.live:
            dk.fit_labels()
        # normalize all data based on train_dataset only
        data_dictionary = dk.normalize_data(data_dictionary)
    #     # split data into train/test data.
    #     data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered)
    #     if not self.freqai_info.get("fit_live_predictions", 0) or not self.live:
    #         dk.fit_labels()
    #     # normalize all data based on train_dataset only
    #     data_dictionary = dk.normalize_data(data_dictionary)

        # optional additional data cleaning/analysis
        self.data_cleaning_train(dk)
    #     # optional additional data cleaning/analysis
    #     self.data_cleaning_train(dk)

        logger.info(
            f"Training model on {len(dk.data_dictionary['train_features'].columns)} features"
        )
        logger.info(f"Training model on {len(data_dictionary['train_features'])} data points")
    #     logger.info(
    #         f"Training model on {len(dk.data_dictionary['train_features'].columns)} features"
    #     )
    #     logger.info(f"Training model on {len(data_dictionary['train_features'])} data points")

        model = self.fit(data_dictionary, dk)
        end_time = time()
    #     model = self.fit(data_dictionary, dk)
    #     end_time = time()

        logger.info(f"-------------------- Done training {pair} "
                    f"({end_time - start_time:.2f} secs) --------------------")
    #     logger.info(f"-------------------- Done training {pair} "
    #                 f"({end_time - start_time:.2f} secs) --------------------")

        return model
    #     return model

    @property
    @abstractmethod
@@ -49,21 +49,34 @@ class BaseRegressionModel(IFreqaiModel):
        logger.info(f"-------------------- Training on data from {start_date} to "
                    f"{end_date} --------------------")
        # split data into train/test data.
        data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered)
        d = dk.make_train_test_datasets(features_filtered, labels_filtered)
        if not self.freqai_info.get("fit_live_predictions_candles", 0) or not self.live:
            dk.fit_labels()
        # normalize all data based on train_dataset only
        data_dictionary = dk.normalize_data(data_dictionary)

        # optional additional data cleaning/analysis
        self.data_cleaning_train(dk)
        self.define_data_pipeline(dk)
        self.define_label_pipeline(dk)

        d["train_labels"], _, _ = dk.label_pipeline.fit_transform(d["train_labels"])
        d["test_labels"], _, _ = dk.label_pipeline.transform(d["test_labels"])

        (d["train_features"],
         d["train_labels"],
         d["train_weights"]) = dk.pipeline.fit_transform(d["train_features"],
                                                         d["train_labels"],
                                                         d["train_weights"])

        (d["test_features"],
         d["test_labels"],
         d["test_weights"]) = dk.pipeline.transform(d["test_features"],
                                                    d["test_labels"],
                                                    d["test_weights"])

        logger.info(
            f"Training model on {len(dk.data_dictionary['train_features'].columns)} features"
        )
        logger.info(f"Training model on {len(data_dictionary['train_features'])} data points")
        logger.info(f"Training model on {len(d['train_features'])} data points")

        model = self.fit(data_dictionary, dk)
        model = self.fit(d, dk)

        end_time = time()

@@ -88,11 +101,11 @@ class BaseRegressionModel(IFreqaiModel):
        filtered_df, _ = dk.filter_features(
            unfiltered_df, dk.training_features_list, training_filter=False
        )
        filtered_df = dk.normalize_data_from_metadata(filtered_df)
        # filtered_df = dk.normalize_data_from_metadata(filtered_df)
        dk.data_dictionary["prediction_features"] = filtered_df

        # optional additional data cleaning/analysis
        self.data_cleaning_predict(dk)
        dk.data_dictionary["prediction_features"], outliers, _ = dk.pipeline.transform(
            dk.data_dictionary["prediction_features"], outlier_check=True)

        predictions = self.model.predict(dk.data_dictionary["prediction_features"])
        if self.CONV_WIDTH == 1:

@@ -100,6 +113,8 @@ class BaseRegressionModel(IFreqaiModel):

        pred_df = DataFrame(predictions, columns=dk.label_list)

        pred_df = dk.denormalize_labels_from_metadata(pred_df)
        pred_df, _, _ = dk.label_pipeline.inverse_transform(pred_df)
        dk.DI_values = dk.label_pipeline.get_step("di").di_values
        dk.do_predict = outliers.to_numpy()

        return (pred_df, dk.do_predict)
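Condensed into a standalone sketch, the predict-side pattern introduced above looks roughly like this (toy data and a dummy stand-in for the model; only the transform/inverse_transform/outlier-mask calls are taken from the diff):

```python
import numpy as np
import pandas as pd
from datasieve.pipeline import Pipeline
import datasieve.transforms as ds

label_list = ["&-target"]
train_X = pd.DataFrame(np.random.rand(100, 3), columns=list("abc"))
train_y = pd.DataFrame(np.random.rand(100, 1), columns=label_list)
pred_X = pd.DataFrame(np.random.rand(10, 3), columns=list("abc"))

pipeline = Pipeline([("scaler", ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))])
label_pipeline = Pipeline([("scaler", ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))])
train_X, _, _ = pipeline.fit_transform(train_X)
train_y, _, _ = label_pipeline.fit_transform(train_y)

# 1) transform incoming features with the already-fitted pipeline, flagging outliers
pred_X, outliers, _ = pipeline.transform(pred_X, outlier_check=True)

# 2) a real model would predict in the normalized label space here; this dummy
#    prediction just reuses the scaled training mean so the example stays self-contained
pred_df = pd.DataFrame(np.full((len(pred_X), 1), train_y.to_numpy().mean()), columns=label_list)

# 3) map predictions back to the original label scale and build the do_predict mask
pred_df, _, _ = label_pipeline.inverse_transform(pred_df)
do_predict = outliers.to_numpy()
```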
@@ -1,70 +0,0 @@
import logging
from time import time
from typing import Any

from pandas import DataFrame

from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
from freqtrade.freqai.freqai_interface import IFreqaiModel


logger = logging.getLogger(__name__)


class BaseTensorFlowModel(IFreqaiModel):
    """
    Base class for TensorFlow type models.
    User *must* inherit from this class and set fit() and predict().
    """

    def train(
        self, unfiltered_df: DataFrame, pair: str, dk: FreqaiDataKitchen, **kwargs
    ) -> Any:
        """
        Filter the training data and train a model to it. Train makes heavy use of the datakitchen
        for storing, saving, loading, and analyzing the data.
        :param unfiltered_df: Full dataframe for the current training period
        :param metadata: pair metadata from strategy.
        :return:
        :model: Trained model which can be used to inference (self.predict)
        """

        logger.info(f"-------------------- Starting training {pair} --------------------")

        start_time = time()

        # filter the features requested by user in the configuration file and elegantly handle NaNs
        features_filtered, labels_filtered = dk.filter_features(
            unfiltered_df,
            dk.training_features_list,
            dk.label_list,
            training_filter=True,
        )

        start_date = unfiltered_df["date"].iloc[0].strftime("%Y-%m-%d")
        end_date = unfiltered_df["date"].iloc[-1].strftime("%Y-%m-%d")
        logger.info(f"-------------------- Training on data from {start_date} to "
                    f"{end_date} --------------------")
        # split data into train/test data.
        data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered)
        if not self.freqai_info.get("fit_live_predictions_candles", 0) or not self.live:
            dk.fit_labels()
        # normalize all data based on train_dataset only
        data_dictionary = dk.normalize_data(data_dictionary)

        # optional additional data cleaning/analysis
        self.data_cleaning_train(dk)

        logger.info(
            f"Training model on {len(dk.data_dictionary['train_features'].columns)} features"
        )
        logger.info(f"Training model on {len(data_dictionary['train_features'])} data points")

        model = self.fit(data_dictionary, dk)

        end_time = time()

        logger.info(f"-------------------- Done training {pair} "
                    f"({end_time - start_time:.2f} secs) --------------------")

        return model
@@ -460,6 +460,13 @@ class FreqaiDataDrawer:
        with (save_path / f"{dk.model_filename}_metadata.json").open("w") as fp:
            rapidjson.dump(dk.data, fp, default=self.np_encoder, number_mode=rapidjson.NM_NATIVE)

        # save the pipelines to pickle files
        with (save_path / f"{dk.model_filename}_pipeline.pkl").open("wb") as fp:
            cloudpickle.dump(dk.pipeline, fp)

        with (save_path / f"{dk.model_filename}_label_pipeline.pkl").open("wb") as fp:
            cloudpickle.dump(dk.label_pipeline, fp)

        # save the train data to file so we can check preds for area of applicability later
        dk.data_dictionary["train_features"].to_pickle(
            save_path / f"{dk.model_filename}_trained_df.pkl"

@@ -482,6 +489,8 @@ class FreqaiDataDrawer:
        self.meta_data_dictionary[coin] = {}
        self.meta_data_dictionary[coin]["train_df"] = dk.data_dictionary["train_features"]
        self.meta_data_dictionary[coin]["meta_data"] = dk.data
        self.meta_data_dictionary[coin]["pipeline"] = dk.pipeline
        self.meta_data_dictionary[coin]["label_pipeline"] = dk.label_pipeline
        self.save_drawer_to_disk()

        return

@@ -513,6 +522,8 @@ class FreqaiDataDrawer:
        if coin in self.meta_data_dictionary:
            dk.data = self.meta_data_dictionary[coin]["meta_data"]
            dk.data_dictionary["train_features"] = self.meta_data_dictionary[coin]["train_df"]
            dk.pipeline = self.meta_data_dictionary[coin]["pipeline"]
            dk.label_pipeline = self.meta_data_dictionary[coin]["label_pipeline"]
        else:
            with (dk.data_path / f"{dk.model_filename}_metadata.json").open("r") as fp:
                dk.data = rapidjson.load(fp, number_mode=rapidjson.NM_NATIVE)

@@ -520,6 +531,10 @@ class FreqaiDataDrawer:
            dk.data_dictionary["train_features"] = pd.read_pickle(
                dk.data_path / f"{dk.model_filename}_trained_df.pkl"
            )
            with (dk.data_path / f"{dk.model_filename}_pipeline.pkl").open("rb") as fp:
                dk.pipeline = cloudpickle.load(fp)
            with (dk.data_path / f"{dk.model_filename}_label_pipeline.pkl").open("rb") as fp:
                dk.label_pipeline = cloudpickle.load(fp)

        dk.training_features_list = dk.data["training_features_list"]
        dk.label_list = dk.data["label_list"]
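The persistence change in isolation: the fitted pipelines are serialized next to the model with cloudpickle and read back the same way. Paths and filenames below are hypothetical; the pattern mirrors the FreqaiDataDrawer hunks above.

```python
import cloudpickle
from pathlib import Path
from datasieve.pipeline import Pipeline
import datasieve.transforms as ds

# Hypothetical location/name; the drawer uses save_path / f"{dk.model_filename}_pipeline.pkl".
save_path = Path("user_data/models/example")
save_path.mkdir(parents=True, exist_ok=True)
pipeline = Pipeline([("scaler", ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))])

with (save_path / "example_pipeline.pkl").open("wb") as fp:
    cloudpickle.dump(pipeline, fp)              # persist the (ideally fitted) pipeline

with (save_path / "example_pipeline.pkl").open("rb") as fp:
    restored_pipeline = cloudpickle.load(fp)    # reload for prediction or retraining
```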
@@ -27,6 +27,7 @@ from freqtrade.exceptions import OperationalException
from freqtrade.exchange import timeframe_to_seconds
from freqtrade.strategy import merge_informative_pair
from freqtrade.strategy.interface import IStrategy
from datasieve.pipeline import Pipeline


SECONDS_IN_DAY = 86400

@@ -86,6 +87,8 @@ class FreqaiDataKitchen:
        self.keras: bool = self.freqai_config.get("keras", False)
        self.set_all_pairs()
        self.backtest_live_models = config.get("freqai_backtest_live_models", False)
        self.pipeline = Pipeline()
        self.label_pipeline = Pipeline()

        if not self.live:
            self.full_path = self.get_full_models_path(self.config)
@ -307,106 +310,106 @@ class FreqaiDataKitchen:
|
|||
|
||||
return self.data_dictionary
|
||||
|
||||
def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
|
||||
"""
|
||||
Normalize all data in the data_dictionary according to the training dataset
|
||||
:param data_dictionary: dictionary containing the cleaned and
|
||||
split training/test data/labels
|
||||
:returns:
|
||||
:data_dictionary: updated dictionary with standardized values.
|
||||
"""
|
||||
# def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
|
||||
# """
|
||||
# Normalize all data in the data_dictionary according to the training dataset
|
||||
# :param data_dictionary: dictionary containing the cleaned and
|
||||
# split training/test data/labels
|
||||
# :returns:
|
||||
# :data_dictionary: updated dictionary with standardized values.
|
||||
# """
|
||||
|
||||
# standardize the data by training stats
|
||||
train_max = data_dictionary["train_features"].max()
|
||||
train_min = data_dictionary["train_features"].min()
|
||||
data_dictionary["train_features"] = (
|
||||
2 * (data_dictionary["train_features"] - train_min) / (train_max - train_min) - 1
|
||||
)
|
||||
data_dictionary["test_features"] = (
|
||||
2 * (data_dictionary["test_features"] - train_min) / (train_max - train_min) - 1
|
||||
)
|
||||
# # standardize the data by training stats
|
||||
# train_max = data_dictionary["train_features"].max()
|
||||
# train_min = data_dictionary["train_features"].min()
|
||||
# data_dictionary["train_features"] = (
|
||||
# 2 * (data_dictionary["train_features"] - train_min) / (train_max - train_min) - 1
|
||||
# )
|
||||
# data_dictionary["test_features"] = (
|
||||
# 2 * (data_dictionary["test_features"] - train_min) / (train_max - train_min) - 1
|
||||
# )
|
||||
|
||||
for item in train_max.keys():
|
||||
self.data[item + "_max"] = train_max[item]
|
||||
self.data[item + "_min"] = train_min[item]
|
||||
# for item in train_max.keys():
|
||||
# self.data[item + "_max"] = train_max[item]
|
||||
# self.data[item + "_min"] = train_min[item]
|
||||
|
||||
for item in data_dictionary["train_labels"].keys():
|
||||
if data_dictionary["train_labels"][item].dtype == object:
|
||||
continue
|
||||
train_labels_max = data_dictionary["train_labels"][item].max()
|
||||
train_labels_min = data_dictionary["train_labels"][item].min()
|
||||
data_dictionary["train_labels"][item] = (
|
||||
2
|
||||
* (data_dictionary["train_labels"][item] - train_labels_min)
|
||||
/ (train_labels_max - train_labels_min)
|
||||
- 1
|
||||
)
|
||||
if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0:
|
||||
data_dictionary["test_labels"][item] = (
|
||||
2
|
||||
* (data_dictionary["test_labels"][item] - train_labels_min)
|
||||
/ (train_labels_max - train_labels_min)
|
||||
- 1
|
||||
)
|
||||
# for item in data_dictionary["train_labels"].keys():
|
||||
# if data_dictionary["train_labels"][item].dtype == object:
|
||||
# continue
|
||||
# train_labels_max = data_dictionary["train_labels"][item].max()
|
||||
# train_labels_min = data_dictionary["train_labels"][item].min()
|
||||
# data_dictionary["train_labels"][item] = (
|
||||
# 2
|
||||
# * (data_dictionary["train_labels"][item] - train_labels_min)
|
||||
# / (train_labels_max - train_labels_min)
|
||||
# - 1
|
||||
# )
|
||||
# if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0:
|
||||
# data_dictionary["test_labels"][item] = (
|
||||
# 2
|
||||
# * (data_dictionary["test_labels"][item] - train_labels_min)
|
||||
# / (train_labels_max - train_labels_min)
|
||||
# - 1
|
||||
# )
|
||||
|
||||
self.data[f"{item}_max"] = train_labels_max
|
||||
self.data[f"{item}_min"] = train_labels_min
|
||||
return data_dictionary
|
||||
# self.data[f"{item}_max"] = train_labels_max
|
||||
# self.data[f"{item}_min"] = train_labels_min
|
||||
# return data_dictionary
|
||||
|
||||
def normalize_single_dataframe(self, df: DataFrame) -> DataFrame:
|
||||
# def normalize_single_dataframe(self, df: DataFrame) -> DataFrame:
|
||||
|
||||
train_max = df.max()
|
||||
train_min = df.min()
|
||||
df = (
|
||||
2 * (df - train_min) / (train_max - train_min) - 1
|
||||
)
|
||||
# train_max = df.max()
|
||||
# train_min = df.min()
|
||||
# df = (
|
||||
# 2 * (df - train_min) / (train_max - train_min) - 1
|
||||
# )
|
||||
|
||||
for item in train_max.keys():
|
||||
self.data[item + "_max"] = train_max[item]
|
||||
self.data[item + "_min"] = train_min[item]
|
||||
# for item in train_max.keys():
|
||||
# self.data[item + "_max"] = train_max[item]
|
||||
# self.data[item + "_min"] = train_min[item]
|
||||
|
||||
return df
|
||||
# return df
|
||||
|
||||
def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame:
|
||||
"""
|
||||
Normalize a set of data using the mean and standard deviation from
|
||||
the associated training data.
|
||||
:param df: Dataframe to be standardized
|
||||
"""
|
||||
# def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame:
|
||||
# """
|
||||
# Normalize a set of data using the mean and standard deviation from
|
||||
# the associated training data.
|
||||
# :param df: Dataframe to be standardized
|
||||
# """
|
||||
|
||||
train_max = [None] * len(df.keys())
|
||||
train_min = [None] * len(df.keys())
|
||||
# train_max = [None] * len(df.keys())
|
||||
# train_min = [None] * len(df.keys())
|
||||
|
||||
for i, item in enumerate(df.keys()):
|
||||
train_max[i] = self.data[f"{item}_max"]
|
||||
train_min[i] = self.data[f"{item}_min"]
|
||||
# for i, item in enumerate(df.keys()):
|
||||
# train_max[i] = self.data[f"{item}_max"]
|
||||
# train_min[i] = self.data[f"{item}_min"]
|
||||
|
||||
train_max_series = pd.Series(train_max, index=df.keys())
|
||||
train_min_series = pd.Series(train_min, index=df.keys())
|
||||
# train_max_series = pd.Series(train_max, index=df.keys())
|
||||
# train_min_series = pd.Series(train_min, index=df.keys())
|
||||
|
||||
df = (
|
||||
2 * (df - train_min_series) / (train_max_series - train_min_series) - 1
|
||||
)
|
||||
# df = (
|
||||
# 2 * (df - train_min_series) / (train_max_series - train_min_series) - 1
|
||||
# )
|
||||
|
||||
return df
|
||||
# return df
|
||||
|
||||
def denormalize_labels_from_metadata(self, df: DataFrame) -> DataFrame:
|
||||
"""
|
||||
Denormalize a set of data using the mean and standard deviation from
|
||||
the associated training data.
|
||||
:param df: Dataframe of predictions to be denormalized
|
||||
"""
|
||||
# def denormalize_labels_from_metadata(self, df: DataFrame) -> DataFrame:
|
||||
# """
|
||||
# Denormalize a set of data using the mean and standard deviation from
|
||||
# the associated training data.
|
||||
# :param df: Dataframe of predictions to be denormalized
|
||||
# """
|
||||
|
||||
for label in df.columns:
|
||||
if df[label].dtype == object or label in self.unique_class_list:
|
||||
continue
|
||||
df[label] = (
|
||||
(df[label] + 1)
|
||||
* (self.data[f"{label}_max"] - self.data[f"{label}_min"])
|
||||
/ 2
|
||||
) + self.data[f"{label}_min"]
|
||||
# for label in df.columns:
|
||||
# if df[label].dtype == object or label in self.unique_class_list:
|
||||
# continue
|
||||
# df[label] = (
|
||||
# (df[label] + 1)
|
||||
# * (self.data[f"{label}_max"] - self.data[f"{label}_min"])
|
||||
# / 2
|
||||
# ) + self.data[f"{label}_min"]
|
||||
|
||||
return df
|
||||
# return df
|
||||
|
||||
def split_timerange(
|
||||
self, tr: str, train_split: int = 28, bt_split: float = 7
|
||||
|
@ -501,398 +504,398 @@ class FreqaiDataKitchen:
|
|||
|
||||
return df_predictions
|
||||
|
||||
def principal_component_analysis(self) -> None:
|
||||
"""
|
||||
Performs Principal Component Analysis on the data for dimensionality reduction
|
||||
and outlier detection (see self.remove_outliers())
|
||||
No parameters or returns, it acts on the data_dictionary held by the DataHandler.
|
||||
"""
|
||||
# def principal_component_analysis(self) -> None:
|
||||
# """
|
||||
# Performs Principal Component Analysis on the data for dimensionality reduction
|
||||
# and outlier detection (see self.remove_outliers())
|
||||
# No parameters or returns, it acts on the data_dictionary held by the DataHandler.
|
||||
# """
|
||||
|
||||
from sklearn.decomposition import PCA # avoid importing if we dont need it
|
||||
# from sklearn.decomposition import PCA # avoid importing if we dont need it
|
||||
|
||||
pca = PCA(0.999)
|
||||
pca = pca.fit(self.data_dictionary["train_features"])
|
||||
n_keep_components = pca.n_components_
|
||||
self.data["n_kept_components"] = n_keep_components
|
||||
n_components = self.data_dictionary["train_features"].shape[1]
|
||||
logger.info("reduced feature dimension by %s", n_components - n_keep_components)
|
||||
logger.info("explained variance %f", np.sum(pca.explained_variance_ratio_))
|
||||
# pca = PCA(0.999)
|
||||
# pca = pca.fit(self.data_dictionary["train_features"])
|
||||
# n_keep_components = pca.n_components_
|
||||
# self.data["n_kept_components"] = n_keep_components
|
||||
# n_components = self.data_dictionary["train_features"].shape[1]
|
||||
# logger.info("reduced feature dimension by %s", n_components - n_keep_components)
|
||||
# logger.info("explained variance %f", np.sum(pca.explained_variance_ratio_))
|
||||
|
||||
train_components = pca.transform(self.data_dictionary["train_features"])
|
||||
self.data_dictionary["train_features"] = pd.DataFrame(
|
||||
data=train_components,
|
||||
columns=["PC" + str(i) for i in range(0, n_keep_components)],
|
||||
index=self.data_dictionary["train_features"].index,
|
||||
)
|
||||
# normalsing transformed training features
|
||||
self.data_dictionary["train_features"] = self.normalize_single_dataframe(
|
||||
self.data_dictionary["train_features"])
|
||||
# train_components = pca.transform(self.data_dictionary["train_features"])
|
||||
# self.data_dictionary["train_features"] = pd.DataFrame(
|
||||
# data=train_components,
|
||||
# columns=["PC" + str(i) for i in range(0, n_keep_components)],
|
||||
# index=self.data_dictionary["train_features"].index,
|
||||
# )
|
||||
# # normalsing transformed training features
|
||||
# self.data_dictionary["train_features"] = self.normalize_single_dataframe(
|
||||
# self.data_dictionary["train_features"])
|
||||
|
||||
# keeping a copy of the non-transformed features so we can check for errors during
|
||||
# model load from disk
|
||||
self.data["training_features_list_raw"] = copy.deepcopy(self.training_features_list)
|
||||
self.training_features_list = self.data_dictionary["train_features"].columns
|
||||
# # keeping a copy of the non-transformed features so we can check for errors during
|
||||
# # model load from disk
|
||||
# self.data["training_features_list_raw"] = copy.deepcopy(self.training_features_list)
|
||||
# self.training_features_list = self.data_dictionary["train_features"].columns
|
||||
|
||||
if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0:
|
||||
test_components = pca.transform(self.data_dictionary["test_features"])
|
||||
self.data_dictionary["test_features"] = pd.DataFrame(
|
||||
data=test_components,
|
||||
columns=["PC" + str(i) for i in range(0, n_keep_components)],
|
||||
index=self.data_dictionary["test_features"].index,
|
||||
)
|
||||
# normalise transformed test feature to transformed training features
|
||||
self.data_dictionary["test_features"] = self.normalize_data_from_metadata(
|
||||
self.data_dictionary["test_features"])
|
||||
# if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0:
|
||||
# test_components = pca.transform(self.data_dictionary["test_features"])
|
||||
# self.data_dictionary["test_features"] = pd.DataFrame(
|
||||
# data=test_components,
|
||||
# columns=["PC" + str(i) for i in range(0, n_keep_components)],
|
||||
# index=self.data_dictionary["test_features"].index,
|
||||
# )
|
||||
# # normalise transformed test feature to transformed training features
|
||||
# self.data_dictionary["test_features"] = self.normalize_data_from_metadata(
|
||||
# self.data_dictionary["test_features"])
|
||||
|
||||
self.data["n_kept_components"] = n_keep_components
|
||||
self.pca = pca
|
||||
# self.data["n_kept_components"] = n_keep_components
|
||||
# self.pca = pca
|
||||
|
||||
logger.info(f"PCA reduced total features from {n_components} to {n_keep_components}")
|
||||
# logger.info(f"PCA reduced total features from {n_components} to {n_keep_components}")
|
||||
|
||||
if not self.data_path.is_dir():
|
||||
self.data_path.mkdir(parents=True, exist_ok=True)
|
||||
# if not self.data_path.is_dir():
|
||||
# self.data_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
return None
|
||||
# return None
|
||||
|
||||
def pca_transform(self, filtered_dataframe: DataFrame) -> None:
|
||||
"""
|
||||
Use an existing pca transform to transform data into components
|
||||
:param filtered_dataframe: DataFrame = the cleaned dataframe
|
||||
"""
|
||||
pca_components = self.pca.transform(filtered_dataframe)
|
||||
self.data_dictionary["prediction_features"] = pd.DataFrame(
|
||||
data=pca_components,
|
||||
columns=["PC" + str(i) for i in range(0, self.data["n_kept_components"])],
|
||||
index=filtered_dataframe.index,
|
||||
)
|
||||
# normalise transformed predictions to transformed training features
|
||||
self.data_dictionary["prediction_features"] = self.normalize_data_from_metadata(
|
||||
self.data_dictionary["prediction_features"])
|
||||
# def pca_transform(self, filtered_dataframe: DataFrame) -> None:
|
||||
# """
|
||||
# Use an existing pca transform to transform data into components
|
||||
# :param filtered_dataframe: DataFrame = the cleaned dataframe
|
||||
# """
|
||||
# pca_components = self.pca.transform(filtered_dataframe)
|
||||
# self.data_dictionary["prediction_features"] = pd.DataFrame(
|
||||
# data=pca_components,
|
||||
# columns=["PC" + str(i) for i in range(0, self.data["n_kept_components"])],
|
||||
# index=filtered_dataframe.index,
|
||||
# )
|
||||
# # normalise transformed predictions to transformed training features
|
||||
# self.data_dictionary["prediction_features"] = self.normalize_data_from_metadata(
|
||||
# self.data_dictionary["prediction_features"])
|
||||
|
||||
def compute_distances(self) -> float:
|
||||
"""
|
||||
Compute distances between each training point and every other training
|
||||
point. This metric defines the neighborhood of trained data and is used
|
||||
for prediction confidence in the Dissimilarity Index
|
||||
"""
|
||||
# logger.info("computing average mean distance for all training points")
|
||||
pairwise = pairwise_distances(
|
||||
self.data_dictionary["train_features"], n_jobs=self.thread_count)
|
||||
# remove the diagonal distances which are itself distances ~0
|
||||
np.fill_diagonal(pairwise, np.NaN)
|
||||
pairwise = pairwise.reshape(-1, 1)
|
||||
avg_mean_dist = pairwise[~np.isnan(pairwise)].mean()
|
||||
# def compute_distances(self) -> float:
|
||||
# """
|
||||
# Compute distances between each training point and every other training
|
||||
# point. This metric defines the neighborhood of trained data and is used
|
||||
# for prediction confidence in the Dissimilarity Index
|
||||
# """
|
||||
# # logger.info("computing average mean distance for all training points")
|
||||
# pairwise = pairwise_distances(
|
||||
# self.data_dictionary["train_features"], n_jobs=self.thread_count)
|
||||
# # remove the diagonal distances which are itself distances ~0
|
||||
# np.fill_diagonal(pairwise, np.NaN)
|
||||
# pairwise = pairwise.reshape(-1, 1)
|
||||
# avg_mean_dist = pairwise[~np.isnan(pairwise)].mean()
|
||||
|
||||
return avg_mean_dist
|
||||
# return avg_mean_dist
|
||||
|
||||
def get_outlier_percentage(self, dropped_pts: npt.NDArray) -> float:
|
||||
"""
|
||||
Check if more than X% of points werer dropped during outlier detection.
|
||||
"""
|
||||
outlier_protection_pct = self.freqai_config["feature_parameters"].get(
|
||||
"outlier_protection_percentage", 30)
|
||||
outlier_pct = (dropped_pts.sum() / len(dropped_pts)) * 100
|
||||
if outlier_pct >= outlier_protection_pct:
|
||||
return outlier_pct
|
||||
else:
|
||||
return 0.0
|
||||
# def get_outlier_percentage(self, dropped_pts: npt.NDArray) -> float:
|
||||
# """
|
||||
# Check if more than X% of points werer dropped during outlier detection.
|
||||
# """
|
||||
# outlier_protection_pct = self.freqai_config["feature_parameters"].get(
|
||||
# "outlier_protection_percentage", 30)
|
||||
# outlier_pct = (dropped_pts.sum() / len(dropped_pts)) * 100
|
||||
# if outlier_pct >= outlier_protection_pct:
|
||||
# return outlier_pct
|
||||
# else:
|
||||
# return 0.0
|
||||
|
||||
def use_SVM_to_remove_outliers(self, predict: bool) -> None:
|
||||
"""
|
||||
Build/inference a Support Vector Machine to detect outliers
|
||||
in training data and prediction
|
||||
:param predict: bool = If true, inference an existing SVM model, else construct one
|
||||
"""
|
||||
# def use_SVM_to_remove_outliers(self, predict: bool) -> None:
|
||||
# """
|
||||
# Build/inference a Support Vector Machine to detect outliers
|
||||
# in training data and prediction
|
||||
# :param predict: bool = If true, inference an existing SVM model, else construct one
|
||||
# """
|
||||
|
||||
if self.keras:
|
||||
logger.warning(
|
||||
"SVM outlier removal not currently supported for Keras based models. "
|
||||
"Skipping user requested function."
|
||||
)
|
||||
if predict:
|
||||
self.do_predict = np.ones(len(self.data_dictionary["prediction_features"]))
|
||||
return
|
||||
# if self.keras:
|
||||
# logger.warning(
|
||||
# "SVM outlier removal not currently supported for Keras based models. "
|
||||
# "Skipping user requested function."
|
||||
# )
|
||||
# if predict:
|
||||
# self.do_predict = np.ones(len(self.data_dictionary["prediction_features"]))
|
||||
# return
|
||||
|
||||
if predict:
|
||||
if not self.svm_model:
|
||||
logger.warning("No svm model available for outlier removal")
|
||||
return
|
||||
y_pred = self.svm_model.predict(self.data_dictionary["prediction_features"])
|
||||
do_predict = np.where(y_pred == -1, 0, y_pred)
|
||||
# if predict:
|
||||
# if not self.svm_model:
|
||||
# logger.warning("No svm model available for outlier removal")
|
||||
# return
|
||||
# y_pred = self.svm_model.predict(self.data_dictionary["prediction_features"])
|
||||
# do_predict = np.where(y_pred == -1, 0, y_pred)
|
||||
|
||||
if (len(do_predict) - do_predict.sum()) > 0:
|
||||
logger.info(f"SVM tossed {len(do_predict) - do_predict.sum()} predictions.")
|
||||
self.do_predict += do_predict
|
||||
self.do_predict -= 1
|
||||
# if (len(do_predict) - do_predict.sum()) > 0:
|
||||
# logger.info(f"SVM tossed {len(do_predict) - do_predict.sum()} predictions.")
|
||||
# self.do_predict += do_predict
|
||||
# self.do_predict -= 1
|
||||
|
||||
else:
|
||||
# use SGDOneClassSVM to increase speed?
|
||||
svm_params = self.freqai_config["feature_parameters"].get(
|
||||
"svm_params", {"shuffle": False, "nu": 0.1})
|
||||
self.svm_model = linear_model.SGDOneClassSVM(**svm_params).fit(
|
||||
self.data_dictionary["train_features"]
|
||||
)
|
||||
y_pred = self.svm_model.predict(self.data_dictionary["train_features"])
|
||||
kept_points = np.where(y_pred == -1, 0, y_pred)
|
||||
# keep_index = np.where(y_pred == 1)
|
||||
outlier_pct = self.get_outlier_percentage(1 - kept_points)
|
||||
if outlier_pct:
|
||||
logger.warning(
|
||||
f"SVM detected {outlier_pct:.2f}% of the points as outliers. "
|
||||
f"Keeping original dataset."
|
||||
)
|
||||
self.svm_model = None
|
||||
return
|
||||
# else:
|
||||
# # use SGDOneClassSVM to increase speed?
|
||||
# svm_params = self.freqai_config["feature_parameters"].get(
|
||||
# "svm_params", {"shuffle": False, "nu": 0.1})
|
||||
# self.svm_model = linear_model.SGDOneClassSVM(**svm_params).fit(
|
||||
# self.data_dictionary["train_features"]
|
||||
# )
|
||||
# y_pred = self.svm_model.predict(self.data_dictionary["train_features"])
|
||||
# kept_points = np.where(y_pred == -1, 0, y_pred)
|
||||
# # keep_index = np.where(y_pred == 1)
|
||||
# outlier_pct = self.get_outlier_percentage(1 - kept_points)
|
||||
# if outlier_pct:
|
||||
# logger.warning(
|
||||
# f"SVM detected {outlier_pct:.2f}% of the points as outliers. "
|
||||
# f"Keeping original dataset."
|
||||
# )
|
||||
# self.svm_model = None
|
||||
# return
|
||||
|
||||
self.data_dictionary["train_features"] = self.data_dictionary["train_features"][
|
||||
(y_pred == 1)
|
||||
]
|
||||
self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][
|
||||
(y_pred == 1)
|
||||
]
|
||||
self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][
|
||||
(y_pred == 1)
|
||||
]
|
||||
# self.data_dictionary["train_features"] = self.data_dictionary["train_features"][
|
||||
# (y_pred == 1)
|
||||
# ]
|
||||
# self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][
|
||||
# (y_pred == 1)
|
||||
# ]
|
||||
# self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][
|
||||
# (y_pred == 1)
|
||||
# ]
|
||||
|
||||
logger.info(
|
||||
f"SVM tossed {len(y_pred) - kept_points.sum()}"
|
||||
f" train points from {len(y_pred)} total points."
|
||||
)
|
||||
# logger.info(
|
||||
# f"SVM tossed {len(y_pred) - kept_points.sum()}"
|
||||
# f" train points from {len(y_pred)} total points."
|
||||
# )
|
||||
|
||||
# same for test data
|
||||
# TODO: This (and the part above) could be refactored into a separate function
|
||||
# to reduce code duplication
|
||||
if self.freqai_config['data_split_parameters'].get('test_size', 0.1) != 0:
|
||||
y_pred = self.svm_model.predict(self.data_dictionary["test_features"])
|
||||
kept_points = np.where(y_pred == -1, 0, y_pred)
|
||||
self.data_dictionary["test_features"] = self.data_dictionary["test_features"][
|
||||
(y_pred == 1)
|
||||
]
|
||||
self.data_dictionary["test_labels"] = self.data_dictionary["test_labels"][(
|
||||
y_pred == 1)]
|
||||
self.data_dictionary["test_weights"] = self.data_dictionary["test_weights"][
|
||||
(y_pred == 1)
|
||||
]
|
||||
# # same for test data
|
||||
# # TODO: This (and the part above) could be refactored into a separate function
|
||||
# # to reduce code duplication
|
||||
# if self.freqai_config['data_split_parameters'].get('test_size', 0.1) != 0:
|
||||
# y_pred = self.svm_model.predict(self.data_dictionary["test_features"])
|
||||
# kept_points = np.where(y_pred == -1, 0, y_pred)
|
||||
# self.data_dictionary["test_features"] = self.data_dictionary["test_features"][
|
||||
# (y_pred == 1)
|
||||
# ]
|
||||
# self.data_dictionary["test_labels"] = self.data_dictionary["test_labels"][(
|
||||
# y_pred == 1)]
|
||||
# self.data_dictionary["test_weights"] = self.data_dictionary["test_weights"][
|
||||
# (y_pred == 1)
|
||||
# ]
|
||||
|
||||
logger.info(
|
||||
f"{self.pair}: SVM tossed {len(y_pred) - kept_points.sum()}"
|
||||
f" test points from {len(y_pred)} total points."
|
||||
)
|
||||
# logger.info(
|
||||
# f"{self.pair}: SVM tossed {len(y_pred) - kept_points.sum()}"
|
||||
# f" test points from {len(y_pred)} total points."
|
||||
# )
|
||||
|
||||
return
|
||||
# return
|
||||
|
||||
def use_DBSCAN_to_remove_outliers(self, predict: bool, eps=None) -> None:
|
||||
"""
|
||||
Use DBSCAN to cluster training data and remove "noisy" data (read outliers).
|
||||
User controls this via the config param `DBSCAN_outlier_pct` which indicates the
|
||||
pct of training data that they want to be considered outliers.
|
||||
:param predict: bool = If False (training), iterate to find the best hyper parameters
|
||||
to match user requested outlier percent target.
|
||||
If True (prediction), use the parameters determined from
|
||||
the previous training to estimate if the current prediction point
|
||||
is an outlier.
|
||||
"""
|
||||
# def use_DBSCAN_to_remove_outliers(self, predict: bool, eps=None) -> None:
|
||||
# """
|
||||
# Use DBSCAN to cluster training data and remove "noisy" data (read outliers).
|
||||
# User controls this via the config param `DBSCAN_outlier_pct` which indicates the
|
||||
# pct of training data that they want to be considered outliers.
|
||||
# :param predict: bool = If False (training), iterate to find the best hyper parameters
|
||||
# to match user requested outlier percent target.
|
||||
# If True (prediction), use the parameters determined from
|
||||
# the previous training to estimate if the current prediction point
|
||||
# is an outlier.
|
||||
# """
|
||||
|
||||
if predict:
|
||||
if not self.data['DBSCAN_eps']:
|
||||
return
|
||||
train_ft_df = self.data_dictionary['train_features']
|
||||
pred_ft_df = self.data_dictionary['prediction_features']
|
||||
num_preds = len(pred_ft_df)
|
||||
df = pd.concat([train_ft_df, pred_ft_df], axis=0, ignore_index=True)
|
||||
clustering = DBSCAN(eps=self.data['DBSCAN_eps'],
|
||||
min_samples=self.data['DBSCAN_min_samples'],
|
||||
n_jobs=self.thread_count
|
||||
).fit(df)
|
||||
do_predict = np.where(clustering.labels_[-num_preds:] == -1, 0, 1)
|
||||
# if predict:
|
||||
# if not self.data['DBSCAN_eps']:
|
||||
# return
|
||||
# train_ft_df = self.data_dictionary['train_features']
|
||||
# pred_ft_df = self.data_dictionary['prediction_features']
|
||||
# num_preds = len(pred_ft_df)
|
||||
# df = pd.concat([train_ft_df, pred_ft_df], axis=0, ignore_index=True)
|
||||
# clustering = DBSCAN(eps=self.data['DBSCAN_eps'],
|
||||
# min_samples=self.data['DBSCAN_min_samples'],
|
||||
# n_jobs=self.thread_count
|
||||
# ).fit(df)
|
||||
# do_predict = np.where(clustering.labels_[-num_preds:] == -1, 0, 1)
|
||||
|
||||
if (len(do_predict) - do_predict.sum()) > 0:
|
||||
logger.info(f"DBSCAN tossed {len(do_predict) - do_predict.sum()} predictions")
|
||||
self.do_predict += do_predict
|
||||
self.do_predict -= 1
|
||||
# if (len(do_predict) - do_predict.sum()) > 0:
|
||||
# logger.info(f"DBSCAN tossed {len(do_predict) - do_predict.sum()} predictions")
|
||||
# self.do_predict += do_predict
|
||||
# self.do_predict -= 1
|
||||
|
||||
else:
|
||||
# else:
|
||||
|
||||
def normalise_distances(distances):
|
||||
normalised_distances = (distances - distances.min()) / \
|
||||
(distances.max() - distances.min())
|
||||
return normalised_distances
|
||||
# def normalise_distances(distances):
|
||||
# normalised_distances = (distances - distances.min()) / \
|
||||
# (distances.max() - distances.min())
|
||||
# return normalised_distances
|
||||
|
||||
def rotate_point(origin, point, angle):
|
||||
# rotate a point counterclockwise by a given angle (in radians)
|
||||
# around a given origin
|
||||
x = origin[0] + cos(angle) * (point[0] - origin[0]) - \
|
||||
sin(angle) * (point[1] - origin[1])
|
||||
y = origin[1] + sin(angle) * (point[0] - origin[0]) + \
|
||||
cos(angle) * (point[1] - origin[1])
|
||||
return (x, y)
|
||||
# def rotate_point(origin, point, angle):
|
||||
# # rotate a point counterclockwise by a given angle (in radians)
|
||||
# # around a given origin
|
||||
# x = origin[0] + cos(angle) * (point[0] - origin[0]) - \
|
||||
# sin(angle) * (point[1] - origin[1])
|
||||
# y = origin[1] + sin(angle) * (point[0] - origin[0]) + \
|
||||
# cos(angle) * (point[1] - origin[1])
|
||||
# return (x, y)
|
||||
|
||||
MinPts = int(len(self.data_dictionary['train_features'].index) * 0.25)
|
||||
# measure pairwise distances to nearest neighbours
|
||||
neighbors = NearestNeighbors(
|
||||
n_neighbors=MinPts, n_jobs=self.thread_count)
|
||||
neighbors_fit = neighbors.fit(self.data_dictionary['train_features'])
|
||||
distances, _ = neighbors_fit.kneighbors(self.data_dictionary['train_features'])
|
||||
distances = np.sort(distances, axis=0).mean(axis=1)
|
||||
# MinPts = int(len(self.data_dictionary['train_features'].index) * 0.25)
|
||||
# # measure pairwise distances to nearest neighbours
|
||||
# neighbors = NearestNeighbors(
|
||||
# n_neighbors=MinPts, n_jobs=self.thread_count)
|
||||
# neighbors_fit = neighbors.fit(self.data_dictionary['train_features'])
|
||||
# distances, _ = neighbors_fit.kneighbors(self.data_dictionary['train_features'])
|
||||
# distances = np.sort(distances, axis=0).mean(axis=1)
|
||||
|
||||
normalised_distances = normalise_distances(distances)
|
||||
x_range = np.linspace(0, 1, len(distances))
|
||||
line = np.linspace(normalised_distances[0],
|
||||
normalised_distances[-1], len(normalised_distances))
|
||||
deflection = np.abs(normalised_distances - line)
|
||||
max_deflection_loc = np.where(deflection == deflection.max())[0][0]
|
||||
origin = x_range[max_deflection_loc], line[max_deflection_loc]
|
||||
point = x_range[max_deflection_loc], normalised_distances[max_deflection_loc]
|
||||
rot_angle = np.pi / 4
|
||||
elbow_loc = rotate_point(origin, point, rot_angle)
|
||||
# normalised_distances = normalise_distances(distances)
|
||||
# x_range = np.linspace(0, 1, len(distances))
|
||||
# line = np.linspace(normalised_distances[0],
|
||||
# normalised_distances[-1], len(normalised_distances))
|
||||
# deflection = np.abs(normalised_distances - line)
|
||||
# max_deflection_loc = np.where(deflection == deflection.max())[0][0]
|
||||
# origin = x_range[max_deflection_loc], line[max_deflection_loc]
|
||||
# point = x_range[max_deflection_loc], normalised_distances[max_deflection_loc]
|
||||
# rot_angle = np.pi / 4
|
||||
# elbow_loc = rotate_point(origin, point, rot_angle)
|
||||
|
||||
epsilon = elbow_loc[1] * (distances[-1] - distances[0]) + distances[0]
|
||||
# epsilon = elbow_loc[1] * (distances[-1] - distances[0]) + distances[0]
|
||||
|
||||
clustering = DBSCAN(eps=epsilon, min_samples=MinPts,
|
||||
n_jobs=int(self.thread_count)).fit(
|
||||
self.data_dictionary['train_features']
|
||||
)
|
||||
# clustering = DBSCAN(eps=epsilon, min_samples=MinPts,
|
||||
# n_jobs=int(self.thread_count)).fit(
|
||||
# self.data_dictionary['train_features']
|
||||
# )
|
||||
|
||||
logger.info(f'DBSCAN found eps of {epsilon:.2f}.')
|
||||
# logger.info(f'DBSCAN found eps of {epsilon:.2f}.')
|
||||
|
||||
self.data['DBSCAN_eps'] = epsilon
|
||||
self.data['DBSCAN_min_samples'] = MinPts
|
||||
dropped_points = np.where(clustering.labels_ == -1, 1, 0)
|
||||
# self.data['DBSCAN_eps'] = epsilon
|
||||
# self.data['DBSCAN_min_samples'] = MinPts
|
||||
# dropped_points = np.where(clustering.labels_ == -1, 1, 0)
|
||||
|
||||
outlier_pct = self.get_outlier_percentage(dropped_points)
|
||||
if outlier_pct:
|
||||
logger.warning(
|
||||
f"DBSCAN detected {outlier_pct:.2f}% of the points as outliers. "
|
||||
f"Keeping original dataset."
|
||||
)
|
||||
self.data['DBSCAN_eps'] = 0
|
||||
return
|
||||
# outlier_pct = self.get_outlier_percentage(dropped_points)
|
||||
# if outlier_pct:
|
||||
# logger.warning(
|
||||
# f"DBSCAN detected {outlier_pct:.2f}% of the points as outliers. "
|
||||
# f"Keeping original dataset."
|
||||
# )
|
||||
# self.data['DBSCAN_eps'] = 0
|
||||
# return
|
||||
|
||||
self.data_dictionary['train_features'] = self.data_dictionary['train_features'][
|
||||
(clustering.labels_ != -1)
|
||||
]
|
||||
self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][
|
||||
(clustering.labels_ != -1)
|
||||
]
|
||||
self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][
|
||||
(clustering.labels_ != -1)
|
||||
]
|
||||
# self.data_dictionary['train_features'] = self.data_dictionary['train_features'][
|
||||
# (clustering.labels_ != -1)
|
||||
# ]
|
||||
# self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][
|
||||
# (clustering.labels_ != -1)
|
||||
# ]
|
||||
# self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][
|
||||
# (clustering.labels_ != -1)
|
||||
# ]
|
||||
|
||||
logger.info(
|
||||
f"DBSCAN tossed {dropped_points.sum()}"
|
||||
f" train points from {len(clustering.labels_)}"
|
||||
)
|
||||
# logger.info(
|
||||
# f"DBSCAN tossed {dropped_points.sum()}"
|
||||
# f" train points from {len(clustering.labels_)}"
|
||||
# )
|
||||
|
||||
return
|
||||
# return
|
||||
|
||||
def compute_inlier_metric(self, set_='train') -> None:
|
||||
"""
|
||||
Compute inlier metric from backwards distance distributions.
|
||||
This metric defines how well features from a timepoint fit
|
||||
into previous timepoints.
|
||||
"""
|
||||
# def compute_inlier_metric(self, set_='train') -> None:
|
||||
# """
|
||||
# Compute inlier metric from backwards distance distributions.
|
||||
# This metric defines how well features from a timepoint fit
|
||||
# into previous timepoints.
|
||||
# """
|
||||
|
||||
def normalise(dataframe: DataFrame, key: str) -> DataFrame:
|
||||
if set_ == 'train':
|
||||
min_value = dataframe.min()
|
||||
max_value = dataframe.max()
|
||||
self.data[f'{key}_min'] = min_value
|
||||
self.data[f'{key}_max'] = max_value
|
||||
else:
|
||||
min_value = self.data[f'{key}_min']
|
||||
max_value = self.data[f'{key}_max']
|
||||
return (dataframe - min_value) / (max_value - min_value)
|
||||
# def normalise(dataframe: DataFrame, key: str) -> DataFrame:
|
||||
# if set_ == 'train':
|
||||
# min_value = dataframe.min()
|
||||
# max_value = dataframe.max()
|
||||
# self.data[f'{key}_min'] = min_value
|
||||
# self.data[f'{key}_max'] = max_value
|
||||
# else:
|
||||
# min_value = self.data[f'{key}_min']
|
||||
# max_value = self.data[f'{key}_max']
|
||||
# return (dataframe - min_value) / (max_value - min_value)
|
||||
|
||||
no_prev_pts = self.freqai_config["feature_parameters"]["inlier_metric_window"]
|
||||
# no_prev_pts = self.freqai_config["feature_parameters"]["inlier_metric_window"]
|
||||
|
||||
if set_ == 'train':
|
||||
compute_df = copy.deepcopy(self.data_dictionary['train_features'])
|
||||
elif set_ == 'test':
|
||||
compute_df = copy.deepcopy(self.data_dictionary['test_features'])
|
||||
else:
|
||||
compute_df = copy.deepcopy(self.data_dictionary['prediction_features'])
|
||||
# if set_ == 'train':
|
||||
# compute_df = copy.deepcopy(self.data_dictionary['train_features'])
|
||||
# elif set_ == 'test':
|
||||
# compute_df = copy.deepcopy(self.data_dictionary['test_features'])
|
||||
# else:
|
||||
# compute_df = copy.deepcopy(self.data_dictionary['prediction_features'])
|
||||
|
||||
compute_df_reindexed = compute_df.reindex(
|
||||
index=np.flip(compute_df.index)
|
||||
)
|
||||
# compute_df_reindexed = compute_df.reindex(
|
||||
# index=np.flip(compute_df.index)
|
||||
# )
|
||||
|
||||
pairwise = pd.DataFrame(
|
||||
np.triu(
|
||||
pairwise_distances(compute_df_reindexed, n_jobs=self.thread_count)
|
||||
),
|
||||
columns=compute_df_reindexed.index,
|
||||
index=compute_df_reindexed.index
|
||||
)
|
||||
pairwise = pairwise.round(5)
|
||||
# pairwise = pd.DataFrame(
|
||||
# np.triu(
|
||||
# pairwise_distances(compute_df_reindexed, n_jobs=self.thread_count)
|
||||
# ),
|
||||
# columns=compute_df_reindexed.index,
|
||||
# index=compute_df_reindexed.index
|
||||
# )
|
||||
# pairwise = pairwise.round(5)
|
||||
|
||||
column_labels = [
|
||||
'{}{}'.format('d', i) for i in range(1, no_prev_pts + 1)
|
||||
]
|
||||
distances = pd.DataFrame(
|
||||
columns=column_labels, index=compute_df.index
|
||||
)
|
||||
# column_labels = [
|
||||
# '{}{}'.format('d', i) for i in range(1, no_prev_pts + 1)
|
||||
# ]
|
||||
# distances = pd.DataFrame(
|
||||
# columns=column_labels, index=compute_df.index
|
||||
# )
|
||||
|
||||
for index in compute_df.index[no_prev_pts:]:
|
||||
current_row = pairwise.loc[[index]]
|
||||
current_row_no_zeros = current_row.loc[
|
||||
:, (current_row != 0).any(axis=0)
|
||||
]
|
||||
distances.loc[[index]] = current_row_no_zeros.iloc[
|
||||
:, :no_prev_pts
|
||||
]
|
||||
distances = distances.replace([np.inf, -np.inf], np.nan)
|
||||
drop_index = pd.isnull(distances).any(axis=1)
|
||||
distances = distances[drop_index == 0]
|
||||
# for index in compute_df.index[no_prev_pts:]:
|
||||
# current_row = pairwise.loc[[index]]
|
||||
# current_row_no_zeros = current_row.loc[
|
||||
# :, (current_row != 0).any(axis=0)
|
||||
# ]
|
||||
# distances.loc[[index]] = current_row_no_zeros.iloc[
|
||||
# :, :no_prev_pts
|
||||
# ]
|
||||
# distances = distances.replace([np.inf, -np.inf], np.nan)
|
||||
# drop_index = pd.isnull(distances).any(axis=1)
|
||||
# distances = distances[drop_index == 0]
|
||||
|
||||
inliers = pd.DataFrame(index=distances.index)
|
||||
for key in distances.keys():
|
||||
current_distances = distances[key].dropna()
|
||||
current_distances = normalise(current_distances, key)
|
||||
if set_ == 'train':
|
||||
fit_params = stats.weibull_min.fit(current_distances)
|
||||
self.data[f'{key}_fit_params'] = fit_params
|
||||
else:
|
||||
fit_params = self.data[f'{key}_fit_params']
|
||||
quantiles = stats.weibull_min.cdf(current_distances, *fit_params)
|
||||
# inliers = pd.DataFrame(index=distances.index)
|
||||
# for key in distances.keys():
|
||||
# current_distances = distances[key].dropna()
|
||||
# current_distances = normalise(current_distances, key)
|
||||
# if set_ == 'train':
|
||||
# fit_params = stats.weibull_min.fit(current_distances)
|
||||
# self.data[f'{key}_fit_params'] = fit_params
|
||||
# else:
|
||||
# fit_params = self.data[f'{key}_fit_params']
|
||||
# quantiles = stats.weibull_min.cdf(current_distances, *fit_params)
|
||||
|
||||
df_inlier = pd.DataFrame(
|
||||
{key: quantiles}, index=distances.index
|
||||
)
|
||||
inliers = pd.concat(
|
||||
[inliers, df_inlier], axis=1
|
||||
)
|
||||
# df_inlier = pd.DataFrame(
|
||||
# {key: quantiles}, index=distances.index
|
||||
# )
|
||||
# inliers = pd.concat(
|
||||
# [inliers, df_inlier], axis=1
|
||||
# )
|
||||
|
||||
inlier_metric = pd.DataFrame(
|
||||
data=inliers.sum(axis=1) / no_prev_pts,
|
||||
columns=['%-inlier_metric'],
|
||||
index=compute_df.index
|
||||
)
|
||||
# inlier_metric = pd.DataFrame(
|
||||
# data=inliers.sum(axis=1) / no_prev_pts,
|
||||
# columns=['%-inlier_metric'],
|
||||
# index=compute_df.index
|
||||
# )
|
||||
|
||||
inlier_metric = (2 * (inlier_metric - inlier_metric.min()) /
|
||||
(inlier_metric.max() - inlier_metric.min()) - 1)
|
||||
# inlier_metric = (2 * (inlier_metric - inlier_metric.min()) /
|
||||
# (inlier_metric.max() - inlier_metric.min()) - 1)
|
||||
|
||||
if set_ in ('train', 'test'):
|
||||
inlier_metric = inlier_metric.iloc[no_prev_pts:]
|
||||
compute_df = compute_df.iloc[no_prev_pts:]
|
||||
self.remove_beginning_points_from_data_dict(set_, no_prev_pts)
|
||||
self.data_dictionary[f'{set_}_features'] = pd.concat(
|
||||
[compute_df, inlier_metric], axis=1)
|
||||
else:
|
||||
self.data_dictionary['prediction_features'] = pd.concat(
|
||||
[compute_df, inlier_metric], axis=1)
|
||||
self.data_dictionary['prediction_features'].fillna(0, inplace=True)
|
||||
# if set_ in ('train', 'test'):
|
||||
# inlier_metric = inlier_metric.iloc[no_prev_pts:]
|
||||
# compute_df = compute_df.iloc[no_prev_pts:]
|
||||
# self.remove_beginning_points_from_data_dict(set_, no_prev_pts)
|
||||
# self.data_dictionary[f'{set_}_features'] = pd.concat(
|
||||
# [compute_df, inlier_metric], axis=1)
|
||||
# else:
|
||||
# self.data_dictionary['prediction_features'] = pd.concat(
|
||||
# [compute_df, inlier_metric], axis=1)
|
||||
# self.data_dictionary['prediction_features'].fillna(0, inplace=True)
|
||||
|
||||
logger.info('Inlier metric computed and added to features.')
|
||||
# logger.info('Inlier metric computed and added to features.')
|
||||
|
||||
return None
|
||||
# return None
|
||||
|
||||
def remove_beginning_points_from_data_dict(self, set_='train', no_prev_pts: int = 10):
|
||||
features = self.data_dictionary[f'{set_}_features']
|
||||
weights = self.data_dictionary[f'{set_}_weights']
|
||||
labels = self.data_dictionary[f'{set_}_labels']
|
||||
self.data_dictionary[f'{set_}_weights'] = weights[no_prev_pts:]
|
||||
self.data_dictionary[f'{set_}_features'] = features.iloc[no_prev_pts:]
|
||||
self.data_dictionary[f'{set_}_labels'] = labels.iloc[no_prev_pts:]
|
||||
# def remove_beginning_points_from_data_dict(self, set_='train', no_prev_pts: int = 10):
|
||||
# features = self.data_dictionary[f'{set_}_features']
|
||||
# weights = self.data_dictionary[f'{set_}_weights']
|
||||
# labels = self.data_dictionary[f'{set_}_labels']
|
||||
# self.data_dictionary[f'{set_}_weights'] = weights[no_prev_pts:]
|
||||
# self.data_dictionary[f'{set_}_features'] = features.iloc[no_prev_pts:]
|
||||
# self.data_dictionary[f'{set_}_labels'] = labels.iloc[no_prev_pts:]
|
||||
|
||||
def add_noise_to_training_features(self) -> None:
|
||||
"""
|
||||
|
|
|
@@ -23,6 +23,8 @@ from freqtrade.freqai.data_drawer import FreqaiDataDrawer
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
from freqtrade.freqai.utils import get_tb_logger, plot_feature_importance, record_params
from freqtrade.strategy.interface import IStrategy
from datasieve.pipeline import Pipeline
import datasieve.transforms as ds


pd.options.mode.chained_assignment = None

@@ -566,6 +568,32 @@ class IFreqaiModel(ABC):
        if ft_params.get("use_DBSCAN_to_remove_outliers", False):
            dk.use_DBSCAN_to_remove_outliers(predict=True)

    def define_data_pipeline(self, dk: FreqaiDataKitchen) -> None:
        ft_params = self.freqai_info["feature_parameters"]
        dk.pipeline = Pipeline([('scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))])

        if ft_params.get("principal_component_analysis", False):
            dk.pipeline.steps += [('pca', ds.DataSievePCA())]
            dk.pipeline.steps += [('post-pca-scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))]

        if ft_params.get("use_SVM_to_remove_outliers", False):
            dk.pipeline.steps += [('svm', ds.SVMOutlierExtractor())]

        if ft_params.get("DI_threshold", 0):
            dk.pipeline.steps += [('di', ds.DissimilarityIndex())]

        if ft_params.get("use_DBSCAN_to_remove_outliers", False):
            dk.pipeline.steps += [('dbscan', ds.DataSieveDBSCAN())]

        dk.pipeline.fitparams = dk.pipeline._validate_fitparams({}, dk.pipeline.steps)

        # if self.freqai_info["feature_parameters"].get('noise_standard_deviation', 0):
        #     dk.pipeline.extend(('noise', ds.Noise()))

    def define_label_pipeline(self, dk: FreqaiDataKitchen) -> None:

        dk.label_pipeline = Pipeline([('scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))])

    def model_exists(self, dk: FreqaiDataKitchen) -> bool:
        """
        Given a pair and path, check if a model already exists
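For reference, this is how the feature_parameters flags map onto pipeline steps when define_data_pipeline() runs, written as a standalone sketch (the flag values are invented; the step names and transform classes are the ones used in the method above):

```python
import datasieve.transforms as ds
from datasieve.pipeline import Pipeline

# Illustrative config fragment; real values live under freqai.feature_parameters.
feature_parameters = {
    "principal_component_analysis": True,
    "use_SVM_to_remove_outliers": True,
    "DI_threshold": 0.9,
    "use_DBSCAN_to_remove_outliers": False,
}

steps = [("scaler", ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))]
if feature_parameters.get("principal_component_analysis", False):
    steps += [("pca", ds.DataSievePCA()),
              ("post-pca-scaler", ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))]
if feature_parameters.get("use_SVM_to_remove_outliers", False):
    steps += [("svm", ds.SVMOutlierExtractor())]
if feature_parameters.get("DI_threshold", 0):
    steps += [("di", ds.DissimilarityIndex())]
if feature_parameters.get("use_DBSCAN_to_remove_outliers", False):
    steps += [("dbscan", ds.DataSieveDBSCAN())]

pipeline = Pipeline(steps)
```

Keeping the scaler first and re-scaling after PCA matches the ordering in define_data_pipeline(), so downstream outlier detectors always see features in the same (-1, 1) range.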
@@ -10,3 +10,4 @@ catboost==1.2; 'arm' not in platform_machine and (sys_platform != 'darwin' or py
lightgbm==3.3.5
xgboost==1.7.5
tensorboard==2.13.0
datasieve==0.0.5
@ -9,9 +9,9 @@ from freqtrade.configuration import TimeRange
|
|||
from freqtrade.data.dataprovider import DataProvider
|
||||
from freqtrade.exceptions import OperationalException
|
||||
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
|
||||
from tests.conftest import get_patched_exchange, log_has_re
|
||||
from tests.conftest import get_patched_exchange # , log_has_re
|
||||
from tests.freqai.conftest import (get_patched_data_kitchen, get_patched_freqai_strategy,
|
||||
make_data_dictionary, make_unfiltered_dataframe)
|
||||
make_unfiltered_dataframe) # make_data_dictionary,
|
||||
from tests.freqai.test_freqai_interface import is_mac
|
||||
|
||||
|
||||
|
@ -72,66 +72,66 @@ def test_check_if_model_expired(mocker, freqai_conf):
|
|||
shutil.rmtree(Path(dk.full_path))
|
||||
|
||||
|
||||
def test_use_DBSCAN_to_remove_outliers(mocker, freqai_conf, caplog):
|
||||
freqai = make_data_dictionary(mocker, freqai_conf)
|
||||
# freqai_conf['freqai']['feature_parameters'].update({"outlier_protection_percentage": 1})
|
||||
freqai.dk.use_DBSCAN_to_remove_outliers(predict=False)
|
||||
assert log_has_re(r"DBSCAN found eps of 1\.7\d\.", caplog)
|
||||
# def test_use_DBSCAN_to_remove_outliers(mocker, freqai_conf, caplog):
|
||||
# freqai = make_data_dictionary(mocker, freqai_conf)
|
||||
# # freqai_conf['freqai']['feature_parameters'].update({"outlier_protection_percentage": 1})
|
||||
# freqai.dk.use_DBSCAN_to_remove_outliers(predict=False)
|
||||
# assert log_has_re(r"DBSCAN found eps of 1\.7\d\.", caplog)
|
||||
|
||||
|
||||
def test_compute_distances(mocker, freqai_conf):
|
||||
freqai = make_data_dictionary(mocker, freqai_conf)
|
||||
freqai_conf['freqai']['feature_parameters'].update({"DI_threshold": 1})
|
||||
avg_mean_dist = freqai.dk.compute_distances()
|
||||
assert round(avg_mean_dist, 2) == 1.98
|
||||
# def test_compute_distances(mocker, freqai_conf):
|
||||
# freqai = make_data_dictionary(mocker, freqai_conf)
|
||||
# freqai_conf['freqai']['feature_parameters'].update({"DI_threshold": 1})
|
||||
# avg_mean_dist = freqai.dk.compute_distances()
|
||||
# assert round(avg_mean_dist, 2) == 1.98
|
||||
|
||||
|
||||
def test_use_SVM_to_remove_outliers_and_outlier_protection(mocker, freqai_conf, caplog):
|
||||
freqai = make_data_dictionary(mocker, freqai_conf)
|
||||
freqai_conf['freqai']['feature_parameters'].update({"outlier_protection_percentage": 0.1})
|
||||
freqai.dk.use_SVM_to_remove_outliers(predict=False)
|
||||
assert log_has_re(
|
||||
"SVM detected 7.83%",
|
||||
caplog,
|
||||
)
|
||||
# def test_use_SVM_to_remove_outliers_and_outlier_protection(mocker, freqai_conf, caplog):
|
||||
# freqai = make_data_dictionary(mocker, freqai_conf)
|
||||
# freqai_conf['freqai']['feature_parameters'].update({"outlier_protection_percentage": 0.1})
|
||||
# freqai.dk.use_SVM_to_remove_outliers(predict=False)
|
||||
# assert log_has_re(
|
||||
# "SVM detected 7.83%",
|
||||
# caplog,
|
||||
# )
|
||||
|
||||
|
||||
def test_compute_inlier_metric(mocker, freqai_conf, caplog):
|
||||
freqai = make_data_dictionary(mocker, freqai_conf)
|
||||
freqai_conf['freqai']['feature_parameters'].update({"inlier_metric_window": 10})
|
||||
freqai.dk.compute_inlier_metric(set_='train')
|
||||
assert log_has_re(
|
||||
"Inlier metric computed and added to features.",
|
||||
caplog,
|
||||
)
|
||||
# def test_compute_inlier_metric(mocker, freqai_conf, caplog):
|
||||
# freqai = make_data_dictionary(mocker, freqai_conf)
|
||||
# freqai_conf['freqai']['feature_parameters'].update({"inlier_metric_window": 10})
|
||||
# freqai.dk.compute_inlier_metric(set_='train')
|
||||
# assert log_has_re(
|
||||
# "Inlier metric computed and added to features.",
|
||||
# caplog,
|
||||
# )
|
||||
|
||||
|
||||
def test_add_noise_to_training_features(mocker, freqai_conf):
|
||||
freqai = make_data_dictionary(mocker, freqai_conf)
|
||||
freqai_conf['freqai']['feature_parameters'].update({"noise_standard_deviation": 0.1})
|
||||
freqai.dk.add_noise_to_training_features()
|
||||
# def test_add_noise_to_training_features(mocker, freqai_conf):
|
||||
# freqai = make_data_dictionary(mocker, freqai_conf)
|
||||
# freqai_conf['freqai']['feature_parameters'].update({"noise_standard_deviation": 0.1})
|
||||
# freqai.dk.add_noise_to_training_features()
|
||||
|
||||
|
||||
def test_remove_beginning_points_from_data_dict(mocker, freqai_conf):
|
||||
freqai = make_data_dictionary(mocker, freqai_conf)
|
||||
freqai.dk.remove_beginning_points_from_data_dict(set_='train')
|
||||
# def test_remove_beginning_points_from_data_dict(mocker, freqai_conf):
|
||||
# freqai = make_data_dictionary(mocker, freqai_conf)
|
||||
# freqai.dk.remove_beginning_points_from_data_dict(set_='train')
|
||||
|
||||
|
||||
def test_principal_component_analysis(mocker, freqai_conf, caplog):
|
||||
freqai = make_data_dictionary(mocker, freqai_conf)
|
||||
freqai.dk.principal_component_analysis()
|
||||
assert log_has_re(
|
||||
"reduced feature dimension by",
|
||||
caplog,
|
||||
)
|
||||
# def test_principal_component_analysis(mocker, freqai_conf, caplog):
|
||||
# freqai = make_data_dictionary(mocker, freqai_conf)
|
||||
# freqai.dk.principal_component_analysis()
|
||||
# assert log_has_re(
|
||||
# "reduced feature dimension by",
|
||||
# caplog,
|
||||
# )
|
||||
|
||||
|
||||
def test_normalize_data(mocker, freqai_conf):
|
||||
freqai = make_data_dictionary(mocker, freqai_conf)
|
||||
data_dict = freqai.dk.data_dictionary
|
||||
freqai.dk.normalize_data(data_dict)
|
||||
assert any('_max' in entry for entry in freqai.dk.data.keys())
|
||||
assert any('_min' in entry for entry in freqai.dk.data.keys())
|
||||
# def test_normalize_data(mocker, freqai_conf):
|
||||
# freqai = make_data_dictionary(mocker, freqai_conf)
|
||||
# data_dict = freqai.dk.data_dictionary
|
||||
# freqai.dk.normalize_data(data_dict)
|
||||
# assert any('_max' in entry for entry in freqai.dk.data.keys())
|
||||
# assert any('_min' in entry for entry in freqai.dk.data.keys())
|
||||
|
||||
|
||||
def test_filter_features(mocker, freqai_conf):
|
||||
|
|