From 599c1c79fb8170a7ea4bf9d250a4a3db0a3234ba Mon Sep 17 00:00:00 2001
From: robcaulk <rob.caulk@gmail.com>
Date: Sat, 3 Sep 2022 14:00:01 +0200
Subject: [PATCH] reorganized backtest utilities, test new functionality,
 improve/update doc

---
 config_examples/config_freqai.example.json   |  2 -
 docs/freqai.md                               | 32 +++++++------
 freqtrade/freqai/data_kitchen.py             | 27 ++++++++++-
 freqtrade/freqai/freqai_interface.py         | 50 +++++---------------
 freqtrade/templates/FreqaiExampleStrategy.py |  2 +-
 tests/freqai/test_freqai_interface.py        |  9 +++-
 6 files changed, 63 insertions(+), 59 deletions(-)
diff --git a/config_examples/config_freqai.example.json b/config_examples/config_freqai.example.json
index 846d37a82..12eb30128 100644
--- a/config_examples/config_freqai.example.json
+++ b/config_examples/config_freqai.example.json
@@ -56,7 +56,6 @@
         "purge_old_models": true,
         "train_period_days": 15,
         "backtest_period_days": 7,
-        "backtest_save_model": true,
         "live_retrain_hours": 0,
         "identifier": "uniqe-id",
         "feature_parameters": {
@@ -75,7 +74,6 @@
             "weight_factor": 0.9,
             "principal_component_analysis": false,
             "use_SVM_to_remove_outliers": true,
-            "indicator_max_period_candles": 20,
             "indicator_periods_candles": [
                 10,
                 20
diff --git a/docs/freqai.md b/docs/freqai.md
index 6ee124b9b..3646362c3 100644
--- a/docs/freqai.md
+++ b/docs/freqai.md
@@ -89,11 +89,10 @@ Mandatory parameters are marked as **Required**, which means that they are requi
 |------------|-------------|
 |  |  **General configuration parameters**
 | `freqai` | **Required.** <br> The parent dictionary containing all the parameters for controlling FreqAI. <br> **Datatype:** Dictionary.
-| `startup_candles` | Number of candles needed for *backtesting only* to ensure all indicators are non NaNs at the start of the first train period. <br> **Datatype:** Positive integer.
 | `purge_old_models` | Delete obsolete models (otherwise, all historic models will remain on disk). <br> **Datatype:** Boolean. Default: `False`.
 | `train_period_days` | **Required.** <br> Number of days to use for the training data (width of the sliding window). <br> **Datatype:** Positive integer.
 | `backtest_period_days` | **Required.** <br> Number of days to inference from the trained model before sliding the window defined above, and retraining the model. This can be fractional days, but beware that the user-provided `timerange` will be divided by this number to yield the number of trainings necessary to complete the backtest. <br> **Datatype:** Float.
-| `backtest_save_model` | Saves models to disk when running backtesting. <br> **Datatype:** Boolean. Default: `True`.
+| `save_backtest_models` | Backtesting operates most efficiently by saving the prediction data and reusing them directly for subsequent runs (when users wish to tune entry/exit parameters). If a user wishes to save models to disk when running backtesting, they should activate `save_backtest_models`. A user may wish to do this if they plan to use the same model files for starting a dry/live instance with the same `identifier`. <br> **Datatype:** Boolean. Default: `False`.
 | `identifier` | **Required.** <br> A unique name for the current model. This can be reused to reload pre-trained models/data. <br> **Datatype:** String.
 | `live_retrain_hours` | Frequency of retraining during dry/live runs. <br> Default set to 0, which means the model will retrain as often as possible. <br> **Datatype:** Float > 0.
 | `expiration_hours` | Avoid making predictions if a model is more than `expiration_hours` old. <br> Defaults set to 0, which means models never expire. <br> **Datatype:** Positive integer.
@@ -280,6 +279,17 @@ The FreqAI strategy requires the user to include the following lines of code in
 
 Notice how the `populate_any_indicators()` is where the user adds their own features ([more information](#feature-engineering)) and labels ([more information](#setting-classifier-targets)). See a full example at `templates/FreqaiExampleStrategy.py`.
 
+### Setting the `startup_candle_count`
+Users need to take care to set the `startup_candle_count` in their strategy the same way they would for any normal Freqtrade strategy (see details [here](strategy-customization.md/#strategy-startup-period)). This value is used by Freqtrade to ensure that a sufficient amount of data is provided when calling on the `dataprovider` to avoid any NaNs at the beginning of the first training. Users can easily set this value by identifying the longest period (in candle units) that they pass to their indicator creation functions (e.g. talib functions). In the present example, the user would pass 20 as this value (since it is the maximum value in their `indicator_periods_candles`).
+
+!!! Note
+    Typically it is best for users to be safe and multiply their expected `startup_candle_count` by 2. There are instances where the talib functions actually require more data than just the passed `period`. Anecdotally, multiplying the `startup_candle_count` by 2 always leads to a fully NaN free training dataset. Look out for this log message to confirm that your data is clean:
+
+    ```
+    2022-08-31 15:14:04 - freqtrade.freqai.data_kitchen - INFO - dropped 0 training points due to NaNs in populated dataset 4319.
+    ```
+
+
 ## Creating a dynamic target
 
 The `&*_std/mean` return values describe the statistical fit of the user defined label *during the most recent training*. This value allows the user to know the rarity of a given prediction. For example, `templates/FreqaiExampleStrategy.py`, creates a `target_roi` which is based on filtering out predictions that are below a given z-score of 1.25.
@@ -505,7 +515,7 @@ and if a full `live_retrain_hours` has elapsed since the end of the loaded model
 The FreqAI backtesting module can be executed with the following command:
 
 ```bash
-freqtrade backtesting --strategy FreqaiExampleStrategy --config config_freqai.example.json --freqaimodel LightGBMRegressor --timerange 20210501-20210701
+freqtrade backtesting --strategy FreqaiExampleStrategy --config config_examples/config_freqai.example.json --freqaimodel LightGBMRegressor --timerange 20210501-20210701
 ```
 
 Backtesting mode requires the user to have the data pre-downloaded (unlike in dry/live mode where FreqAI automatically downloads the necessary data). The user should be careful to consider that the time range of the downloaded data is more than the backtesting time range. This is because FreqAI needs data prior to the desired backtesting time range in order to train a model to be ready to make predictions on the first candle of the user-set backtesting time range. More details on how to calculate the data to download can be found [here](#deciding-the-sliding-training-window-and-backtesting-duration).
@@ -532,20 +542,14 @@ the user is asking FreqAI to use a training period of 30 days and backtest on th
 This means that if the user sets `--timerange 20210501-20210701`,
 FreqAI will train have trained 8 separate models at the end of `--timerange` (because the full range comprises 8 weeks). After the training of the model, FreqAI will backtest the subsequent 7 days. The "sliding window" then moves one week forward (emulating FreqAI retraining once per week in live mode) and the new model uses the previous 30 days (including the 7 days used for backtesting by the previous model) to train. This is repeated until the end of `--timerange`.
 
-In live mode, the required training data is automatically computed and downloaded. However, in backtesting mode,
-the user must manually enter the required number of `startup_candles` in the config. This value
-is used to increase the data to FreqAI, which should be sufficient to enable all indicators
-to be NaN free at the beginning of the first training. This is done by identifying the
-longest timeframe (`4h` in presented example config) and the longest indicator period (`20` days in presented example config)
-and adding this to the `train_period_days`. The units need to be in the base candle time frame:
-`startup_candles` = ( 4 hours * 20 max period * 60 minutes/hour + 30 day train_period_days * 1440 minutes per day ) / 5 min (base time frame) = 9360.
-
-!!! Note
-    In dry/live mode, this is all precomputed and handled automatically. Thus, `startup_candle` has no influence on dry/live mode.
-
 !!! Note
     Although fractional `backtest_period_days` is allowed, the user should be aware that the `--timerange` is divided by this value to determine the number of models that FreqAI will need to train in order to backtest the full range. For example, if the user wants to set a `--timerange` of 10 days, and asks for a `backtest_period_days` of 0.1, FreqAI will need to train 100 models per pair to complete the full backtest. Because of this, a true backtest of FreqAI adaptive training would take a *very* long time. The best way to fully test a model is to run it dry and let it constantly train. In this case, backtesting would take the exact same amount of time as a dry run.
 
+### Downloading data for backtesting
+Live/dry instances will download the data automatically for the user, but users who wish to use backtesting functionality still need to download the necessary data using `download-data` (details [here](data-download/#data-downloading)). FreqAI users need to pay careful attention to understanding how much *additional* data needs to be downloaded to ensure that they have a sufficient amount of training data *before* the start of their backtesting timerange. The amount of additional data can be roughly estimated by subtracting `train_period_days` and the `startup_candle_count` ([details](#setting-the-startup_candle_count)) from the beginning of the desired backtesting timerange.
+
+As an example, suppose we wish to backtest the `--timerange` above of `20210501-20210701` using the example config, which sets `train_period_days` to 15. The startup candle count is 40 on a maximum `include_timeframes` of 1h. We would need data starting from 20210501 - 15 days - 40 * 1h / 24 hours = 20210414 (16.7 days earlier than the start of the desired backtesting timerange).
+
 ### Defining model expirations
 
 During dry/live mode, FreqAI trains each coin pair sequentially (on separate threads/GPU from the main Freqtrade bot). This means that there is always an age discrepancy between models. If a user is training on 50 pairs, and each pair requires 5 minutes to train, the oldest model will be over 4 hours old. This may be undesirable if the characteristic time scale (the trade duration target) for a strategy is less than 4 hours. The user can decide to only make trade entries if the model is less than
diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py
index f88e20223..13af1e0d2 100644
--- a/freqtrade/freqai/data_kitchen.py
+++ b/freqtrade/freqai/data_kitchen.py
@@ -70,7 +70,7 @@ class FreqaiDataKitchen:
         self.training_features_list: List = []
         self.model_filename: str = ""
         self.backtesting_results_path = Path()
-        self.backtesting_prediction_folder: str = "backtesting_predictions"
+        self.backtest_predictions_folder: str = "backtesting_predictions"
         self.live = live
         self.pair = pair
 
@@ -1077,7 +1077,7 @@ class FreqaiDataKitchen:
         Save prediction dataframe from backtesting to h5 file format
         :param append_df: dataframe for backtesting period
         """
-        full_predictions_folder = Path(self.full_path / self.backtesting_prediction_folder)
+        full_predictions_folder = Path(self.full_path / self.backtest_predictions_folder)
         if not full_predictions_folder.is_dir():
             full_predictions_folder.mkdir(parents=True, exist_ok=True)
 
@@ -1092,3 +1092,26 @@ class FreqaiDataKitchen:
         """
         append_df = pd.read_hdf(self.backtesting_results_path)
         return append_df
+
+    def check_if_backtest_prediction_exists(
+        self
+    ) -> bool:
+        """
+        Check if a backtesting prediction already exists. The expected file
+        path is also stored in `self.backtesting_results_path`.
+        :return:
+        :boolean: whether the prediction file exists or not.
+        """
+        path_to_predictionfile = Path(self.full_path /
+                                      self.backtest_predictions_folder /
+                                      f"{self.model_filename}_prediction.h5")
+        self.backtesting_results_path = path_to_predictionfile
+
+        file_exists = path_to_predictionfile.is_file()
+        if file_exists:
+            logger.info(f"Found backtesting prediction file at {path_to_predictionfile}")
+        else:
+            logger.info(
+                f"Could not find backtesting prediction file at {path_to_predictionfile}"
+            )
+        return file_exists
diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py
index 9c7ef05a7..399568c7d 100644
--- a/freqtrade/freqai/freqai_interface.py
+++ b/freqtrade/freqai/freqai_interface.py
@@ -71,7 +71,9 @@ class IFreqaiModel(ABC):
         self.first = True
         self.set_full_path()
         self.follow_mode: bool = self.freqai_info.get("follow_mode", False)
-        self.backtest_save_model: bool = self.freqai_info.get("backtest_save_model", True)
+        self.save_backtest_models: bool = self.freqai_info.get("save_backtest_models", False)
+        if self.save_backtest_models:
+            logger.info('Backtesting module configured to save all models.')
         self.dd = FreqaiDataDrawer(Path(self.full_path), self.config, self.follow_mode)
         self.identifier: str = self.freqai_info.get("identifier", "no_id_provided")
         self.scanning = False
@@ -125,10 +127,9 @@ class IFreqaiModel(ABC):
         elif not self.follow_mode:
             self.dk = FreqaiDataKitchen(self.config, self.live, metadata["pair"])
             logger.info(f"Training {len(self.dk.training_timeranges)} timeranges")
-            with self.analysis_lock:
-                dataframe = self.dk.use_strategy_to_populate_indicators(
-                    strategy, prediction_dataframe=dataframe, pair=metadata["pair"]
-                )
+            dataframe = self.dk.use_strategy_to_populate_indicators(
+                strategy, prediction_dataframe=dataframe, pair=metadata["pair"]
+            )
             dk = self.start_backtesting(dataframe, metadata, self.dk)
 
         dataframe = dk.remove_features_from_df(dk.return_dataframe)
@@ -232,10 +233,9 @@ class IFreqaiModel(ABC):
                 f"sub-train-{metadata['pair'].split('/')[0]}_{trained_timestamp_int}"
                 )
 
-            coin, _ = metadata["pair"].split("/")
-            dk.model_filename = f"cb_{coin.lower()}_{trained_timestamp_int}"
+            dk.set_new_model_names(metadata["pair"], trained_timestamp)
 
-            if self.backtest_prediction_exists(dk):
+            if dk.check_if_backtest_prediction_exists():
                 append_df = dk.get_backtesting_prediction()
                 dk.append_predictions(append_df)
             else:
@@ -246,8 +246,9 @@ class IFreqaiModel(ABC):
                     self.model = self.train(dataframe_train, metadata["pair"], dk)
                     self.dd.pair_dict[metadata["pair"]]["trained_timestamp"] = int(
                         trained_timestamp.stopts)
-                    dk.set_new_model_names(metadata["pair"], trained_timestamp)
-                    if self.backtest_save_model:
+
+                    if self.save_backtest_models:
+                        logger.info('Saving backtest model to disk.')
                         self.dd.save_data(self.model, metadata["pair"], dk)
                 else:
                     self.model = self.dd.load_data(metadata["pair"], dk)
@@ -644,35 +645,6 @@ class IFreqaiModel(ABC):
                 self.train_time = 0
         return
 
-    def backtest_prediction_exists(
-        self,
-        dk: FreqaiDataKitchen,
-        scanning: bool = False,
-    ) -> bool:
-        """
-        Check if a backtesting prediction already exists
-        :param dk: FreqaiDataKitchen
-        :return:
-        :boolean: whether the prediction file exists or not.
-        """
-        if not self.live:
-            prediction_file_name = dk.model_filename
-            path_to_predictionfile = Path(dk.full_path /
-                                          dk.backtesting_prediction_folder /
-                                          f"{prediction_file_name}_prediction.h5")
-            dk.backtesting_results_path = path_to_predictionfile
-
-            file_exists = path_to_predictionfile.is_file()
-            if file_exists and not scanning:
-                logger.info("Found backtesting prediction file at %s", prediction_file_name)
-            elif not scanning:
-                logger.info(
-                    "Could not find backtesting prediction file at %s", prediction_file_name
-                )
-            return file_exists
-        else:
-            return False
-
     # Following methods which are overridden by user made prediction models.
     # See freqai/prediction_models/CatboostPredictionModel.py for an example.
 
diff --git a/freqtrade/templates/FreqaiExampleStrategy.py b/freqtrade/templates/FreqaiExampleStrategy.py
index aa584bfbc..0e822a028 100644
--- a/freqtrade/templates/FreqaiExampleStrategy.py
+++ b/freqtrade/templates/FreqaiExampleStrategy.py
@@ -44,7 +44,7 @@ class FreqaiExampleStrategy(IStrategy):
     stoploss = -0.05
     use_exit_signal = True
     # this is the maximum period fed to talib (timeframe independent)
-    startup_candle_count: int = 20
+    startup_candle_count: int = 40
     can_short = False
 
     linear_roi_offset = DecimalParameter(
diff --git a/tests/freqai/test_freqai_interface.py b/tests/freqai/test_freqai_interface.py
index 09f5d27ff..5441b3c24 100644
--- a/tests/freqai/test_freqai_interface.py
+++ b/tests/freqai/test_freqai_interface.py
@@ -174,6 +174,7 @@ def test_train_model_in_series_LightGBMClassifier(mocker, freqai_conf):
 
 def test_start_backtesting(mocker, freqai_conf):
     freqai_conf.update({"timerange": "20180120-20180130"})
+    freqai_conf.get("freqai", {}).update({"save_backtest_models": True})
     strategy = get_patched_freqai_strategy(mocker, freqai_conf)
     exchange = get_patched_exchange(mocker, freqai_conf)
     strategy.dp = DataProvider(freqai_conf, exchange)
@@ -200,6 +201,7 @@ def test_start_backtesting(mocker, freqai_conf):
 def test_start_backtesting_subdaily_backtest_period(mocker, freqai_conf):
     freqai_conf.update({"timerange": "20180120-20180124"})
     freqai_conf.get("freqai", {}).update({"backtest_period_days": 0.5})
+    freqai_conf.get("freqai", {}).update({"save_backtest_models": True})
     strategy = get_patched_freqai_strategy(mocker, freqai_conf)
     exchange = get_patched_exchange(mocker, freqai_conf)
     strategy.dp = DataProvider(freqai_conf, exchange)
@@ -224,6 +226,7 @@ def test_start_backtesting_subdaily_backtest_period(mocker, freqai_conf):
 
 def test_start_backtesting_from_existing_folder(mocker, freqai_conf, caplog):
     freqai_conf.update({"timerange": "20180120-20180130"})
+    freqai_conf.get("freqai", {}).update({"save_backtest_models": True})
     strategy = get_patched_freqai_strategy(mocker, freqai_conf)
     exchange = get_patched_exchange(mocker, freqai_conf)
     strategy.dp = DataProvider(freqai_conf, exchange)
@@ -263,10 +266,14 @@ def test_start_backtesting_from_existing_folder(mocker, freqai_conf, caplog):
     freqai.start_backtesting(df, metadata, freqai.dk)
 
     assert log_has_re(
-        "Found backtesting prediction ",
+        "Found backtesting prediction file ",
         caplog,
     )
 
+    path = (freqai.dd.full_path / freqai.dk.backtest_predictions_folder)
+    prediction_files = [x for x in path.iterdir() if x.is_file()]
+    assert len(prediction_files) == 5
+
     shutil.rmtree(Path(freqai.dk.full_path))