diff --git a/.gitignore b/.gitignore index 97f77f779..a066001db 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,10 @@ logfile.txt user_data/* !user_data/strategy/sample_strategy.py !user_data/notebooks +!user_data/models +!user_data/freqaimodels +user_data/freqaimodels/* +user_data/models/* user_data/notebooks/* freqtrade-plot.html freqtrade-profit-plot.html @@ -105,3 +109,5 @@ target/ !config_examples/config_ftx.example.json !config_examples/config_full.example.json !config_examples/config_kraken.example.json +!config_examples/config_freqai_futures.example.json +!config_examples/config_freqai_spot.example.json diff --git a/config_examples/config_freqai_futures.example.json b/config_examples/config_freqai_futures.example.json new file mode 100644 index 000000000..e5207a906 --- /dev/null +++ b/config_examples/config_freqai_futures.example.json @@ -0,0 +1,102 @@ +{ + "trading_mode": "futures", + "margin_mode": "isolated", + "max_open_trades": 5, + "stake_currency": "USDT", + "stake_amount": 200, + "tradable_balance_ratio": 1, + "fiat_display_currency": "USD", + "dry_run": true, + "timeframe": "3m", + "dry_run_wallet": 1000, + "cancel_open_orders_on_exit": true, + "unfilledtimeout": { + "entry": 10, + "exit": 30 + }, + "exchange": { + "name": "okx", + "key": "", + "secret": "", + "ccxt_config": { + "enableRateLimit": true + }, + "ccxt_async_config": { + "enableRateLimit": true, + "rateLimit": 200 + }, + "pair_whitelist": [ + "AGLD/USDT:USDT", + "1INCH/USDT:USDT", + "AAVE/USDT:USDT", + "ALGO/USDT:USDT", + "ALPHA/USDT:USDT", + "API3/USDT:USDT", + "AVAX/USDT:USDT", + "AXS/USDT:USDT", + "BCH/USDT:USDT" + ], + "pair_blacklist": [] + }, + "entry_pricing": { + "price_side": "same", + "use_order_book": true, + "order_book_top": 1, + "price_last_balance": 0.0, + "check_depth_of_market": { + "enabled": false, + "bids_to_ask_delta": 1 + } + }, + "exit_pricing": { + "price_side": "other", + "use_order_book": true, + "order_book_top": 1 + }, + "pairlists": [ + { + "method": "StaticPairList" + } + ], + "freqai": { + "startup_candles": 10000, + "timeframes": [ + "3m", + "15m", + "1h" + ], + "train_period": 20, + "backtest_period": 0.001, + "identifier": "constant_retrain_live", + "live_trained_timestamp": 0, + "corr_pairlist": [ + "BTC/USDT:USDT", + "ETH/USDT:USDT" + ], + "feature_parameters": { + "period": 20, + "shift": 2, + "DI_threshold": 0.9, + "weight_factor": 0.9, + "principal_component_analysis": false, + "use_SVM_to_remove_outliers": true, + "stratify": 0, + "indicator_max_period": 20, + "indicator_periods": [10, 20] + }, + "data_split_parameters": { + "test_size": 0.33, + "random_state": 1 + }, + "model_training_parameters": { + "n_estimators": 1000, + "task_type": "CPU" + } + }, + "bot_name": "", + "force_entry_enable": true, + "initial_state": "running", + "internals": { + "process_throttle_secs": 5 + } +} diff --git a/config_examples/config_freqai_spot.example.json b/config_examples/config_freqai_spot.example.json new file mode 100644 index 000000000..7ff01cc07 --- /dev/null +++ b/config_examples/config_freqai_spot.example.json @@ -0,0 +1,97 @@ +{ + "max_open_trades": 1, + "stake_currency": "USDT", + "stake_amount": 900, + "tradable_balance_ratio": 1, + "fiat_display_currency": "USD", + "dry_run": true, + "timeframe": "5m", + "dry_run_wallet": 4000, + "dataformat_ohlcv": "json", + "cancel_open_orders_on_exit": true, + "unfilledtimeout": { + "entry": 10, + "exit": 30 + }, + "exchange": { + "name": "binance", + "key": "", + "secret": "", + "ccxt_config": { + "enableRateLimit": true + }, + 
"ccxt_async_config": { + "enableRateLimit": true, + "rateLimit": 200 + }, + "pair_whitelist": [ + "BTC/USDT", + "ETH/USDT" + ], + "pair_blacklist": [] + }, + "entry_pricing": { + "price_side": "same", + "use_order_book": true, + "order_book_top": 1, + "price_last_balance": 0.0, + "check_depth_of_market": { + "enabled": false, + "bids_to_ask_delta": 1 + } + }, + "exit_pricing": { + "price_side": "other", + "use_order_book": true, + "order_book_top": 1 + }, + "pairlists": [ + { + "method": "StaticPairList" + } + ], + "freqai": { + "startup_candles": 10000, + "timeframes": [ + "5m", + "15m", + "4h" + ], + "train_period": 30, + "backtest_period": 7, + "identifier": "example", + "live_trained_timestamp": 0, + "corr_pairlist": [ + "BTC/USDT", + "ETH/USDT", + "DOT/USDT", + "MATIC/USDT", + "SOL/USDT" + ], + "feature_parameters": { + "period": 500, + "shift": 1, + "DI_threshold": 0, + "weight_factor": 0.9, + "principal_component_analysis": false, + "use_SVM_to_remove_outliers": false, + "stratify": 0, + "indicator_max_period": 50, + "indicator_periods": [10, 20] + }, + "data_split_parameters": { + "test_size": 0.33, + "random_state": 1 + }, + "model_training_parameters": { + "n_estimators": 1000, + "task_type": "CPU" + } + }, + "bot_name": "", + "initial_state": "running", + "forcebuy_enable": false, + "internals": { + "process_throttle_secs": 5 + } +} diff --git a/docker/Dockerfile.freqai b/docker/Dockerfile.freqai new file mode 100644 index 000000000..98e6e31d5 --- /dev/null +++ b/docker/Dockerfile.freqai @@ -0,0 +1,17 @@ +ARG sourceimage=freqtradeorg/freqtrade +ARG sourcetag=develop +FROM ${sourceimage}:${sourcetag} + +USER root + +RUN apt-get install -y libgomp1 + +USER ftuser + +# Install dependencies +COPY requirements-freqai.txt /freqtrade/ + +RUN pip install -r requirements-freqai.txt --user --no-cache-dir +# Temporary step - as the source image will contain the wrong (non-freqai) sourcecode +COPY --chown=ftuser:ftuser . /freqtrade/ + diff --git a/docs/assets/weights_factor.png b/docs/assets/weights_factor.png new file mode 100644 index 000000000..1171a49ba Binary files /dev/null and b/docs/assets/weights_factor.png differ diff --git a/docs/freqai.md b/docs/freqai.md new file mode 100644 index 000000000..647ffcecd --- /dev/null +++ b/docs/freqai.md @@ -0,0 +1,534 @@ +# Freqai + +!!! Note + Freqai is still experimental, and should be used at the user's own discretion. + +Freqai is a module designed to automate a variety of tasks associated with +training a predictive model to provide signals based on input features. + +Among the the features included: + +* Easy large feature set construction based on simple user input +* Sweep model training and backtesting to simulate consistent model retraining through time +* Smart outlier removal of data points from prediction sets using a Dissimilarity Index. +* Data dimensionality reduction with Principal Component Analysis +* Automatic file management for storage of models to be reused during live +* Smart and safe data standardization +* Cleaning of NaNs from the data set before training and prediction. +* Automated live retraining (still VERY experimental. Proceed with caution.) + +## General approach + +The user provides FreqAI with a set of custom indicators (created inside the strategy the same way +a typical Freqtrade strategy is created) as well as a target value (typically some price change into +the future). FreqAI trains a model to predict the target value based on the input of custom indicators. 
+FreqAI will train and save a new model for each pair in the config whitelist.
+Users employ FreqAI to backtest a strategy (emulating reality by retraining the model as new data is
+introduced) and to run the model live to generate buy and sell signals.
+
+## Background and vocabulary
+
+**Features** are the quantities with which a model is trained. $X_i$ represents the
+vector of all features for a single candle. In FreqAI, the user
+builds the features from anything they can construct in the strategy.
+
+**Labels** are the target values that the model is trained to predict. Each set of
+features is associated with a single label, which is also
+defined within the strategy by the user. These labels look forward into the
+future, and are not available to the model during dry-run/live/backtesting.
+
+**Training** refers to the process of feeding individual feature sets into the
+model with their associated labels, with the goal of matching input feature sets to
+those labels.
+
+**Train data** is a subset of the historic data which is fed to the model during
+training to adjust weights. This data directly influences weight connections
+in the model.
+
+**Test data** is a subset of the historic data which is used to evaluate the
+intermediate performance of the model during training. This data does not
+directly influence nodal weights within the model.
+
+## Install prerequisites
+
+Use `pip` to install the prerequisites with:
+
+`pip install -r requirements-freqai.txt`
+
+## Running from the example files
+
+An example strategy, an example prediction model, and an example config can be found in
+`freqtrade/templates/FreqaiExampleStrategy.py`,
+`freqtrade/freqai/prediction_models/CatboostPredictionModel.py`, and
+`config_examples/config_freqai_spot.example.json`, respectively. Assuming the user has downloaded
+the necessary data, FreqAI can be executed from these templates with:
+
+```bash
+freqtrade backtesting --config config_examples/config_freqai_spot.example.json --strategy FreqaiExampleStrategy --freqaimodel CatboostPredictionModel --strategy-path freqtrade/templates --timerange 20220101-20220201
+```
+
+## Configuring the bot
+
+### Example config file
+
+The user interface is isolated to the typical config file. A typical FreqAI
+config setup includes:
+
+```json
+    "freqai": {
+        "startup_candles": 10000,
+        "timeframes": ["5m", "15m", "4h"],
+        "train_period": 30,
+        "backtest_period": 7,
+        "identifier": "unique-id",
+        "corr_pairlist": [
+            "ETH/USD",
+            "LINK/USD",
+            "BNB/USD"
+        ],
+        "feature_parameters": {
+            "period": 24,
+            "shift": 2,
+            "weight_factor": 0
+        },
+        "data_split_parameters": {
+            "test_size": 0.25,
+            "random_state": 42
+        },
+        "model_training_parameters": {
+            "n_estimators": 100,
+            "random_state": 42,
+            "learning_rate": 0.02,
+            "task_type": "CPU"
+        }
+    }
+```
+
+### Building the feature set
+
+!!! Note
+    This section is slightly out of date; please refer to `templates/FreqaiExampleStrategy.py` for the updated method.
+Features are added by the user inside the `populate_any_indicators()` method of the strategy
+by prepending indicators with `%`:
+
+```python
+    def populate_any_indicators(self, metadata, pair, df, tf, informative=None, coin=""):
+        informative["%-" + coin + "rsi"] = ta.RSI(informative, timeperiod=14)
+        informative["%-" + coin + "mfi"] = ta.MFI(informative, timeperiod=25)
+        informative["%-" + coin + "adx"] = ta.ADX(informative, window=20)
+        bollinger = qtpylib.bollinger_bands(qtpylib.typical_price(informative), window=14, stds=2.2)
+        informative[coin + "bb_lowerband"] = bollinger["lower"]
+        informative[coin + "bb_middleband"] = bollinger["mid"]
+        informative[coin + "bb_upperband"] = bollinger["upper"]
+        informative["%-" + coin + "bb_width"] = (
+            informative[coin + "bb_upperband"] - informative[coin + "bb_lowerband"]
+        ) / informative[coin + "bb_middleband"]
+
+        # The following code automatically adds features according to the `shift` parameter passed
+        # in the config. Do not remove.
+        indicators = [col for col in informative if col.startswith("%")]
+        for n in range(self.freqai_info["feature_parameters"]["shift"] + 1):
+            if n == 0:
+                continue
+            informative_shift = informative[indicators].shift(n)
+            informative_shift = informative_shift.add_suffix("_shift-" + str(n))
+            informative = pd.concat((informative, informative_shift), axis=1)
+
+        # The following code safely merges into the base timeframe.
+        # Do not remove.
+        df = merge_informative_pair(df, informative, self.config["timeframe"], tf, ffill=True)
+        skip_columns = [(s + "_" + tf) for s in ["date", "open", "high", "low", "close", "volume"]]
+        df = df.drop(columns=skip_columns)
+
+        return df
+```
+
+The user in the present example does not want to pass `bb_lowerband` as a feature to the model,
+and has therefore not prepended it with `%`. The user does, however, wish to pass `bb_width` to the
+model for training/prediction and has therefore prepended it with `%`.
+
+Note: features **must** be defined in `populate_any_indicators()`. Defining features in `populate_indicators()`
+will fail in live/dry mode. If the user wishes to add generalized features that are not associated with
+a specific pair or timeframe, they should use the following structure inside `populate_any_indicators()`
+(as exemplified in `freqtrade/templates/FreqaiExampleStrategy.py`):
+
+```python
+    def populate_any_indicators(self, metadata, pair, df, tf, informative=None, coin=""):
+
+        # Add generalized indicators here (because in live, it will call only this function
+        # to populate indicators for retraining). Notice how we ensure not to add them multiple
+        # times by associating these generalized indicators to the basepair/timeframe.
+        if pair == metadata["pair"] and tf == self.timeframe:
+            df["%-day_of_week"] = (df["date"].dt.dayofweek + 1) / 7
+            df["%-hour_of_day"] = (df["date"].dt.hour + 1) / 25
+```
+
+(Please see the example script located in `freqtrade/templates/FreqaiExampleStrategy.py` for a full example of `populate_any_indicators()`.)
+
+The `timeframes` in the example config above are the timeframes on which each indicator
+created in `populate_any_indicators()` is computed for inclusion in the feature set. In the present case,
+the user is asking for the `5m`, `15m`, and `4h` timeframes of the `rsi`, `mfi`, `adx`, and `bb_width`
+to be included in the feature set.
+
+In addition, the user can ask for each of these features to be included from
+informative pairs using the `corr_pairlist`.
+This means that the present feature set will include all the `base_features` on all the `timeframes`
+for each of `ETH/USD`, `LINK/USD`, and `BNB/USD`.
+
+`shift` is another user-controlled parameter which indicates the number of previous
+candles to include in the present feature set. In other words, `shift: 2` tells
+FreqAI to include the past 2 candles for each of the features included
+in the dataset.
+
+In total, the number of features the present user has created is:
+
+no. `timeframes` * no. `base_features` * no. `corr_pairlist` * no. `shift` =
+3 * 4 * 3 * 2 = 72.
+
+### Deciding the sliding training window and backtesting duration
+
+Users define the backtesting timerange with the typical `--timerange` parameter in the user
+configuration file. `train_period` is the duration of the sliding training window, while
+`backtest_period` is the sliding backtesting window, both in number of days (`backtest_period` can be
+a float to indicate sub-daily retraining in live/dry mode). In the present example,
+the user is asking FreqAI to use a training period of 30 days and backtest the subsequent 7 days.
+This means that if the user sets `--timerange 20210501-20210701`,
+FreqAI will train 8 separate models (because the full range comprises 8 weeks),
+and then backtest the week following each of the 8 training
+timeranges. Users can think of this as a "sliding window" which
+emulates FreqAI retraining itself once per week in live using the previous
+month of data.
+
+In live, the required training data is automatically computed and downloaded. However, in backtesting
+the user must manually enter the required number of `startup_candles` in the config. This value
+is used to increase the available data to FreqAI and should be sufficient to enable all indicators
+to be NaN-free at the beginning of the first training timerange. This boils down to identifying the
+highest timeframe (`4h` in the present example) and the longest indicator period (25 in the present example)
+and adding this to the `train_period`. The units need to be in the base candle timeframe:
+
+`startup_candles` = ( 4 hours * 25 max period * 60 minutes/hour + 30 day train_period * 1440 minutes per day ) / 5 min (base timeframe) = 9840.
+
+!!! Note
+    In dry/live, this is all precomputed and handled automatically. Thus, `startup_candles` has no
+    influence on dry/live.
+
+## Running FreqAI
+
+### Training and backtesting
+
+The FreqAI training/backtesting module can be executed with the following command:
+
+```bash
+freqtrade backtesting --strategy FreqaiExampleStrategy --config config_examples/config_freqai_spot.example.json --freqaimodel CatboostPredictionModel --timerange 20210501-20210701
+```
+
+If this command has never been executed with the existing config file, then it will train a new model
+for each pair, for each backtesting window within the bigger `--timerange`.
+
+!!! Note
+    Once the training is completed, the user can execute this again with the same config file and
+    FreqAI will find the trained models and load them instead of spending time training. This is useful
+    if the user wants to tweak (or even hyperopt) buy and sell criteria inside the strategy. If the user
+    *wants* to retrain a new model with the same config file, they should simply change the `identifier`.
+    This way, the user can return to using any model they wish by simply changing the `identifier`.
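+As a quick sanity check of the `startup_candles` arithmetic above, here is a minimal
+standalone sketch (illustrative only, not part of FreqAI; the helper name is hypothetical)
+that reproduces the calculation from the config values:
+
+```python
+# Illustrative sketch: reproduce the startup_candles arithmetic from
+# "Deciding the sliding training window and backtesting duration" above.
+def compute_startup_candles(max_tf_minutes: int, max_indicator_period: int,
+                            train_period_days: int, base_tf_minutes: int) -> int:
+    indicator_minutes = max_tf_minutes * max_indicator_period
+    train_minutes = train_period_days * 1440  # minutes per day
+    return (indicator_minutes + train_minutes) // base_tf_minutes
+
+# 4h highest timeframe, longest indicator period of 25, 30 day train_period, 5m base timeframe
+print(compute_startup_candles(240, 25, 30, 5))  # -> 9840
+```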
+### Building a FreqAI strategy
+
+The FreqAI strategy requires the user to include the following lines of code in the strategy:
+
+```python
+    from freqtrade.freqai.strategy_bridge import CustomModel
+
+    def informative_pairs(self):
+        whitelist_pairs = self.dp.current_whitelist()
+        corr_pairs = self.config["freqai"]["corr_pairlist"]
+        informative_pairs = []
+        for tf in self.config["freqai"]["timeframes"]:
+            for pair in whitelist_pairs:
+                informative_pairs.append((pair, tf))
+            for pair in corr_pairs:
+                if pair in whitelist_pairs:
+                    continue  # avoid duplication
+                informative_pairs.append((pair, tf))
+        return informative_pairs
+
+    def bot_start(self):
+        self.model = CustomModel(self.config)
+
+    def populate_indicators(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
+        self.freqai_info = self.config["freqai"]
+
+        # the following loops are necessary for building the features
+        # indicated by the user in the configuration file.
+        for tf in self.freqai_info["timeframes"]:
+            for i in self.freqai_info["corr_pairlist"]:
+                dataframe = self.populate_any_indicators(
+                    metadata, i, dataframe.copy(), tf, coin=i.split("/")[0] + "-"
+                )
+
+        # the model returns 4 values: the prediction, an indication of whether or not the
+        # prediction should be accepted, and the target mean/std values from the labels
+        # used during each training period.
+        (dataframe["prediction"], dataframe["do_predict"],
+         dataframe["target_mean"], dataframe["target_std"]) = self.model.bridge.start(dataframe, metadata)
+
+        return dataframe
+```
+
+The user should also include `populate_any_indicators()` from `templates/FreqaiExampleStrategy.py`, which builds
+the feature set with a proper naming convention for the IFreqaiModel to use later.
+
+### Building an IFreqaiModel
+
+FreqAI has an example prediction model based on the popular `Catboost` regression
+(`freqai/prediction_models/CatboostPredictionModel.py`). However, users can customize and create
+their own prediction models using the `IFreqaiModel` class. Users are encouraged to override `train()`,
+`predict()`, and `make_labels()` to customize various aspects of their training procedures.
+
+### Running the model live
+
+FreqAI can be run dry/live using the following command:
+
+```bash
+freqtrade trade --strategy FreqaiExampleStrategy --config config_examples/config_freqai_spot.example.json --freqaimodel CatboostPredictionModel
+```
+
+By default, FreqAI will not find any existing models and will start by training a new one
+given the user configuration settings. Following training, it will use that model to predict for the
+duration of `backtest_period`. After a full `backtest_period` has elapsed, FreqAI will auto retrain
+a new model, and begin making predictions with the updated model. FreqAI backtesting and live both
+permit the user to use fractional days (i.e. 0.1) in the `backtest_period`, which enables more frequent
+retraining. However, the user should be careful that using a fractional `backtest_period` with a large
+`--timerange` in backtesting will result in a huge number of required trainings/models.
+
+If the user wishes to start a dry/live run from a backtested saved model, the user only needs to reuse
+the same `identifier` parameter:
+
+```json
+    "freqai": {
+        "identifier": "example"
+    }
+```
+
+In this case, although FreqAI will initiate with a
+pre-trained model, it will still check to see how much time has elapsed since the model was trained,
+and if a full `backtest_period` has elapsed since the end of the loaded model, FreqAI will self retrain.
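+The elapsed-time check described above can be pictured with a minimal sketch (illustrative
+only, not the actual FreqAI source; the helper name is hypothetical):
+
+```python
+# Illustrative sketch: retrain once a full backtest_period has elapsed since
+# the end of the loaded model's training data.
+import time
+
+SECONDS_IN_DAY = 86400
+
+def should_retrain(trained_timestamp: int, backtest_period_days: float) -> bool:
+    elapsed = time.time() - trained_timestamp
+    return elapsed > backtest_period_days * SECONDS_IN_DAY
+
+# e.g. backtest_period = 0.5 would trigger retraining roughly twice per day
+```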
+## Data analysis techniques
+
+### Controlling the model learning process
+
+The user can define model settings for the data split with `data_split_parameters` and learning parameters
+with `model_training_parameters`. Users are encouraged to visit the Catboost documentation
+for more information on how to select these values. `n_estimators` increases the
+computational effort and the fit to the training data. If a user has a GPU
+installed in their system, they may benefit from changing `task_type` to `GPU`.
+The `weight_factor` allows the user to weight more recent data more strongly
+than past data via an exponential function:
+
+$$ W_i = \exp(\frac{-i}{\alpha*n}) $$
+
+where $W_i$ is the weight of data point $i$ in a total set of $n$ data points,
+and $\alpha$ is the user-set `weight_factor`.
+
+![weight-factor](assets/weights_factor.png)
+
+Finally, `period` defines the offset used for the `labels`. In the present example,
+the user is asking for `labels` that are 24 candles in the future.
+
+### Removing outliers with the Dissimilarity Index
+
+The Dissimilarity Index (DI) aims to quantify the uncertainty associated with each
+prediction made by the model. To do so, FreqAI measures the distance between each training
+data point and all other training data points:
+
+$$ d_{ab} = \sqrt{\sum_{j=1}^p(X_{a,j}-X_{b,j})^2} $$
+
+where $d_{ab}$ is the distance between the normalized points $a$ and $b$, and $p$
+is the number of features, i.e. the length of the vector $X$. The
+characteristic distance, $\overline{d}$, for a set of training data points is simply the mean
+of the average distances:
+
+$$ \overline{d} = \sum_{a=1}^n(\sum_{b=1}^n(d_{ab}/n)/n) $$
+
+$\overline{d}$ quantifies the spread of the training data, which is compared to
+the distance between each new prediction feature vector, $X_k$, and all the training
+data:
+
+$$ d_k = \min_i d_{k,i} $$
+
+which enables the estimation of a Dissimilarity Index:
+
+$$ DI_k = d_k/\overline{d} $$
+
+Equity and crypto markets suffer from a high level of non-patterned noise in the
+form of outlier data points. The Dissimilarity Index allows predictions which
+are outliers (not existent in the model feature space) to be thrown out due
+to low levels of certainty. Activating the Dissimilarity Index can be achieved with:
+
+```json
+    "freqai": {
+        "feature_parameters": {
+            "DI_threshold": 1
+        }
+    }
+```
+
+The user can tweak the DI with `DI_threshold` to increase or decrease the extrapolation of the
+trained model.
+
+### Reducing data dimensionality with Principal Component Analysis
+
+Users can reduce the dimensionality of their features by activating `principal_component_analysis`:
+
+```json
+    "freqai": {
+        "feature_parameters": {
+            "principal_component_analysis": true
+        }
+    }
+```
+
+which will perform PCA on the features and reduce the dimensionality of the data so that the explained
+variance of the data set is >= 0.999.
+
+### Removing outliers using a Support Vector Machine (SVM)
+
+The user can tell FreqAI to remove outlier data points from the training/test data sets by setting:
+
+```json
+    "freqai": {
+        "feature_parameters": {
+            "use_SVM_to_remove_outliers": true
+        }
+    }
+```
+
+FreqAI will train an SVM on the training data (or components, if the user activated
+`principal_component_analysis`) and remove any data point that it deems to sit beyond the
+feature space.
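+A minimal sketch of this outlier filter follows (simplified and illustrative, not the exact
+FreqAI implementation; FreqAI uses `sklearn`'s `SGDOneClassSVM` internally, and the `nu`
+value below is an arbitrary assumption):
+
+```python
+# Illustrative sketch: fit a one-class SVM on the (normalized) training
+# features and keep only the points it labels as inliers (+1).
+import numpy as np
+from sklearn.linear_model import SGDOneClassSVM
+
+def drop_svm_outliers(train_features: np.ndarray) -> np.ndarray:
+    svm = SGDOneClassSVM(nu=0.1).fit(train_features)
+    inlier_mask = svm.predict(train_features) == 1
+    return train_features[inlier_mask]
+```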
+### Stratifying the data
+
+The user can stratify the training/testing data using:
+
+```json
+    "freqai": {
+        "feature_parameters": {
+            "stratify": 3
+        }
+    }
+```
+
+which will split the data chronologically so that every Xth data point is a testing data point. In the
+present example, the user is asking for every third data point in the dataframe to be used for
+testing; the other points are used for training.
+
+### Setting up a follower
+
+The user can define:
+
+```json
+    "freqai": {
+        "follow_mode": true,
+        "identifier": "example"
+    }
+```
+
+to indicate to the bot that it should not train models, but instead should look for models trained
+by a leader with the same `identifier`. In this example, the user has a leader bot with the
+`identifier: "example"` already running or launched simultaneously with the present follower.
+The follower will load models created by the leader and use them to make predictions.
+
+### Purging old model data
+
+FreqAI stores new model files each time it retrains. These files become obsolete as new models
+are trained and FreqAI adapts to the new market conditions. Users planning to leave FreqAI running
+for extended periods of time with high-frequency retraining should set `purge_old_models` in their
+config:
+
+```json
+    "freqai": {
+        "purge_old_models": true
+    }
+```
+
+which will automatically purge all models older than the two most recently trained ones.
+
+## Defining model expirations
+
+During dry/live, FreqAI trains each pair sequentially (on separate threads/GPU from the main
+Freqtrade bot). This means there is always an age discrepancy between models. If a user is training
+on 50 pairs, and each pair requires 5 minutes to train, the oldest model will be over 4 hours old.
+This may be undesirable if the characteristic time scale (read: trade duration target) for a strategy
+is much less than 4 hours. The user can decide to only make trade entries if the model is less than
+a certain number of hours old by setting `expiration_hours` in the config file:
+
+```json
+    "freqai": {
+        "expiration_hours": 0.5
+    }
+```
+
+In the present example, the user will only allow predictions from models that are less than half an hour
+old.
+
+## Additional information
+
+### Feature normalization
+
+The feature set created by the user is normalized using statistics from the training
+data only; the same transformation is then applied to the test data and to unseen prediction data
+(dry/live/backtest).
+
+### File structure
+
+`user_data_dir/models/` contains all the data associated with the trainings and
+backtests. This file structure is heavily controlled and read by the `FreqaiDataKitchen()`
+and should thus not be modified.
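+A minimal sketch of the train-referenced normalization described under "Feature normalization"
+above (simplified from `FreqaiDataKitchen.normalize_data()`, which scales each feature to
+[-1, 1] using the training min/max; the helper name is hypothetical):
+
+```python
+# Illustrative sketch: scale features to [-1, 1] using *training* statistics,
+# then apply the identical transform to test/prediction data.
+import pandas as pd
+
+def normalize_with_train_stats(train: pd.DataFrame, other: pd.DataFrame):
+    train_min, train_max = train.min(), train.max()
+
+    def scale(df: pd.DataFrame) -> pd.DataFrame:
+        return 2 * (df - train_min) / (train_max - train_min) - 1
+
+    return scale(train), scale(other)
+```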
diff --git a/freqtrade/commands/arguments.py b/freqtrade/commands/arguments.py
index 1e3e2845a..661fc6fcb 100644
--- a/freqtrade/commands/arguments.py
+++ b/freqtrade/commands/arguments.py
@@ -12,7 +12,8 @@ from freqtrade.constants import DEFAULT_CONFIG
 
 ARGS_COMMON = ["verbosity", "logfile", "version", "config", "datadir", "user_data_dir"]
 
-ARGS_STRATEGY = ["strategy", "strategy_path", "recursive_strategy_search"]
+ARGS_STRATEGY = ["strategy", "strategy_path", "recursive_strategy_search", "freqaimodel",
+                 "freqaimodel_path"]
 
 ARGS_TRADE = ["db_url", "sd_notify", "dry_run", "dry_run_wallet", "fee"]
diff --git a/freqtrade/commands/cli_options.py b/freqtrade/commands/cli_options.py
index 3370ce64b..f85b75af1 100644
--- a/freqtrade/commands/cli_options.py
+++ b/freqtrade/commands/cli_options.py
@@ -647,4 +647,14 @@ AVAILABLE_CLI_OPTIONS = {
         nargs='+',
         default=[],
     ),
+    "freqaimodel": Arg(
+        '--freqaimodel',
+        help='Specify a custom freqaimodel.',
+        metavar='NAME',
+    ),
+    "freqaimodel_path": Arg(
+        '--freqaimodel-path',
+        help='Specify additional lookup path for freqaimodels.',
+        metavar='PATH',
+    ),
 }
diff --git a/freqtrade/commands/data_commands.py b/freqtrade/commands/data_commands.py
index 61a99782e..311590e64 100644
--- a/freqtrade/commands/data_commands.py
+++ b/freqtrade/commands/data_commands.py
@@ -12,7 +12,7 @@ from freqtrade.enums import CandleType, RunMode, TradingMode
 from freqtrade.exceptions import OperationalException
 from freqtrade.exchange import timeframe_to_minutes
 from freqtrade.exchange.exchange import market_is_active
-from freqtrade.plugins.pairlist.pairlist_helpers import expand_pairlist
+from freqtrade.plugins.pairlist.pairlist_helpers import dynamic_expand_pairlist, expand_pairlist
 from freqtrade.resolvers import ExchangeResolver
 
@@ -50,7 +50,8 @@ def start_download_data(args: Dict[str, Any]) -> None:
     exchange = ExchangeResolver.load_exchange(config['exchange']['name'], config, validate=False)
     markets = [p for p, m in exchange.markets.items() if market_is_active(m)
                or config.get('include_inactive')]
-    expanded_pairs = expand_pairlist(config['pairs'], markets)
+
+    expanded_pairs = dynamic_expand_pairlist(config, markets)
 
     # Manual validations of relevant settings
     if not config['exchange'].get('skip_pair_validation', False):
diff --git a/freqtrade/configuration/config_validation.py b/freqtrade/configuration/config_validation.py
index ee846e7e6..5f1f68554 100644
--- a/freqtrade/configuration/config_validation.py
+++ b/freqtrade/configuration/config_validation.py
@@ -85,6 +85,7 @@ def validate_config_consistency(conf: Dict[str, Any], preliminary: bool = False)
     _validate_unlimited_amount(conf)
     _validate_ask_orderbook(conf)
     validate_migrated_strategy_settings(conf)
+    _validate_freqai(conf)
 
     # validate configuration before returning
     logger.info('Validating configuration ...')
@@ -163,6 +164,21 @@ def _validate_edge(conf: Dict[str, Any]) -> None:
         )
 
 
+def _validate_freqai(conf: Dict[str, Any]) -> None:
+    """
+    FreqAI param validator
+    """
+
+    if not conf.get('freqai', {}):
+        return
+
+    for param in constants.SCHEMA_FREQAI_REQUIRED:
+        if param not in conf.get('freqai', {}):
+            raise OperationalException(
+                f'{param} not found in Freqai config'
+            )
+
+
 def _validate_whitelist(conf: Dict[str, Any]) -> None:
     """
     Dynamic whitelist does not require pair_whitelist to be set - however StaticWhitelist does.
diff --git a/freqtrade/configuration/configuration.py b/freqtrade/configuration/configuration.py
index d46d54cb0..b4f36aa3c 100644
--- a/freqtrade/configuration/configuration.py
+++ b/freqtrade/configuration/configuration.py
@@ -97,6 +97,8 @@ class Configuration:
 
         self._process_analyze_options(config)
 
+        self._process_freqai_options(config)
+
         # Check if the exchange set by the user is supported
         check_exchange(config, config.get('experimental', {}).get('block_bad_exchanges', True))
 
@@ -461,6 +463,16 @@ class Configuration:
 
         config.update({'runmode': self.runmode})
 
+    def _process_freqai_options(self, config: Dict[str, Any]) -> None:
+
+        self._args_to_config(config, argname='freqaimodel',
+                             logstring='Using freqaimodel class name: {}')
+
+        self._args_to_config(config, argname='freqaimodel_path',
+                             logstring='Using freqaimodel path: {}')
+
+        return
+
     def _args_to_config(self, config: Dict[str, Any], argname: str,
                         logstring: str, logfun: Optional[Callable] = None,
                         deprecated_msg: Optional[str] = None) -> None:
diff --git a/freqtrade/constants.py b/freqtrade/constants.py
index 18dbea259..1b5f01d8b 100644
--- a/freqtrade/constants.py
+++ b/freqtrade/constants.py
@@ -55,6 +55,7 @@ FTHYPT_FILEVERSION = 'fthypt_fileversion'
 USERPATH_HYPEROPTS = 'hyperopts'
 USERPATH_STRATEGIES = 'strategies'
 USERPATH_NOTEBOOKS = 'notebooks'
+USERPATH_FREQAIMODELS = 'freqaimodels'
 
 TELEGRAM_SETTING_OPTIONS = ['on', 'off', 'silent']
 WEBHOOK_FORMAT_OPTIONS = ['form', 'json', 'raw']
@@ -472,7 +473,44 @@
             'remove_pumps': {'type': 'boolean'}
         },
         'required': ['process_throttle_secs', 'allowed_risk']
-    }
+    },
+    "freqai": {
+        "type": "object",
+        "properties": {
+            "timeframes": {"type": "array"},
+            "train_period": {"type": "integer", "default": 0},
+            "backtest_period": {"type": "number", "default": 7},
+            "identifier": {"type": "string", "default": "example"},
+            "corr_pairlist": {"type": "array"},
+            "feature_parameters": {
+                "type": "object",
+                "properties": {
+                    "period": {"type": "integer"},
+                    "shift": {"type": "integer", "default": 0},
+                    "DI_threshold": {"type": "number", "default": 0},
+                    "weight_factor": {"type": "number", "default": 0},
+                    "principal_component_analysis": {"type": "boolean", "default": False},
+                    "use_SVM_to_remove_outliers": {"type": "boolean", "default": False},
+                },
+            },
+            "data_split_parameters": {
+                "type": "object",
+                "properties": {
+                    "test_size": {"type": "number"},
+                    "random_state": {"type": "integer"},
+                },
+            },
+            "model_training_parameters": {
+                "type": "object",
+                "properties": {
+                    "n_estimators": {"type": "integer", "default": 2000},
+                    "random_state": {"type": "integer", "default": 1},
+                    "learning_rate": {"type": "number", "default": 0.02},
+                    "task_type": {"type": "string", "default": "CPU"},
+                },
+            },
+        },
+    },
 },
 }
@@ -516,6 +554,17 @@ SCHEMA_MINIMAL_REQUIRED = [
     'dataformat_trades',
 ]
 
+SCHEMA_FREQAI_REQUIRED = [
+    'timeframes',
+    'train_period',
+    'backtest_period',
+    'identifier',
+    'corr_pairlist',
+    'feature_parameters',
+    'data_split_parameters',
+    'model_training_parameters'
+]
+
 CANCEL_REASON = {
     "TIMEOUT": "cancelled due to timeout",
     "PARTIALLY_FILLED_KEEP_OPEN": "partially filled - keeping order open",
diff --git a/freqtrade/exchange/exchange.py b/freqtrade/exchange/exchange.py
index 4febe5652..dc378a438 100644
--- a/freqtrade/exchange/exchange.py
+++ b/freqtrade/exchange/exchange.py
@@ -86,7 +86,7 @@ class Exchange:
         # TradingMode.SPOT always supported and not required in this list
     ]
 
-    def __init__(self, config: Dict[str, Any], validate: bool = True) -> None:
+    def __init__(self, config: Dict[str, Any], validate: bool = True, freqai: bool = False) -> None:
         """
         Initializes this module with the given config,
         it does basic validation whether the specified exchange and pairs are valid.
@@ -196,7 +196,7 @@ class Exchange:
         self.markets_refresh_interval: int = exchange_config.get(
             "markets_refresh_interval", 60) * 60
 
-        if self.trading_mode != TradingMode.SPOT:
+        if self.trading_mode != TradingMode.SPOT and freqai is False:
             self.fill_leverage_tiers()
         self.additional_exchange_init()
diff --git a/freqtrade/freqai/data_drawer.py b/freqtrade/freqai/data_drawer.py
new file mode 100644
index 000000000..3099c98a2
--- /dev/null
+++ b/freqtrade/freqai/data_drawer.py
@@ -0,0 +1,314 @@
+import collections
+import json
+import logging
+import re
+import shutil
+import threading
+from pathlib import Path
+from typing import Any, Dict, Tuple
+
+# import pickle as pk
+import numpy as np
+import pandas as pd
+from pandas import DataFrame
+
+
+# from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
+
+
+logger = logging.getLogger(__name__)
+
+
+class FreqaiDataDrawer:
+    """
+    Class aimed at holding all pair models/info in memory for better inferencing/retraining/
+    saving/loading to/from disk.
+    This object remains persistent throughout live/dry, unlike FreqaiDataKitchen, which is
+    reinstantiated for each coin.
+    """
+
+    def __init__(self, full_path: Path, config: dict, follow_mode: bool = False):
+
+        self.config = config
+        self.freqai_info = config.get("freqai", {})
+        # dictionary holding all pair metadata necessary to load in from disk
+        self.pair_dict: Dict[str, Any] = {}
+        # dictionary holding all actively inferenced models in memory given a model filename
+        self.model_dictionary: Dict[str, Any] = {}
+        self.model_return_values: Dict[str, Any] = {}
+        self.pair_data_dict: Dict[str, Any] = {}
+        self.historic_data: Dict[str, Any] = {}
+        self.follower_dict: Dict[str, Any] = {}
+        self.full_path = full_path
+        self.follow_mode = follow_mode
+        if follow_mode:
+            self.create_follower_dict()
+        self.load_drawer_from_disk()
+        self.training_queue: Dict[str, int] = {}
+        self.history_lock = threading.Lock()
+
+    def load_drawer_from_disk(self):
+        """
+        Locate and load a previously saved data drawer full of all pair model metadata in
+        present model folder.
+        :returns:
+        exists: bool = whether or not the drawer was located
+        """
+        exists = Path(self.full_path / str("pair_dictionary.json")).resolve().exists()
+        if exists:
+            with open(self.full_path / str("pair_dictionary.json"), "r") as fp:
+                self.pair_dict = json.load(fp)
+        elif not self.follow_mode:
+            logger.info("Could not find existing datadrawer, starting from scratch")
+        else:
+            logger.warning(
+                f"Follower could not find pair_dictionary at {self.full_path}, "
+                "sending null values back to strategy"
+            )
+
+        return exists
+
+    def save_drawer_to_disk(self):
+        """
+        Save data drawer full of all pair model metadata in present model folder.
+ """ + with open(self.full_path / str("pair_dictionary.json"), "w") as fp: + json.dump(self.pair_dict, fp, default=self.np_encoder) + + def save_follower_dict_to_disk(self): + """ + Save follower dictionary to disk (used by strategy for persistent prediction targets) + """ + follower_name = self.config.get("bot_name", "follower1") + with open( + self.full_path / str("follower_dictionary-" + follower_name + ".json"), "w" + ) as fp: + json.dump(self.follower_dict, fp, default=self.np_encoder) + + def create_follower_dict(self): + """ + Create or dictionary for each follower to maintain unique persistent prediction targets + """ + follower_name = self.config.get("bot_name", "follower1") + whitelist_pairs = self.config.get("exchange", {}).get("pair_whitelist") + + exists = ( + Path(self.full_path / str("follower_dictionary-" + follower_name + ".json")) + .resolve() + .exists() + ) + + if exists: + logger.info("Found an existing follower dictionary") + + for pair in whitelist_pairs: + self.follower_dict[pair] = {} + + with open( + self.full_path / str("follower_dictionary-" + follower_name + ".json"), "w" + ) as fp: + json.dump(self.follower_dict, fp, default=self.np_encoder) + + def np_encoder(self, object): + if isinstance(object, np.generic): + return object.item() + + def get_pair_dict_info(self, pair: str) -> Tuple[str, int, bool, bool]: + """ + Locate and load existing model metadata from persistent storage. If not located, + create a new one and append the current pair to it and prepare it for its first + training + :params: + metadata: dict = strategy furnished pair metadata + :returns: + model_filename: str = unique filename used for loading persistent objects from disk + trained_timestamp: int = the last time the coin was trained + coin_first: bool = If the coin is fresh without metadata + return_null_array: bool = Follower could not find pair metadata + """ + pair_in_dict = self.pair_dict.get(pair) + data_path_set = self.pair_dict.get(pair, {}).get("data_path", None) + return_null_array = False + + if pair_in_dict: + model_filename = self.pair_dict[pair]["model_filename"] + trained_timestamp = self.pair_dict[pair]["trained_timestamp"] + coin_first = self.pair_dict[pair]["first"] + elif not self.follow_mode: + self.pair_dict[pair] = {} + model_filename = self.pair_dict[pair]["model_filename"] = "" + coin_first = self.pair_dict[pair]["first"] = True + trained_timestamp = self.pair_dict[pair]["trained_timestamp"] = 0 + self.pair_dict[pair]["priority"] = len(self.pair_dict) + + if not data_path_set and self.follow_mode: + logger.warning( + f"Follower could not find current pair {pair} in " + f"pair_dictionary at path {self.full_path}, sending null values " + "back to strategy." 
+ ) + return_null_array = True + + return model_filename, trained_timestamp, coin_first, return_null_array + + def set_pair_dict_info(self, metadata: dict) -> None: + pair_in_dict = self.pair_dict.get(metadata["pair"]) + if pair_in_dict: + return + else: + self.pair_dict[metadata["pair"]] = {} + self.pair_dict[metadata["pair"]]["model_filename"] = "" + self.pair_dict[metadata["pair"]]["first"] = True + self.pair_dict[metadata["pair"]]["trained_timestamp"] = 0 + self.pair_dict[metadata["pair"]]["priority"] = len(self.pair_dict) + return + + def pair_to_end_of_training_queue(self, pair: str) -> None: + # march all pairs up in the queue + for p in self.pair_dict: + self.pair_dict[p]["priority"] -= 1 + # send pair to end of queue + self.pair_dict[pair]["priority"] = len(self.pair_dict) + + def set_initial_return_values(self, pair: str, dk, pred_df, do_preds) -> None: + """ + Set the initial return values to a persistent dataframe. This avoids needing to repredict on + historical candles, and also stores historical predictions despite retrainings (so stored + predictions are true predictions, not just inferencing on trained data) + """ + self.model_return_values[pair] = pd.DataFrame() + for label in dk.label_list: + self.model_return_values[pair][label] = pred_df[label] + self.model_return_values[pair][f"{label}_mean"] = dk.data["labels_mean"][label] + self.model_return_values[pair][f"{label}_std"] = dk.data["labels_std"][label] + + if self.freqai_info.get("feature_parameters", {}).get("DI_threshold", 0) > 0: + self.model_return_values[pair]["DI_values"] = dk.DI_values + + self.model_return_values[pair]["do_predict"] = do_preds + + def append_model_predictions(self, pair: str, predictions, do_preds, dk, len_df) -> None: + + # strat seems to feed us variable sized dataframes - and since we are trying to build our + # own return array in the same shape, we need to figure out how the size has changed + # and adapt our stored/returned info accordingly. 
+        length_difference = len(self.model_return_values[pair]) - len_df
+        i = 0
+
+        if length_difference == 0:
+            i = 1
+        elif length_difference > 0:
+            i = length_difference + 1
+
+        df = self.model_return_values[pair] = self.model_return_values[pair].shift(-i)
+
+        for label in dk.label_list:
+            df[label].iloc[-1] = predictions[label].iloc[-1]
+            df[f"{label}_mean"].iloc[-1] = dk.data["labels_mean"][label]
+            df[f"{label}_std"].iloc[-1] = dk.data["labels_std"][label]
+        # df['prediction'].iloc[-1] = predictions[-1]
+        df["do_predict"].iloc[-1] = do_preds[-1]
+
+        if self.freqai_info.get("feature_parameters", {}).get("DI_threshold", 0) > 0:
+            df["DI_values"].iloc[-1] = dk.DI_values[-1]
+
+        if length_difference < 0:
+            prepend_df = pd.DataFrame(
+                np.zeros((abs(length_difference) - 1, len(df.columns))), columns=df.columns
+            )
+            df = pd.concat([prepend_df, df], axis=0)
+            # store the prepended frame back, otherwise the concat result would be lost
+            self.model_return_values[pair] = df
+
+    def attach_return_values_to_return_dataframe(self, pair: str, dataframe) -> DataFrame:
+        """
+        Attach the return values to the strat dataframe
+        :params:
+        dataframe: DataFrame = strat dataframe
+        :returns:
+        dataframe: DataFrame = strat dataframe with return values attached
+        """
+        df = self.model_return_values[pair]
+        to_keep = [col for col in dataframe.columns if not col.startswith("&")]
+        dataframe = pd.concat([dataframe[to_keep], df], axis=1)
+        return dataframe
+
+    def return_null_values_to_strategy(self, dataframe: DataFrame, dk) -> None:
+        """
+        Build 0 filled dataframe to return to strategy
+        """
+
+        dk.find_features(dataframe)
+
+        for label in dk.label_list:
+            dataframe[label] = 0
+            dataframe[f"{label}_mean"] = 0
+            dataframe[f"{label}_std"] = 0
+
+        # dataframe['prediction'] = 0
+        dataframe["do_predict"] = 0
+
+        if self.freqai_info.get("feature_parameters", {}).get("DI_threshold", 0) > 0:
+            dataframe["DI_values"] = 0
+
+        dk.return_dataframe = dataframe
+
+    def purge_old_models(self) -> None:
+
+        model_folders = [x for x in self.full_path.iterdir() if x.is_dir()]
+
+        pattern = re.compile(r"sub-train-(\w+)(\d{10})")
+
+        delete_dict: Dict[str, Any] = {}
+
+        for dir in model_folders:
+            result = pattern.match(str(dir.name))
+            if result is None:
+                break
+            coin = result.group(1)
+            timestamp = result.group(2)
+
+            if coin not in delete_dict:
+                delete_dict[coin] = {}
+                delete_dict[coin]["num_folders"] = 1
+                delete_dict[coin]["timestamps"] = {int(timestamp): dir}
+            else:
+                delete_dict[coin]["num_folders"] += 1
+                delete_dict[coin]["timestamps"][int(timestamp)] = dir
+
+        for coin in delete_dict:
+            if delete_dict[coin]["num_folders"] > 2:
+                sorted_dict = collections.OrderedDict(
+                    sorted(delete_dict[coin]["timestamps"].items())
+                )
+                num_delete = len(sorted_dict) - 2
+                deleted = 0
+                for k, v in sorted_dict.items():
+                    if deleted >= num_delete:
+                        break
+                    logger.info(f"Freqai purging old model file {v}")
+                    shutil.rmtree(v)
+                    deleted += 1
+
+    def update_follower_metadata(self):
+        # follower needs to load from disk to get any changes made by leader to pair_dict
+        self.load_drawer_from_disk()
+        if self.config.get("freqai", {}).get("purge_old_models", False):
+            self.purge_old_models()
+
+    # to be used if we want to send predictions directly to the follower instead of forcing
+    # follower to load models and inference
+    # def save_model_return_values_to_disk(self) -> None:
+    #     with open(self.full_path / str('model_return_values.json'), "w") as fp:
+    #         json.dump(self.model_return_values, fp, default=self.np_encoder)
+
+    # def load_model_return_values_from_disk(self, dk: FreqaiDataKitchen) -> FreqaiDataKitchen:
+    #     exists = Path(self.full_path /
str('model_return_values.json')).resolve().exists() + # if exists: + # with open(self.full_path / str('model_return_values.json'), "r") as fp: + # self.model_return_values = json.load(fp) + # elif not self.follow_mode: + # logger.info("Could not find existing datadrawer, starting from scratch") + # else: + # logger.warning(f'Follower could not find pair_dictionary at {self.full_path} ' + # 'sending null values back to strategy') + + # return exists, dk diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py new file mode 100644 index 000000000..1f2ee61b3 --- /dev/null +++ b/freqtrade/freqai/data_kitchen.py @@ -0,0 +1,1305 @@ +import copy +import datetime +import json +import logging +import pickle as pk +import shutil +from pathlib import Path +from typing import Any, Dict, List, Tuple + +import numpy as np +import numpy.typing as npt +import pandas as pd +from joblib import dump, load # , Parallel, delayed # used for auto distribution assignment +from pandas import DataFrame +from sklearn import linear_model +from sklearn.metrics.pairwise import pairwise_distances +from sklearn.model_selection import train_test_split + +from freqtrade.configuration import TimeRange +from freqtrade.data.history import load_pair_history +from freqtrade.data.history.history_utils import refresh_backtest_ohlcv_data +from freqtrade.exceptions import OperationalException +from freqtrade.freqai.data_drawer import FreqaiDataDrawer +from freqtrade.resolvers import ExchangeResolver +from freqtrade.strategy.interface import IStrategy + + +SECONDS_IN_DAY = 86400 + +logger = logging.getLogger(__name__) + + +class FreqaiDataKitchen: + """ + Class designed to analyze data for a single pair. Employed by the IFreqaiModel class. + Functionalities include holding, saving, loading, and analyzing the data. 
+ author: Robert Caulk, rob.caulk@gmail.com + """ + + def __init__( + self, + config: Dict[str, Any], + data_drawer: FreqaiDataDrawer, + live: bool = False, + pair: str = "", + ): + self.data: Dict[Any, Any] = {} + self.data_dictionary: Dict[Any, Any] = {} + self.config = config + self.freqai_config = config["freqai"] + self.predictions: npt.ArrayLike = np.array([]) + self.do_predict: npt.ArrayLike = np.array([]) + self.target_mean: npt.ArrayLike = np.array([]) + self.target_std: npt.ArrayLike = np.array([]) + self.full_predictions: npt.ArrayLike = np.array([]) + self.full_do_predict: npt.ArrayLike = np.array([]) + self.full_DI_values: npt.ArrayLike = np.array([]) + self.full_target_mean: npt.ArrayLike = np.array([]) + self.full_target_std: npt.ArrayLike = np.array([]) + self.data_path = Path() + self.label_list: List = [] + self.model_filename: str = "" + self.live = live + self.pair = pair + self.svm_model: linear_model.SGDOneClassSVM = None + self.set_all_pairs() + if not self.live: + self.full_timerange = self.create_fulltimerange( + self.config["timerange"], self.freqai_config.get("train_period") + ) + + (self.training_timeranges, self.backtesting_timeranges) = self.split_timerange( + self.full_timerange, + config["freqai"]["train_period"], + config["freqai"]["backtest_period"], + ) + # self.strat_dataframe: DataFrame = strat_dataframe + self.dd = data_drawer + + def set_paths( + self, + pair: str, + trained_timestamp: int = None, + ) -> None: + """ + Set the paths to the data for the present coin/botloop + :params: + metadata: dict = strategy furnished pair metadata + trained_timestamp: int = timestamp of most recent training + """ + self.full_path = Path( + self.config["user_data_dir"] / "models" / str(self.freqai_config.get("identifier")) + ) + + self.data_path = Path( + self.full_path / str("sub-train" + "-" + pair.split("/")[0] + str(trained_timestamp)) + ) + + return + + def save_data(self, model: Any, coin: str = "", keras_model=False, label=None) -> None: + """ + Saves all data associated with a model for a single sub-train time range + :params: + :model: User trained model which can be reused for inferencing to generate + predictions + """ + + if not self.data_path.is_dir(): + self.data_path.mkdir(parents=True, exist_ok=True) + + save_path = Path(self.data_path) + + # Save the trained model + if not keras_model: + dump(model, save_path / f"{self.model_filename}_model.joblib") + else: + model.save(save_path / f"{self.model_filename}_model.h5") + + if self.svm_model is not None: + dump(self.svm_model, save_path / str(self.model_filename + "_svm_model.joblib")) + + self.data["data_path"] = str(self.data_path) + self.data["model_filename"] = str(self.model_filename) + self.data["training_features_list"] = list(self.data_dictionary["train_features"].columns) + self.data["label_list"] = self.label_list + # store the metadata + with open(save_path / str(self.model_filename + "_metadata.json"), "w") as fp: + json.dump(self.data, fp, default=self.np_encoder) + + # save the train data to file so we can check preds for area of applicability later + self.data_dictionary["train_features"].to_pickle( + save_path / str(self.model_filename + "_trained_df.pkl") + ) + + if self.freqai_config.get("feature_parameters", {}).get("principal_component_analysis"): + pk.dump( + self.pca, open(self.data_path / str(self.model_filename + "_pca_object.pkl"), "wb") + ) + + # if self.live: + self.dd.model_dictionary[self.model_filename] = model + self.dd.pair_dict[coin]["model_filename"] = 
self.model_filename + self.dd.pair_dict[coin]["data_path"] = str(self.data_path) + self.dd.save_drawer_to_disk() + + # TODO add a helper function to let user save/load any data they are custom adding. We + # do not want them having to edit the default save/load methods here. Below is an example + # of what we do NOT want. + + # if self.freqai_config.get('feature_parameters','determine_statistical_distributions'): + # self.data_dictionary["upper_quantiles"].to_pickle( + # save_path / str(self.model_filename + "_upper_quantiles.pkl") + # ) + + # self.data_dictionary["lower_quantiles"].to_pickle( + # save_path / str(self.model_filename + "_lower_quantiles.pkl") + # ) + + return + + def load_data(self, coin: str = "", keras_model=False) -> Any: + """ + loads all data required to make a prediction on a sub-train time range + :returns: + :model: User trained model which can be inferenced for new predictions + """ + + if not self.dd.pair_dict[coin]["model_filename"]: + return None + + if self.live: + self.model_filename = self.dd.pair_dict[coin]["model_filename"] + self.data_path = Path(self.dd.pair_dict[coin]["data_path"]) + if self.freqai_config.get("follow_mode", False): + # follower can be on a different system which is rsynced to the leader: + self.data_path = Path( + self.config["user_data_dir"] + / "models" + / self.data_path.parts[-2] + / self.data_path.parts[-1] + ) + + with open(self.data_path / str(self.model_filename + "_metadata.json"), "r") as fp: + self.data = json.load(fp) + self.training_features_list = self.data["training_features_list"] + self.label_list = self.data["label_list"] + + self.data_dictionary["train_features"] = pd.read_pickle( + self.data_path / str(self.model_filename + "_trained_df.pkl") + ) + + # TODO add a helper function to let user save/load any data they are custom adding. We + # do not want them having to edit the default save/load methods here. Below is an example + # of what we do NOT want. 
+ + # if self.freqai_config.get('feature_parameters','determine_statistical_distributions'): + # self.data_dictionary["upper_quantiles"] = pd.read_pickle( + # self.data_path / str(self.model_filename + "_upper_quantiles.pkl") + # ) + + # self.data_dictionary["lower_quantiles"] = pd.read_pickle( + # self.data_path / str(self.model_filename + "_lower_quantiles.pkl") + # ) + + # self.data_path = Path(self.data["data_path"]) + # self.model_filename = self.data["model_filename"] + + # try to access model in memory instead of loading object from disk to save time + if self.live and self.model_filename in self.dd.model_dictionary: + model = self.dd.model_dictionary[self.model_filename] + elif not keras_model: + model = load(self.data_path / str(self.model_filename + "_model.joblib")) + else: + from tensorflow import keras + + model = keras.models.load_model(self.data_path / str(self.model_filename + "_model.h5")) + + if Path(self.data_path / str(self.model_filename + "_svm_model.joblib")).resolve().exists(): + self.svm_model = load(self.data_path / str(self.model_filename + "_svm_model.joblib")) + + if not model: + raise OperationalException( + f"Unable to load model, ensure model exists at " f"{self.data_path} " + ) + + if self.config["freqai"]["feature_parameters"]["principal_component_analysis"]: + self.pca = pk.load( + open(self.data_path / str(self.model_filename + "_pca_object.pkl"), "rb") + ) + + return model + + def make_train_test_datasets( + self, filtered_dataframe: DataFrame, labels: DataFrame + ) -> Dict[Any, Any]: + """ + Given the dataframe for the full history for training, split the data into + training and test data according to user specified parameters in configuration + file. + :filtered_dataframe: cleaned dataframe ready to be split. + :labels: cleaned labels ready to be split. + """ + + weights: npt.ArrayLike + if self.freqai_config["feature_parameters"].get("weight_factor", 0) > 0: + weights = self.set_weights_higher_recent(len(filtered_dataframe)) + else: + weights = np.ones(len(filtered_dataframe)) + + if self.freqai_config["feature_parameters"].get("stratify", 0) > 0: + stratification = np.zeros(len(filtered_dataframe)) + for i in range(1, len(stratification)): + if i % self.freqai_config.get("feature_parameters", {}).get("stratify", 0) == 0: + stratification[i] = 1 + else: + stratification = None + + ( + train_features, + test_features, + train_labels, + test_labels, + train_weights, + test_weights, + ) = train_test_split( + filtered_dataframe[: filtered_dataframe.shape[0]], + labels, + weights, + stratify=stratification, + # shuffle=False, + **self.config["freqai"]["data_split_parameters"], + ) + + return self.build_data_dictionary( + train_features, test_features, train_labels, test_labels, train_weights, test_weights + ) + + def filter_features( + self, + unfiltered_dataframe: DataFrame, + training_feature_list: List, + label_list: List = list(), + # labels: DataFrame = pd.DataFrame(), + training_filter: bool = True, + ) -> Tuple[DataFrame, DataFrame]: + """ + Filter the unfiltered dataframe to extract the user requested features/labels and properly + remove all NaNs. Any row with a NaN is removed from training dataset or replaced with + 0s in the prediction dataset. However, prediction dataset do_predict will reflect any + row that had a NaN and will shield user from that prediction. 
+ :params: + :unfiltered_dataframe: the full dataframe for the present training period + :training_feature_list: list, the training feature list constructed by + self.build_feature_list() according to user specified parameters in the configuration file. + :labels: the labels for the dataset + :training_filter: boolean which lets the function know if it is training data or + prediction data to be filtered. + :returns: + :filtered_dataframe: dataframe cleaned of NaNs and only containing the user + requested feature set. + :labels: labels cleaned of NaNs. + """ + filtered_dataframe = unfiltered_dataframe.filter(training_feature_list, axis=1) + filtered_dataframe = filtered_dataframe.replace([np.inf, -np.inf], np.nan) + + drop_index = pd.isnull(filtered_dataframe).any(1) # get the rows that have NaNs, + drop_index = drop_index.replace(True, 1).replace(False, 0) # pep8 requirement. + if ( + training_filter + ): # we don't care about total row number (total no. datapoints) in training, we only care + # about removing any row with NaNs + # if labels has multiple columns (user wants to train multiple models), we detect here + labels = unfiltered_dataframe.filter(label_list, axis=1) + drop_index_labels = pd.isnull(labels).any(1) + drop_index_labels = drop_index_labels.replace(True, 1).replace(False, 0) + filtered_dataframe = filtered_dataframe[ + (drop_index == 0) & (drop_index_labels == 0) + ] # dropping values + labels = labels[ + (drop_index == 0) & (drop_index_labels == 0) + ] # assuming the labels depend entirely on the dataframe here. + logger.info( + f"dropped {len(unfiltered_dataframe) - len(filtered_dataframe)} training points" + f" due to NaNs in populated dataset {len(unfiltered_dataframe)}." + ) + if (1 - len(filtered_dataframe) / len(unfiltered_dataframe)) > 0.1 and self.live: + logger.warning( + f" {(1 - len(filtered_dataframe)/len(unfiltered_dataframe)) * 100:.2f} percent" + " of training data dropped due to NaNs, model may perform inconsistent" + "with expectations" + ) + self.data["filter_drop_index_training"] = drop_index + + else: + # we are backtesting so we need to preserve row number to send back to strategy, + # so now we use do_predict to avoid any prediction based on a NaN + drop_index = pd.isnull(filtered_dataframe).any(1) + self.data["filter_drop_index_prediction"] = drop_index + filtered_dataframe.fillna(0, inplace=True) + # replacing all NaNs with zeros to avoid issues in 'prediction', but any prediction + # that was based on a single NaN is ultimately protected from buys with do_predict + drop_index = ~drop_index + self.do_predict = np.array(drop_index.replace(True, 1).replace(False, 0)) + if (len(self.do_predict) - self.do_predict.sum()) > 0: + logger.info( + "dropped %s of %s prediction data points due to NaNs.", + len(self.do_predict) - self.do_predict.sum(), + len(filtered_dataframe), + ) + labels = [] + + return filtered_dataframe, labels + + def build_data_dictionary( + self, + train_df: DataFrame, + test_df: DataFrame, + train_labels: DataFrame, + test_labels: DataFrame, + train_weights: Any, + test_weights: Any, + ) -> Dict: + + self.data_dictionary = { + "train_features": train_df, + "test_features": test_df, + "train_labels": train_labels, + "test_labels": test_labels, + "train_weights": train_weights, + "test_weights": test_weights, + } + + return self.data_dictionary + + def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]: + """ + Normalize all data in the data_dictionary according to the training dataset + :params: + :data_dictionary: 
dictionary containing the cleaned and split training/test data/labels + :returns: + :data_dictionary: updated dictionary with standardized values. + """ + # standardize the data by training stats + train_max = data_dictionary["train_features"].max() + train_min = data_dictionary["train_features"].min() + data_dictionary["train_features"] = ( + 2 * (data_dictionary["train_features"] - train_min) / (train_max - train_min) - 1 + ) + data_dictionary["test_features"] = ( + 2 * (data_dictionary["test_features"] - train_min) / (train_max - train_min) - 1 + ) + + train_labels_max = data_dictionary["train_labels"].max() + train_labels_min = data_dictionary["train_labels"].min() + data_dictionary["train_labels"] = ( + 2 + * (data_dictionary["train_labels"] - train_labels_min) + / (train_labels_max - train_labels_min) + - 1 + ) + data_dictionary["test_labels"] = ( + 2 + * (data_dictionary["test_labels"] - train_labels_min) + / (train_labels_max - train_labels_min) + - 1 + ) + + for item in train_max.keys(): + self.data[item + "_max"] = train_max[item] + self.data[item + "_min"] = train_min[item] + + self.data["labels_max"] = train_labels_max.to_dict() + self.data["labels_min"] = train_labels_min.to_dict() + + return data_dictionary + + def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame: + """ + Normalize a set of data using the mean and standard deviation from + the associated training data. + :params: + :df: Dataframe to be standardized + """ + + for item in df.keys(): + df[item] = ( + 2 + * (df[item] - self.data[item + "_min"]) + / (self.data[item + "_max"] - self.data[item + "_min"]) + - 1 + ) + + return df + + def split_timerange( + self, tr: str, train_split: int = 28, bt_split: int = 7 + ) -> Tuple[list, list]: + """ + Function which takes a single time range (tr) and splits it + into sub timeranges to train and backtest on based on user input + tr: str, full timerange to train on + train_split: the period length for the each training (days). Specified in user + configuration file + bt_split: the backtesting length (dats). 
Specified in user configuration file + """ + + train_period = train_split * SECONDS_IN_DAY + bt_period = bt_split * SECONDS_IN_DAY + + full_timerange = TimeRange.parse_timerange(tr) + config_timerange = TimeRange.parse_timerange(self.config["timerange"]) + if config_timerange.stopts == 0: + config_timerange.stopts = int( + datetime.datetime.now(tz=datetime.timezone.utc).timestamp() + ) + timerange_train = copy.deepcopy(full_timerange) + timerange_backtest = copy.deepcopy(full_timerange) + + tr_training_list = [] + tr_backtesting_list = [] + tr_training_list_timerange = [] + tr_backtesting_list_timerange = [] + first = True + # within_config_timerange = True + while True: + if not first: + timerange_train.startts = timerange_train.startts + bt_period + timerange_train.stopts = timerange_train.startts + train_period + + first = False + start = datetime.datetime.utcfromtimestamp(timerange_train.startts) + stop = datetime.datetime.utcfromtimestamp(timerange_train.stopts) + tr_training_list.append(start.strftime("%Y%m%d") + "-" + stop.strftime("%Y%m%d")) + tr_training_list_timerange.append(copy.deepcopy(timerange_train)) + + # associated backtest period + + timerange_backtest.startts = timerange_train.stopts + + timerange_backtest.stopts = timerange_backtest.startts + bt_period + + if timerange_backtest.stopts > config_timerange.stopts: + timerange_backtest.stopts = config_timerange.stopts + + start = datetime.datetime.utcfromtimestamp(timerange_backtest.startts) + stop = datetime.datetime.utcfromtimestamp(timerange_backtest.stopts) + tr_backtesting_list.append(start.strftime("%Y%m%d") + "-" + stop.strftime("%Y%m%d")) + tr_backtesting_list_timerange.append(copy.deepcopy(timerange_backtest)) + + # ensure we are predicting on exactly same amount of data as requested by user defined + # --timerange + if timerange_backtest.stopts == config_timerange.stopts: + break + + # print(tr_training_list, tr_backtesting_list) + return tr_training_list_timerange, tr_backtesting_list_timerange + + def slice_dataframe(self, timerange: TimeRange, df: DataFrame) -> DataFrame: + """ + Given a full dataframe, extract the user desired window + :params: + :tr: timerange string that we wish to extract from df + :df: Dataframe containing all candles to run the entire backtest. Here + it is sliced down to just the present training period. + """ + # timerange = TimeRange.parse_timerange(tr) + start = datetime.datetime.fromtimestamp(timerange.startts, tz=datetime.timezone.utc) + stop = datetime.datetime.fromtimestamp(timerange.stopts, tz=datetime.timezone.utc) + df = df.loc[df["date"] >= start, :] + df = df.loc[df["date"] <= stop, :] + + return df + + def principal_component_analysis(self) -> None: + """ + Performs Principal Component Analysis on the data for dimensionality reduction + and outlier detection (see self.remove_outliers()) + No parameters or returns, it acts on the data_dictionary held by the DataHandler. 
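+        The number of retained components is chosen as sketched below: the
+        smallest count whose cumulative explained variance ratio reaches
+        ~0.999 of the training set variance, i.e.
+            n_keep = np.argmin(pca.explained_variance_ratio_.cumsum() < 0.999)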
+ """ + + from sklearn.decomposition import PCA # avoid importing if we dont need it + + n_components = self.data_dictionary["train_features"].shape[1] + pca = PCA(n_components=n_components) + pca = pca.fit(self.data_dictionary["train_features"]) + n_keep_components = np.argmin(pca.explained_variance_ratio_.cumsum() < 0.999) + pca2 = PCA(n_components=n_keep_components) + self.data["n_kept_components"] = n_keep_components + pca2 = pca2.fit(self.data_dictionary["train_features"]) + logger.info("reduced feature dimension by %s", n_components - n_keep_components) + logger.info("explained variance %f", np.sum(pca2.explained_variance_ratio_)) + train_components = pca2.transform(self.data_dictionary["train_features"]) + test_components = pca2.transform(self.data_dictionary["test_features"]) + + self.data_dictionary["train_features"] = pd.DataFrame( + data=train_components, + columns=["PC" + str(i) for i in range(0, n_keep_components)], + index=self.data_dictionary["train_features"].index, + ) + + # keeping a copy of the non-transformed features so we can check for errors during + # model load from disk + self.data["training_features_list_raw"] = copy.deepcopy(self.training_features_list) + self.training_features_list = self.data_dictionary["train_features"].columns + + self.data_dictionary["test_features"] = pd.DataFrame( + data=test_components, + columns=["PC" + str(i) for i in range(0, n_keep_components)], + index=self.data_dictionary["test_features"].index, + ) + + self.data["n_kept_components"] = n_keep_components + self.pca = pca2 + + logger.info(f"PCA reduced total features from {n_components} to {n_keep_components}") + + if not self.data_path.is_dir(): + self.data_path.mkdir(parents=True, exist_ok=True) + + return None + + def pca_transform(self, filtered_dataframe: DataFrame) -> None: + """ + Use an existing pca transform to transform data into components + :params: + filtered_dataframe: DataFrame = the cleaned dataframe + """ + pca_components = self.pca.transform(filtered_dataframe) + self.data_dictionary["prediction_features"] = pd.DataFrame( + data=pca_components, + columns=["PC" + str(i) for i in range(0, self.data["n_kept_components"])], + index=filtered_dataframe.index, + ) + + def compute_distances(self) -> float: + """ + Compute distances between each training point and every other training + point. 
This metric defines the neighborhood of trained data and is used + for prediction confidence in the Dissimilarity Index + """ + logger.info("computing average mean distance for all training points") + tc = self.freqai_config.get("model_training_parameters", {}).get("thread_count", -1) + pairwise = pairwise_distances(self.data_dictionary["train_features"], n_jobs=tc) + avg_mean_dist = pairwise.mean(axis=1).mean() + logger.info(f"avg_mean_dist {avg_mean_dist:.2f}") + + return avg_mean_dist + + def use_SVM_to_remove_outliers(self, predict: bool) -> None: + """ + Build/inference a Support Vector Machine to detect outliers + in training data and prediction + :params: + predict: bool = If true, inference an existing SVM model, else construct one + """ + + if predict: + assert self.svm_model, "No svm model available for outlier removal" + y_pred = self.svm_model.predict(self.data_dictionary["prediction_features"]) + do_predict = np.where(y_pred == -1, 0, y_pred) + + if (len(do_predict) - do_predict.sum()) > 0: + logger.info( + f"svm_remove_outliers() tossed {len(do_predict) - do_predict.sum()} predictions" + ) + self.do_predict += do_predict + self.do_predict -= 1 + + else: + # use SGDOneClassSVM to increase speed? + nu = self.freqai_config.get("feature_parameters", {}).get("svm_nu", 0.2) + self.svm_model = linear_model.SGDOneClassSVM(nu=nu).fit( + self.data_dictionary["train_features"] + ) + y_pred = self.svm_model.predict(self.data_dictionary["train_features"]) + dropped_points = np.where(y_pred == -1, 0, y_pred) + # keep_index = np.where(y_pred == 1) + self.data_dictionary["train_features"] = self.data_dictionary["train_features"][ + (y_pred == 1) + ] + self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][ + (y_pred == 1) + ] + self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][ + (y_pred == 1) + ] + + logger.info( + f"svm_remove_outliers() tossed {len(y_pred) - dropped_points.sum()}" + f" train points from {len(y_pred)}" + ) + + # same for test data + y_pred = self.svm_model.predict(self.data_dictionary["test_features"]) + dropped_points = np.where(y_pred == -1, 0, y_pred) + self.data_dictionary["test_features"] = self.data_dictionary["test_features"][ + (y_pred == 1) + ] + self.data_dictionary["test_labels"] = self.data_dictionary["test_labels"][(y_pred == 1)] + self.data_dictionary["test_weights"] = self.data_dictionary["test_weights"][ + (y_pred == 1) + ] + + logger.info( + f"svm_remove_outliers() tossed {len(y_pred) - dropped_points.sum()}" + f" test points from {len(y_pred)}" + ) + + return + + def find_features(self, dataframe: DataFrame) -> None: + """ + Find features in the strategy provided dataframe + :params: + dataframe: DataFrame = strategy provided dataframe + :returns: + features: list = the features to be used for training/prediction + """ + column_names = dataframe.columns + features = [c for c in column_names if "%" in c] + labels = [c for c in column_names if "&" in c] + if not features: + raise OperationalException("Could not find any features!") + + self.training_features_list = features + self.label_list = labels + # return features, labels + + def check_if_pred_in_training_spaces(self) -> None: + """ + Compares the distance from each prediction point to each training data + point. It uses this information to estimate a Dissimilarity Index (DI) + and avoid making predictions on any points that are too far away + from the training data set. 
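+        Roughly, for each prediction point p:
+            DI(p) = min distance(p, train_features) / avg_mean_dist
+        and do_predict is zeroed wherever DI(p) exceeds the user defined
+        DI_threshold.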
+ """ + + distance = pairwise_distances( + self.data_dictionary["train_features"], + self.data_dictionary["prediction_features"], + n_jobs=-1, + ) + + self.DI_values = distance.min(axis=0) / self.data["avg_mean_dist"] + + do_predict = np.where( + self.DI_values < self.freqai_config.get("feature_parameters", {}).get("DI_threshold"), + 1, + 0, + ) + + if (len(do_predict) - do_predict.sum()) > 0: + logger.info( + f"DI tossed {len(do_predict) - do_predict.sum():.2f} predictions for " + "being too far from training data" + ) + + self.do_predict += do_predict + self.do_predict -= 1 + + def set_weights_higher_recent(self, num_weights: int) -> npt.ArrayLike: + """ + Set weights so that recent data is more heavily weighted during + training than older data. + """ + + weights = np.zeros(num_weights) + for i in range(1, len(weights)): + weights[len(weights) - i] = np.exp( + -i / (self.config["freqai"]["feature_parameters"]["weight_factor"] * num_weights) + ) + return weights + + def append_predictions(self, predictions, do_predict, len_dataframe): + """ + Append backtest prediction from current backtest period to all previous periods + """ + + ones = np.ones(len(predictions)) + target_mean, target_std = ones * self.data["target_mean"], ones * self.data["target_std"] + + self.full_predictions = np.append(self.full_predictions, predictions) + self.full_do_predict = np.append(self.full_do_predict, do_predict) + if self.freqai_config.get("feature_parameters", {}).get("DI_threshold", 0) > 0: + self.full_DI_values = np.append(self.full_DI_values, self.DI_values) + self.full_target_mean = np.append(self.full_target_mean, target_mean) + self.full_target_std = np.append(self.full_target_std, target_std) + + return + + def fill_predictions(self, len_dataframe): + """ + Back fill values to before the backtesting range so that the dataframe matches size + when it goes back to the strategy. These rows are not included in the backtest. 
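+        E.g. for a strategy dataframe of length N and M backtested predictions,
+        N - M zeros are prepended to each returned array.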
+ """ + + filler = np.zeros(len_dataframe - len(self.full_predictions)) # startup_candle_count + self.full_predictions = np.append(filler, self.full_predictions) + self.full_do_predict = np.append(filler, self.full_do_predict) + if self.freqai_config.get("feature_parameters", {}).get("DI_threshold", 0) > 0: + self.full_DI_values = np.append(filler, self.full_DI_values) + self.full_target_mean = np.append(filler, self.full_target_mean) + self.full_target_std = np.append(filler, self.full_target_std) + + return + + def create_fulltimerange(self, backtest_tr: str, backtest_period: int) -> str: + backtest_timerange = TimeRange.parse_timerange(backtest_tr) + + if backtest_timerange.stopts == 0: + backtest_timerange.stopts = int( + datetime.datetime.now(tz=datetime.timezone.utc).timestamp() + ) + + backtest_timerange.startts = backtest_timerange.startts - backtest_period * SECONDS_IN_DAY + start = datetime.datetime.utcfromtimestamp(backtest_timerange.startts) + stop = datetime.datetime.utcfromtimestamp(backtest_timerange.stopts) + full_timerange = start.strftime("%Y%m%d") + "-" + stop.strftime("%Y%m%d") + + self.full_path = Path( + self.config["user_data_dir"] / "models" / str(self.freqai_config.get("identifier")) + ) + + config_path = Path(self.config["config_files"][0]) + + if not self.full_path.is_dir(): + self.full_path.mkdir(parents=True, exist_ok=True) + shutil.copy( + config_path.resolve(), + Path(self.full_path / config_path.parts[-1]), + ) + + return full_timerange + + def check_if_model_expired(self, trained_timestamp: int) -> bool: + """ + A model age checker to determine if the model is trustworthy based on user defined + `expiration_hours` in the configuration file. + :params: + trained_timestamp: int = The time of training for the most recent model. + :returns: + bool = If the model is expired or not. 
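+        E.g. with `expiration_hours: 1` in the config, a model trained two
+        hours ago is reported expired; the default of 0 disables the check.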
+ """ + time = datetime.datetime.now(tz=datetime.timezone.utc).timestamp() + elapsed_time = (time - trained_timestamp) / 3600 # hours + max_time = self.freqai_config.get("expiration_hours", 0) + if max_time > 0: + return elapsed_time > max_time + else: + return False + + def check_if_new_training_required( + self, trained_timestamp: int + ) -> Tuple[bool, TimeRange, TimeRange]: + + time = datetime.datetime.now(tz=datetime.timezone.utc).timestamp() + trained_timerange = TimeRange() + data_load_timerange = TimeRange() + + # find the max indicator length required + max_timeframe_chars = self.freqai_config.get("timeframes")[-1] + max_period = self.freqai_config.get("feature_parameters", {}).get( + "indicator_max_period", 50 + ) + additional_seconds = 0 + if max_timeframe_chars[-1] == "d": + additional_seconds = max_period * SECONDS_IN_DAY * int(max_timeframe_chars[-2]) + elif max_timeframe_chars[-1] == "h": + additional_seconds = max_period * 3600 * int(max_timeframe_chars[-2]) + elif max_timeframe_chars[-1] == "m": + if len(max_timeframe_chars) == 2: + additional_seconds = max_period * 60 * int(max_timeframe_chars[-2]) + elif len(max_timeframe_chars) == 3: + additional_seconds = max_period * 60 * int(float(max_timeframe_chars[0:2])) + else: + logger.warning( + "FreqAI could not detect max timeframe and therefore may not " + "download the proper amount of data for training" + ) + + # logger.info(f'Extending data download by {additional_seconds/SECONDS_IN_DAY:.2f} days') + + if trained_timestamp != 0: + elapsed_time = (time - trained_timestamp) / SECONDS_IN_DAY + retrain = elapsed_time > self.freqai_config.get("backtest_period") + if retrain: + trained_timerange.startts = int( + time - self.freqai_config.get("train_period", 0) * SECONDS_IN_DAY + ) + trained_timerange.stopts = int(time) + # we want to load/populate indicators on more data than we plan to train on so + # because most of the indicators have a rolling timeperiod, and are thus NaNs + # unless they have data further back in time before the start of the train period + data_load_timerange.startts = int( + time + - self.freqai_config.get("train_period", 0) * SECONDS_IN_DAY + - additional_seconds + ) + data_load_timerange.stopts = int(time) + else: # user passed no live_trained_timerange in config + trained_timerange.startts = int( + time - self.freqai_config.get("train_period") * SECONDS_IN_DAY + ) + trained_timerange.stopts = int(time) + + data_load_timerange.startts = int( + time + - self.freqai_config.get("train_period", 0) * SECONDS_IN_DAY + - additional_seconds + ) + data_load_timerange.stopts = int(time) + retrain = True + + return retrain, trained_timerange, data_load_timerange + + def set_new_model_names(self, pair: str, trained_timerange: TimeRange): + + coin, _ = pair.split("/") + # set the new data_path + self.data_path = Path( + self.full_path + / str("sub-train" + "-" + pair.split("/")[0] + str(int(trained_timerange.stopts))) + ) + + self.model_filename = "cb_" + coin.lower() + "_" + str(int(trained_timerange.stopts)) + + # self.freqai_config['live_trained_timerange'] = str(int(trained_timerange.stopts)) + # enables persistence, but not fully implemented into save/load data yer + # self.data['live_trained_timerange'] = str(int(trained_timerange.stopts)) + + # SUPERCEDED + # def download_new_data_for_retraining(self, timerange: TimeRange, metadata: dict, + # strategy: IStrategy) -> None: + + # exchange = ExchangeResolver.load_exchange(self.config['exchange']['name'], + # self.config, validate=False, freqai=True) + # # 
exchange = strategy.dp._exchange # closes ccxt session + # pairs = copy.deepcopy(self.freqai_config.get('corr_pairlist', [])) + # if str(metadata['pair']) not in pairs: + # pairs.append(str(metadata['pair'])) + + # refresh_backtest_ohlcv_data( + # exchange, pairs=pairs, timeframes=self.freqai_config.get('timeframes'), + # datadir=self.config['datadir'], timerange=timerange, + # new_pairs_days=self.config['new_pairs_days'], + # erase=False, data_format=self.config.get('dataformat_ohlcv', 'json'), + # trading_mode=self.config.get('trading_mode', 'spot'), + # prepend=self.config.get('prepend_data', False) + # ) + + def download_all_data_for_training(self, timerange: TimeRange) -> None: + """ + Called only once upon start of bot to download the necessary data for + populating indicators and training the model. + :params: + timerange: TimeRange = The full data timerange for populating the indicators + and training the model. + """ + exchange = ExchangeResolver.load_exchange( + self.config["exchange"]["name"], self.config, validate=False, freqai=True + ) + + new_pairs_days = int((timerange.stopts - timerange.startts) / SECONDS_IN_DAY) + + refresh_backtest_ohlcv_data( + exchange, + pairs=self.all_pairs, + timeframes=self.freqai_config.get("timeframes"), + datadir=self.config["datadir"], + timerange=timerange, + new_pairs_days=new_pairs_days, + erase=False, + data_format=self.config.get("dataformat_ohlcv", "json"), + trading_mode=self.config.get("trading_mode", "spot"), + prepend=self.config.get("prepend_data", False), + ) + + def update_historic_data(self, strategy: IStrategy) -> None: + """ + Append new candles to our stores historic data (in memory) so that + we do not need to load candle history from disk and we dont need to + pinging exchange multiple times for the same candle. + :params: + dataframe: DataFrame = strategy provided dataframe + """ + + with self.dd.history_lock: + history_data = self.dd.historic_data + + for pair in self.all_pairs: + for tf in self.freqai_config.get("timeframes"): + + # check if newest candle is already appended + df_dp = strategy.dp.get_pair_dataframe(pair, tf) + if len(df_dp.index) == 0: + continue + if str(history_data[pair][tf].iloc[-1]["date"]) == str( + df_dp.iloc[-1:]["date"].iloc[-1] + ): + continue + + index = ( + df_dp.loc[df_dp["date"] == history_data[pair][tf].iloc[-1]["date"]].index[0] + + 1 + ) + history_data[pair][tf] = pd.concat( + [ + history_data[pair][tf], + strategy.dp.get_pair_dataframe(pair, tf).iloc[index:], + ], + ignore_index=True, + axis=0, + ) + + # logger.info(f'Length of history data {len(history_data[pair][tf])}') + + def set_all_pairs(self) -> None: + + self.all_pairs = copy.deepcopy(self.freqai_config.get("corr_pairlist", [])) + for pair in self.config.get("exchange", "").get("pair_whitelist"): + if pair not in self.all_pairs: + self.all_pairs.append(pair) + + def load_all_pair_histories(self, timerange: TimeRange) -> None: + """ + Load pair histories for all whitelist and corr_pairlist pairs. + Only called once upon startup of bot. 
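+        Histories are held in memory in self.dd.historic_data, keyed first by
+        pair and then by timeframe, e.g. historic_data["BTC/USDT"]["5m"].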
+ :params: + timerange: TimeRange = full timerange required to populate all indicators + for training according to user defined train_period + """ + history_data = self.dd.historic_data + + for pair in self.all_pairs: + if pair not in history_data: + history_data[pair] = {} + for tf in self.freqai_config.get("timeframes"): + history_data[pair][tf] = load_pair_history( + datadir=self.config["datadir"], + timeframe=tf, + pair=pair, + timerange=timerange, + data_format=self.config.get("dataformat_ohlcv", "json"), + candle_type=self.config.get("trading_mode", "spot"), + ) + + def get_base_and_corr_dataframes( + self, timerange: TimeRange, pair: str + ) -> Tuple[Dict[Any, Any], Dict[Any, Any]]: + """ + Searches through our historic_data in memory and returns the dataframes relevant + to the present pair. + :params: + timerange: TimeRange = full timerange required to populate all indicators + for training according to user defined train_period + metadata: dict = strategy furnished pair metadata + """ + + with self.dd.history_lock: + corr_dataframes: Dict[Any, Any] = {} + base_dataframes: Dict[Any, Any] = {} + historic_data = self.dd.historic_data + pairs = self.freqai_config.get("corr_pairlist", []) + + for tf in self.freqai_config.get("timeframes"): + base_dataframes[tf] = self.slice_dataframe(timerange, historic_data[pair][tf]) + if pairs: + for p in pairs: + if pair in p: + continue # dont repeat anything from whitelist + if p not in corr_dataframes: + corr_dataframes[p] = {} + corr_dataframes[p][tf] = self.slice_dataframe( + timerange, historic_data[p][tf] + ) + + return corr_dataframes, base_dataframes + + # SUPERCEDED + # def load_pairs_histories(self, timerange: TimeRange, metadata: dict) -> Tuple[Dict[Any, Any], + # DataFrame]: + # corr_dataframes: Dict[Any, Any] = {} + # base_dataframes: Dict[Any, Any] = {} + # pairs = self.freqai_config.get('corr_pairlist', []) # + [metadata['pair']] + # # timerange = TimeRange.parse_timerange(new_timerange) + + # for tf in self.freqai_config.get('timeframes'): + # base_dataframes[tf] = load_pair_history(datadir=self.config['datadir'], + # timeframe=tf, + # pair=metadata['pair'], timerange=timerange, + # data_format=self.config.get( + # 'dataformat_ohlcv', 'json'), + # candle_type=self.config.get( + # 'trading_mode', 'spot')) + # if pairs: + # for p in pairs: + # if metadata['pair'] in p: + # continue # dont repeat anything from whitelist + # if p not in corr_dataframes: + # corr_dataframes[p] = {} + # corr_dataframes[p][tf] = load_pair_history(datadir=self.config['datadir'], + # timeframe=tf, + # pair=p, timerange=timerange, + # data_format=self.config.get( + # 'dataformat_ohlcv', 'json'), + # candle_type=self.config.get( + # 'trading_mode', 'spot')) + + # return corr_dataframes, base_dataframes + + def use_strategy_to_populate_indicators( + self, strategy: IStrategy, corr_dataframes: dict, base_dataframes: dict, pair: str + ) -> DataFrame: + """ + Use the user defined strategy for populating indicators during + retrain + :params: + strategy: IStrategy = user defined strategy object + corr_dataframes: dict = dict containing the informative pair dataframes + (for user defined timeframes) + base_dataframes: dict = dict containing the current pair dataframes + (for user defined timeframes) + metadata: dict = strategy furnished pair metadata + :returns: + dataframe: DataFrame = dataframe containing populated indicators + """ + dataframe = base_dataframes[self.config["timeframe"]].copy() + pairs = self.freqai_config.get("corr_pairlist", []) + sgi = True + 
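# set_generalized_indicators (sgi) is only True on the first timeframe
+        # iteration so that generalized indicators/labels created inside
+        # populate_any_indicators() are added exactly once
+        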
for tf in self.freqai_config.get("timeframes"): + dataframe = strategy.populate_any_indicators( + pair, + pair, + dataframe.copy(), + tf, + base_dataframes[tf], + coin=pair.split("/")[0] + "-", + set_generalized_indicators=sgi, + ) + sgi = False + if pairs: + for i in pairs: + if pair in i: + continue # dont repeat anything from whitelist + dataframe = strategy.populate_any_indicators( + pair, + i, + dataframe.copy(), + tf, + corr_dataframes[i][tf], + coin=i.split("/")[0] + "-", + ) + + return dataframe + + def fit_labels(self) -> None: + """ + Fit the labels with a gaussian distribution + """ + import scipy as spy + + self.data["labels_mean"], self.data["labels_std"] = {}, {} + for label in self.label_list: + f = spy.stats.norm.fit(self.data_dictionary["train_labels"][label]) + self.data["labels_mean"][label], self.data["labels_std"][label] = f[0], f[1] + + # KEEPME incase we want to let user start to grab quantiles. + # upper_q = spy.stats.norm.ppf(self.freqai_config['feature_parameters'][ + # 'target_quantile'], *f) + # lower_q = spy.stats.norm.ppf(1 - self.freqai_config['feature_parameters'][ + # 'target_quantile'], *f) + # self.data["upper_quantile"] = upper_q + # self.data["lower_quantile"] = lower_q + return + + def np_encoder(self, object): + if isinstance(object, np.generic): + return object.item() + + # Functions containing useful data manpulation examples. but not actively in use. + + # def build_feature_list(self, config: dict, metadata: dict) -> list: + # """ + # SUPERCEDED BY self.find_features() + # Build the list of features that will be used to filter + # the full dataframe. Feature list is construced from the + # user configuration file. + # :params: + # :config: Canonical freqtrade config file containing all + # user defined input in config['freqai] dictionary. + # """ + # features = [] + # for tf in config["freqai"]["timeframes"]: + # for ft in config["freqai"]["base_features"]: + # for n in range(config["freqai"]["feature_parameters"]["shift"] + 1): + # shift = "" + # if n > 0: + # shift = "_shift-" + str(n) + # features.append(metadata['pair'].split("/")[0] + "-" + ft + shift + "_" + tf) + # for p in config["freqai"]["corr_pairlist"]: + # if metadata['pair'] in p: + # continue # avoid duplicate features + # features.append(p.split("/")[0] + "-" + ft + shift + "_" + tf) + + # # logger.info("number of features %s", len(features)) + # return features + + # Possibly phasing these outlier removal methods below out in favor of + # use_SVM_to_remove_outliers (computationally more efficient and apparently higher performance). + # But these have good data manipulation examples, so keep them commented here for now. 
+ + # def determine_statistical_distributions(self) -> None: + # from fitter import Fitter + + # logger.info('Determining best model for all features, may take some time') + + # def compute_quantiles(ft): + # f = Fitter(self.data_dictionary["train_features"][ft], + # distributions=['gamma', 'cauchy', 'laplace', + # 'beta', 'uniform', 'lognorm']) + # f.fit() + # # f.summary() + # dist = list(f.get_best().items())[0][0] + # params = f.get_best()[dist] + # upper_q = getattr(spy.stats, list(f.get_best().items())[0][0]).ppf(0.999, **params) + # lower_q = getattr(spy.stats, list(f.get_best().items())[0][0]).ppf(0.001, **params) + + # return ft, upper_q, lower_q, dist + + # quantiles_tuple = Parallel(n_jobs=-1)( + # delayed(compute_quantiles)(ft) for ft in self.data_dictionary[ + # 'train_features'].columns) + + # df = pd.DataFrame(quantiles_tuple, columns=['features', 'upper_quantiles', + # 'lower_quantiles', 'dist']) + # self.data_dictionary['upper_quantiles'] = df['upper_quantiles'] + # self.data_dictionary['lower_quantiles'] = df['lower_quantiles'] + + # return + + # def remove_outliers(self, predict: bool) -> None: + # """ + # Remove data that looks like an outlier based on the distribution of each + # variable. + # :params: + # :predict: boolean which tells the function if this is prediction data or + # training data coming in. + # """ + + # lower_quantile = self.data_dictionary["lower_quantiles"].to_numpy() + # upper_quantile = self.data_dictionary["upper_quantiles"].to_numpy() + + # if predict: + + # df = self.data_dictionary["prediction_features"][ + # (self.data_dictionary["prediction_features"] < upper_quantile) + # & (self.data_dictionary["prediction_features"] > lower_quantile) + # ] + # drop_index = pd.isnull(df).any(1) + # self.data_dictionary["prediction_features"].fillna(0, inplace=True) + # drop_index = ~drop_index + # do_predict = np.array(drop_index.replace(True, 1).replace(False, 0)) + + # logger.info( + # "remove_outliers() tossed %s predictions", + # len(do_predict) - do_predict.sum(), + # ) + # self.do_predict += do_predict + # self.do_predict -= 1 + + # else: + + # filter_train_df = self.data_dictionary["train_features"][ + # (self.data_dictionary["train_features"] < upper_quantile) + # & (self.data_dictionary["train_features"] > lower_quantile) + # ] + # drop_index = pd.isnull(filter_train_df).any(1) + # drop_index = drop_index.replace(True, 1).replace(False, 0) + # self.data_dictionary["train_features"] = self.data_dictionary["train_features"][ + # (drop_index == 0) + # ] + # self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][ + # (drop_index == 0) + # ] + # self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][ + # (drop_index == 0) + # ] + + # logger.info( + # f'remove_outliers() tossed {drop_index.sum()}' + # f' training points from {len(filter_train_df)}' + # ) + + # # do the same for the test data + # filter_test_df = self.data_dictionary["test_features"][ + # (self.data_dictionary["test_features"] < upper_quantile) + # & (self.data_dictionary["test_features"] > lower_quantile) + # ] + # drop_index = pd.isnull(filter_test_df).any(1) + # drop_index = drop_index.replace(True, 1).replace(False, 0) + # self.data_dictionary["test_labels"] = self.data_dictionary["test_labels"][ + # (drop_index == 0) + # ] + # self.data_dictionary["test_features"] = self.data_dictionary["test_features"][ + # (drop_index == 0) + # ] + # self.data_dictionary["test_weights"] = self.data_dictionary["test_weights"][ + # (drop_index == 0) + # ] + 
+ # logger.info( + # f'remove_outliers() tossed {drop_index.sum()}' + # f' test points from {len(filter_test_df)}' + # ) + + # return + + # def standardize_data(self, data_dictionary: Dict) -> Dict[Any, Any]: + # """ + # standardize all data in the data_dictionary according to the training dataset + # :params: + # :data_dictionary: dictionary containing the cleaned and split training/test data/labels + # :returns: + # :data_dictionary: updated dictionary with standardized values. + # """ + # # standardize the data by training stats + # train_mean = data_dictionary["train_features"].mean() + # train_std = data_dictionary["train_features"].std() + # data_dictionary["train_features"] = ( + # data_dictionary["train_features"] - train_mean + # ) / train_std + # data_dictionary["test_features"] = ( + # data_dictionary["test_features"] - train_mean + # ) / train_std + + # train_labels_std = data_dictionary["train_labels"].std() + # train_labels_mean = data_dictionary["train_labels"].mean() + # data_dictionary["train_labels"] = ( + # data_dictionary["train_labels"] - train_labels_mean + # ) / train_labels_std + # data_dictionary["test_labels"] = ( + # data_dictionary["test_labels"] - train_labels_mean + # ) / train_labels_std + + # for item in train_std.keys(): + # self.data[item + "_std"] = train_std[item] + # self.data[item + "_mean"] = train_mean[item] + + # self.data["labels_std"] = train_labels_std + # self.data["labels_mean"] = train_labels_mean + + # return data_dictionary + + # def standardize_data_from_metadata(self, df: DataFrame) -> DataFrame: + # """ + # Normalizes a set of data using the mean and standard deviation from + # the associated training data. + # :params: + # :df: Dataframe to be standardized + # """ + + # for item in df.keys(): + # df[item] = (df[item] - self.data[item + "_mean"]) / self.data[item + "_std"] + + # return df diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py new file mode 100644 index 000000000..b5ae78549 --- /dev/null +++ b/freqtrade/freqai/freqai_interface.py @@ -0,0 +1,559 @@ +# import contextlib +import datetime +import gc +import logging +import shutil +import threading +import time +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Any, Dict, Tuple + +import numpy as np +import numpy.typing as npt +import pandas as pd +from pandas import DataFrame + +from freqtrade.configuration import TimeRange +from freqtrade.enums import RunMode +from freqtrade.exceptions import OperationalException +from freqtrade.freqai.data_drawer import FreqaiDataDrawer +from freqtrade.freqai.data_kitchen import FreqaiDataKitchen +from freqtrade.strategy.interface import IStrategy + + +pd.options.mode.chained_assignment = None +logger = logging.getLogger(__name__) + + +def threaded(fn): + def wrapper(*args, **kwargs): + threading.Thread(target=fn, args=args, kwargs=kwargs).start() + + return wrapper + + +class IFreqaiModel(ABC): + """ + Class containing all tools for training and prediction in the strategy. + User models should inherit from this class as shown in + templates/ExamplePredictionModel.py where the user overrides + train(), predict(), fit(), and make_labels(). 
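+
+    A minimal subclass sketch (illustrative only; the class name is hypothetical):
+
+        class MyPredictionModel(IFreqaiModel):
+            def train(self, unfiltered_dataframe, pair, dk): ...
+            def fit(self, data_dictionary): ...
+            def predict(self, dataframe, dk, first=True): ...
+            def return_values(self, dataframe, dk): ...
+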
+    Author: Robert Caulk, rob.caulk@gmail.com
+    """
+
+    def __init__(self, config: Dict[str, Any]) -> None:
+
+        self.config = config
+        self.assert_config(self.config)
+        self.freqai_info = config["freqai"]
+        self.data_split_parameters = config.get("freqai", {}).get("data_split_parameters")
+        self.model_training_parameters = config.get("freqai", {}).get("model_training_parameters")
+        self.feature_parameters = config.get("freqai", {}).get("feature_parameters")
+        self.time_last_trained = None
+        self.current_time = None
+        self.model = None
+        self.predictions = None
+        self.training_on_separate_thread = False
+        self.retrain = False
+        self.first = True
+        self.update_historic_data = 0
+        self.set_full_path()
+        self.follow_mode = self.freqai_info.get("follow_mode", False)
+        self.dd = FreqaiDataDrawer(Path(self.full_path), self.config, self.follow_mode)
+        self.lock = threading.Lock()
+        self.identifier = self.freqai_info.get("identifier", "no_id_provided")
+        self.scanning = False
+        self.ready_to_scan = False
+        self.keras = self.freqai_info.get("keras", False)
+        self.CONV_WIDTH = self.freqai_info.get("conv_width", 2)
+
+    def assert_config(self, config: Dict[str, Any]) -> None:
+
+        if not config.get("freqai", {}):
+            raise OperationalException("No freqai parameters found in configuration file.")
+
+    def start(self, dataframe: DataFrame, metadata: dict, strategy: IStrategy) -> DataFrame:
+        """
+        Entry point to the FreqaiModel from a specific pair; it will train a new model if
+        necessary before making the prediction.
+
+        :params:
+        :dataframe: Full dataframe coming from strategy - it contains entire
+        backtesting timerange + additional historical data necessary to train
+        the model.
+        :metadata: pair metadata coming from strategy.
+        """
+
+        self.live = strategy.dp.runmode in (RunMode.DRY_RUN, RunMode.LIVE)
+        self.dd.set_pair_dict_info(metadata)
+
+        if self.live:
+            self.dk = FreqaiDataKitchen(self.config, self.dd, self.live, metadata["pair"])
+            dk = self.start_live(dataframe, metadata, strategy, self.dk)
+
+        # For backtesting, each pair enters and then gets trained for each window along the
+        # sliding window defined by "train_period" (training window) and "backtest_period"
+        # (backtest window, i.e. window immediately following the training window).
+        # FreqAI slides the window and sequentially builds the backtesting results before returning
+        # the concatenated results for the full backtesting period back to the strategy.
+        elif not self.follow_mode:
+            self.dk = FreqaiDataKitchen(self.config, self.dd, self.live, metadata["pair"])
+            logger.info(f"Training {len(self.dk.training_timeranges)} timeranges")
+            dk = self.start_backtesting(dataframe, metadata, self.dk)
+
+        dataframe = self.remove_features_from_df(dk.return_dataframe)
+        return self.return_values(dataframe, dk)
+
+    @threaded
+    def start_scanning(self, strategy: IStrategy) -> None:
+        """
+        Function designed to constantly scan pairs for retraining on a separate thread (intracandle)
+        to improve model youth. This function is agnostic to data preparation/collection/storage;
+        it simply trains on whatever data is available in the self.dd. 
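+        Only the pair currently holding priority 1 in the pair dictionary is
+        considered for retraining on each pass of the loop.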
+ :params: + strategy: IStrategy = The user defined strategy class + """ + while 1: + time.sleep(1) + for pair in self.config.get("exchange", {}).get("pair_whitelist"): + + (_, trained_timestamp, _, _) = self.dd.get_pair_dict_info(pair) + + if self.dd.pair_dict[pair]["priority"] != 1: + continue + dk = FreqaiDataKitchen(self.config, self.dd, self.live, pair) + + # file_exists = False + + dk.set_paths(pair, trained_timestamp) + # file_exists = self.model_exists(pair, + # dk, + # trained_timestamp=trained_timestamp, + # model_filename=model_filename, + # scanning=True) + + ( + retrain, + new_trained_timerange, + data_load_timerange, + ) = dk.check_if_new_training_required(trained_timestamp) + dk.set_paths(pair, new_trained_timerange.stopts) + + if retrain: # or not file_exists: + self.train_model_in_series( + new_trained_timerange, pair, strategy, dk, data_load_timerange + ) + + def start_backtesting( + self, dataframe: DataFrame, metadata: dict, dk: FreqaiDataKitchen + ) -> FreqaiDataKitchen: + """ + The main broad execution for backtesting. For backtesting, each pair enters and then gets + trained for each window along the sliding window defined by "train_period" (training window) + and "backtest_period" (backtest window, i.e. window immediately following the + training window). FreqAI slides the window and sequentially builds the backtesting results + before returning the concatenated results for the full backtesting period back to the + strategy. + :params: + dataframe: DataFrame = strategy passed dataframe + metadata: Dict = pair metadata + dk: FreqaiDataKitchen = Data management/analysis tool assoicated to present pair only + :returns: + dk: FreqaiDataKitchen = Data management/analysis tool assoicated to present pair only + """ + + # Loop enforcing the sliding window training/backtesting paradigm + # tr_train is the training time range e.g. 1 historical month + # tr_backtest is the backtesting time range e.g. the week directly + # following tr_train. 
Both of these windows slide through the + # entire backtest + for tr_train, tr_backtest in zip(dk.training_timeranges, dk.backtesting_timeranges): + (_, _, _, _) = self.dd.get_pair_dict_info(metadata["pair"]) + gc.collect() + dk.data = {} # clean the pair specific data between training window sliding + self.training_timerange = tr_train + # self.training_timerange_timerange = tr_train + dataframe_train = dk.slice_dataframe(tr_train, dataframe) + dataframe_backtest = dk.slice_dataframe(tr_backtest, dataframe) + + trained_timestamp = tr_train # TimeRange.parse_timerange(tr_train) + tr_train_startts_str = datetime.datetime.utcfromtimestamp(tr_train.startts).strftime( + "%Y-%m-%d %H:%M:%S" + ) + tr_train_stopts_str = datetime.datetime.utcfromtimestamp(tr_train.stopts).strftime( + "%Y-%m-%d %H:%M:%S" + ) + logger.info("Training %s", metadata["pair"]) + logger.info(f"Training {tr_train_startts_str} to {tr_train_stopts_str}") + + dk.data_path = Path( + dk.full_path + / str( + "sub-train" + + "-" + + metadata["pair"].split("/")[0] + + str(int(trained_timestamp.stopts)) + ) + ) + if not self.model_exists( + metadata["pair"], dk, trained_timestamp=trained_timestamp.stopts + ): + self.model = self.train(dataframe_train, metadata["pair"], dk) + self.dd.pair_dict[metadata["pair"]]["trained_timestamp"] = trained_timestamp.stopts + dk.set_new_model_names(metadata["pair"], trained_timestamp) + dk.save_data(self.model, metadata["pair"], keras_model=self.keras) + else: + self.model = dk.load_data(metadata["pair"], keras_model=self.keras) + + self.check_if_feature_list_matches_strategy(dataframe_train, dk) + + preds, do_preds = self.predict(dataframe_backtest, dk) + + dk.append_predictions(preds, do_preds, len(dataframe_backtest)) + print("predictions", len(dk.full_predictions), "do_predict", len(dk.full_do_predict)) + + dk.fill_predictions(len(dataframe)) + + return dk + + def start_live( + self, dataframe: DataFrame, metadata: dict, strategy: IStrategy, dk: FreqaiDataKitchen + ) -> FreqaiDataKitchen: + """ + The main broad execution for dry/live. This function will check if a retraining should be + performed, and if so, retrain and reset the model. + :params: + dataframe: DataFrame = strategy passed dataframe + metadata: Dict = pair metadata + strategy: IStrategy = currently employed strategy + dk: FreqaiDataKitchen = Data management/analysis tool assoicated to present pair only + :returns: + dk: FreqaiDataKitchen = Data management/analysis tool assoicated to present pair only + """ + + # update follower + if self.follow_mode: + self.dd.update_follower_metadata() + + # get the model metadata associated with the current pair + (_, trained_timestamp, _, return_null_array) = self.dd.get_pair_dict_info(metadata["pair"]) + + # if the metadata doesnt exist, the follower returns null arrays to strategy + if self.follow_mode and return_null_array: + logger.info("Returning null array from follower to strategy") + self.dd.return_null_values_to_strategy(dataframe, dk) + return dk + + # append the historic data once per round + if self.dd.historic_data: + dk.update_historic_data(strategy) + logger.debug(f'Updating historic data on pair {metadata["pair"]}') + + # if trainable, check if model needs training, if so compute new timerange, + # then save model and metadata. 
+        # if not trainable, load existing data
+        if not self.follow_mode:
+
+            (_, new_trained_timerange, data_load_timerange) = dk.check_if_new_training_required(
+                trained_timestamp
+            )
+            dk.set_paths(metadata["pair"], new_trained_timerange.stopts)
+
+            # download candle history if it is not already in memory
+            if not self.dd.historic_data:
+                logger.info(
+                    "Downloading all training data for all pairs in whitelist and "
+                    "corr_pairlist, this may take a while if you do not have the "
+                    "data saved"
+                )
+                dk.download_all_data_for_training(data_load_timerange)
+                dk.load_all_pair_histories(data_load_timerange)
+
+            if not self.scanning:
+                self.scanning = True
+                self.start_scanning(strategy)
+
+        elif self.follow_mode:
+            dk.set_paths(metadata["pair"], trained_timestamp)
+            logger.info(
+                "FreqAI instance set to follow_mode, finding existing pair "
+                f"using {self.identifier}"
+            )
+
+        # load the model and associated data into the data kitchen
+        self.model = dk.load_data(coin=metadata["pair"], keras_model=self.keras)
+
+        if not self.model:
+            logger.warning("No model ready, returning null values to strategy.")
+            self.dd.return_null_values_to_strategy(dataframe, dk)
+            return dk
+
+        # ensure user is feeding the correct indicators to the model
+        self.check_if_feature_list_matches_strategy(dataframe, dk)
+
+        self.build_strategy_return_arrays(dataframe, dk, metadata["pair"], trained_timestamp)
+
+        return dk
+
+    def build_strategy_return_arrays(
+        self, dataframe: DataFrame, dk: FreqaiDataKitchen, pair: str, trained_timestamp: int
+    ) -> None:
+
+        # hold the historical predictions in memory so we are sending back
+        # the correct array to the strategy
+
+        if pair not in self.dd.model_return_values:
+            pred_df, do_preds = self.predict(dataframe, dk)
+            self.dd.set_initial_return_values(pair, dk, pred_df, do_preds)
+            dk.return_dataframe = self.dd.attach_return_values_to_return_dataframe(pair, dataframe)
+            return
+        elif self.dk.check_if_model_expired(trained_timestamp):
+            pred_df = DataFrame(np.zeros((2, len(dk.label_list))), columns=dk.label_list)
+            do_preds, dk.DI_values = np.ones(2) * 2, np.zeros(2)
+            logger.warning(
+                "Model expired, returning null values to strategy. Strategy "
+                "construction should take care to consider this event with "
+                "prediction == 0 and do_predict == 2"
+            )
+        else:
+            # Only feed in the most recent candle for prediction in the live scenario
+            pred_df, do_preds = self.predict(dataframe.iloc[-self.CONV_WIDTH:], dk, first=False)
+
+        self.dd.append_model_predictions(pair, pred_df, do_preds, dk, len(dataframe))
+        dk.return_dataframe = self.dd.attach_return_values_to_return_dataframe(pair, dataframe)
+
+        return
+
+    def check_if_feature_list_matches_strategy(
+        self, dataframe: DataFrame, dk: FreqaiDataKitchen
+    ) -> None:
+        """
+        Ensure user is passing the proper feature set if they are reusing an `identifier` pointing
+        to a folder holding existing models.
+        :params:
+        dataframe: DataFrame = strategy provided dataframe
+        dk: FreqaiDataKitchen = non-persistent data container/analyzer for current coin/bot loop
+        """
+        dk.find_features(dataframe)
+        if "training_features_list_raw" in dk.data:
+            feature_list = dk.data["training_features_list_raw"]
+        else:
+            feature_list = dk.training_features_list
+        if dk.training_features_list != feature_list:
+            raise OperationalException(
+                "Trying to access pretrained model with `identifier` "
+                "but found different features furnished by the current strategy. "
+                "Change `identifier` to train from scratch, or ensure the "
+                "strategy is furnishing the same features as the pretrained "
+                "model"
+            )
+
+    def data_cleaning_train(self, dk: FreqaiDataKitchen) -> None:
+        """
+        Base data cleaning method for train.
+        Any function inside this method should drop training data points from the
+        filtered_dataframe based on user decided logic. See
+        FreqaiDataKitchen::remove_outliers() for an example of how outlier data points
+        are dropped from the dataframe used for training.
+        """
+
+        if self.freqai_info.get("feature_parameters", {}).get("principal_component_analysis"):
+            dk.principal_component_analysis()
+
+        if self.freqai_info.get("feature_parameters", {}).get("use_SVM_to_remove_outliers"):
+            dk.use_SVM_to_remove_outliers(predict=False)
+
+        if self.freqai_info.get("feature_parameters", {}).get("DI_threshold"):
+            dk.data["avg_mean_dist"] = dk.compute_distances()
+
+        # if self.feature_parameters["determine_statistical_distributions"]:
+        #     dk.determine_statistical_distributions()
+        # if self.feature_parameters["remove_outliers"]:
+        #     dk.remove_outliers(predict=False)
+
+    def data_cleaning_predict(self, dk: FreqaiDataKitchen, dataframe: DataFrame) -> None:
+        """
+        Base data cleaning method for predict.
+        These functions each modify dk.do_predict, an array with length equal to the
+        number of candles coming from and returning to the strategy. Inside do_predict,
+        1 allows prediction and < 0 signals to the strategy that the model is not confident in
+        the prediction.
+        See FreqaiDataKitchen::remove_outliers() for an example
+        of how the do_predict vector is modified. do_predict is ultimately passed back to the
+        strategy for buy signals.
+        """
+        if self.freqai_info.get("feature_parameters", {}).get("principal_component_analysis"):
+            dk.pca_transform(dataframe)
+
+        if self.freqai_info.get("feature_parameters", {}).get("use_SVM_to_remove_outliers"):
+            dk.use_SVM_to_remove_outliers(predict=True)
+
+        if self.freqai_info.get("feature_parameters", {}).get("DI_threshold"):
+            dk.check_if_pred_in_training_spaces()
+
+        # if self.feature_parameters["determine_statistical_distributions"]:
+        #     dk.determine_statistical_distributions()
+        # if self.feature_parameters["remove_outliers"]:
+        #     dk.remove_outliers(predict=True)  # creates dropped index
+
+    def model_exists(
+        self,
+        pair: str,
+        dk: FreqaiDataKitchen,
+        trained_timestamp: int = None,
+        model_filename: str = "",
+        scanning: bool = False,
+    ) -> bool:
+        """
+        Given a pair and path, check if a model already exists
+        :param pair: pair e.g. 
BTC/USD + :param path: path to model + """ + coin, _ = pair.split("/") + + if not self.live: + dk.model_filename = model_filename = "cb_" + coin.lower() + "_" + str(trained_timestamp) + + path_to_modelfile = Path(dk.data_path / str(model_filename + "_model.joblib")) + file_exists = path_to_modelfile.is_file() + if file_exists and not scanning: + logger.info("Found model at %s", dk.data_path / dk.model_filename) + elif not scanning: + logger.info("Could not find model at %s", dk.data_path / dk.model_filename) + return file_exists + + def set_full_path(self) -> None: + self.full_path = Path( + self.config["user_data_dir"] / "models" / str(self.freqai_info.get("identifier")) + ) + self.full_path.mkdir(parents=True, exist_ok=True) + shutil.copy( + self.config["config_files"][0], + Path(self.full_path, Path(self.config["config_files"][0]).name), + ) + + def remove_features_from_df(self, dataframe: DataFrame) -> DataFrame: + """ + Remove the features from the dataframe before returning it to strategy. This keeps it + compact for Frequi purposes. + """ + to_keep = [ + col for col in dataframe.columns if not col.startswith("%") or col.startswith("%%") + ] + return dataframe[to_keep] + + def train_model_in_series( + self, + new_trained_timerange: TimeRange, + pair: str, + strategy: IStrategy, + dk: FreqaiDataKitchen, + data_load_timerange: TimeRange, + ): + """ + Retreive data and train model in single threaded mode (only used if model directory is empty + upon startup for dry/live ) + :params: + new_trained_timerange: TimeRange = the timerange to train the model on + metadata: dict = strategy provided metadata + strategy: IStrategy = user defined strategy object + dk: FreqaiDataKitchen = non-persistent data container for current coin/loop + data_load_timerange: TimeRange = the amount of data to be loaded for populate_any_indicators + (larger than new_trained_timerange so that new_trained_timerange does not contain any NaNs) + """ + + corr_dataframes, base_dataframes = dk.get_base_and_corr_dataframes( + data_load_timerange, pair + ) + + unfiltered_dataframe = dk.use_strategy_to_populate_indicators( + strategy, corr_dataframes, base_dataframes, pair + ) + + unfiltered_dataframe = dk.slice_dataframe(new_trained_timerange, unfiltered_dataframe) + + # find the features indicated by strategy and store in datakitchen + dk.find_features(unfiltered_dataframe) + + model = self.train(unfiltered_dataframe, pair, dk) + + self.dd.pair_dict[pair]["trained_timestamp"] = new_trained_timerange.stopts + dk.set_new_model_names(pair, new_trained_timerange) + self.dd.pair_dict[pair]["first"] = False + if self.dd.pair_dict[pair]["priority"] == 1 and self.scanning: + with self.lock: + self.dd.pair_to_end_of_training_queue(pair) + dk.save_data(model, coin=pair, keras_model=self.keras) + + if self.freqai_info.get("purge_old_models", False): + self.dd.purge_old_models() + # self.retrain = False + + # Following methods which are overridden by user made prediction models. + # See freqai/prediction_models/CatboostPredictionModlel.py for an example. + + @abstractmethod + def train(self, unfiltered_dataframe: DataFrame, pair: str, dk: FreqaiDataKitchen) -> Any: + """ + Filter the training data and train a model to it. Train makes heavy use of the datahandler + for storing, saving, loading, and analyzing the data. + :params: + :unfiltered_dataframe: Full dataframe for the current training period + :metadata: pair metadata from strategy. 
+ :returns: + :model: Trained model which can be used to inference (self.predict) + """ + + @abstractmethod + def fit(self) -> Any: + """ + Most regressors use the same function names and arguments e.g. user + can drop in LGBMRegressor in place of CatBoostRegressor and all data + management will be properly handled by Freqai. + :params: + data_dictionary: Dict = the dictionary constructed by DataHandler to hold + all the training and test data/labels. + """ + + return + + @abstractmethod + def predict( + self, dataframe: DataFrame, dk: FreqaiDataKitchen, first: bool = True + ) -> Tuple[DataFrame, npt.ArrayLike]: + """ + Filter the prediction features data and predict with it. + :param: + unfiltered_dataframe: Full dataframe for the current backtest period. + dk: FreqaiDataKitchen = Data management/analysis tool assoicated to present pair only + :return: + :predictions: np.array of predictions + :do_predict: np.array of 1s and 0s to indicate places where freqai needed to remove + data (NaNs) or felt uncertain about data (i.e. SVM and/or DI index) + """ + + def make_labels(self, dataframe: DataFrame, dk: FreqaiDataKitchen) -> DataFrame: + """ + User defines the labels here (target values). + :params: + dataframe: DataFrame = the full dataframe for the present training period + dk: FreqaiDataKitchen = Data management/analysis tool assoicated to present pair only + """ + + return + + @abstractmethod + def return_values(self, dataframe: DataFrame, dk: FreqaiDataKitchen) -> DataFrame: + """ + User defines the dataframe to be returned to strategy here. + :params: + dataframe: DataFrame = the full dataframe for the current prediction (live) + or --timerange (backtesting) + dk: FreqaiDataKitchen = Data management/analysis tool assoicated to present pair only + :returns: + dataframe: DataFrame = dataframe filled with user defined data + """ + + return diff --git a/freqtrade/freqai/prediction_models/CatboostPredictionModel.py b/freqtrade/freqai/prediction_models/CatboostPredictionModel.py new file mode 100644 index 000000000..7b0192fb8 --- /dev/null +++ b/freqtrade/freqai/prediction_models/CatboostPredictionModel.py @@ -0,0 +1,154 @@ +import logging +from typing import Any, Dict, Tuple + +from catboost import CatBoostRegressor, Pool +from pandas import DataFrame + +from freqtrade.freqai.data_kitchen import FreqaiDataKitchen +from freqtrade.freqai.freqai_interface import IFreqaiModel + + +logger = logging.getLogger(__name__) + + +class CatboostPredictionModel(IFreqaiModel): + """ + User created prediction model. The class needs to override three necessary + functions, predict(), train(), fit(). The class inherits ModelHandler which + has its own DataHandler where data is held, saved, loaded, and managed. + """ + + def return_values(self, dataframe: DataFrame, dk: FreqaiDataKitchen) -> DataFrame: + """ + User uses this function to add any additional return values to the dataframe. + e.g. + dataframe['volatility'] = dk.volatility_values + """ + + return dataframe + + def make_labels(self, dataframe: DataFrame, dk: FreqaiDataKitchen) -> DataFrame: + """ + User defines the labels here (target values). 
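+        The example below labels each candle with the mean of the next
+        `period` closes relative to the current close (a future percent-change
+        target).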
+ :params: + :dataframe: the full dataframe for the present training period + """ + + dataframe["s"] = ( + dataframe["close"] + .shift(-self.feature_parameters["period"]) + .rolling(self.feature_parameters["period"]) + .mean() + / dataframe["close"] + - 1 + ) + + return dataframe["s"] + + def train( + self, unfiltered_dataframe: DataFrame, pair: str, dk: FreqaiDataKitchen + ) -> Tuple[DataFrame, DataFrame]: + """ + Filter the training data and train a model to it. Train makes heavy use of the datahkitchen + for storing, saving, loading, and analyzing the data. + :params: + :unfiltered_dataframe: Full dataframe for the current training period + :metadata: pair metadata from strategy. + :returns: + :model: Trained model which can be used to inference (self.predict) + """ + + logger.info("--------------------Starting training " f"{pair} --------------------") + + # unfiltered_labels = self.make_labels(unfiltered_dataframe, dk) + # filter the features requested by user in the configuration file and elegantly handle NaNs + features_filtered, labels_filtered = dk.filter_features( + unfiltered_dataframe, + dk.training_features_list, + dk.label_list, + training_filter=True, + ) + + # split data into train/test data. + data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered) + dk.fit_labels() # fit labels to a cauchy distribution so we know what to expect in strategy + # normalize all data based on train_dataset only + data_dictionary = dk.normalize_data(data_dictionary) + + # optional additional data cleaning/analysis + self.data_cleaning_train(dk) + + logger.info( + f'Training model on {len(dk.data_dictionary["train_features"].columns)}' " features" + ) + logger.info(f'Training model on {len(data_dictionary["train_features"])} data points') + + model = self.fit(data_dictionary) + + logger.info(f"--------------------done training {pair}--------------------") + + return model + + def fit(self, data_dictionary: Dict) -> Any: + """ + User sets up the training and test data to fit their desired model here + :params: + :data_dictionary: the dictionary constructed by DataHandler to hold + all the training and test data/labels. + """ + + train_data = Pool( + data=data_dictionary["train_features"], + label=data_dictionary["train_labels"], + weight=data_dictionary["train_weights"], + ) + + test_data = Pool( + data=data_dictionary["test_features"], + label=data_dictionary["test_labels"], + weight=data_dictionary["test_weights"], + ) + + model = CatBoostRegressor( + allow_writing_files=False, + verbose=100, + early_stopping_rounds=400, + **self.model_training_parameters, + ) + model.fit(X=train_data, eval_set=test_data) + + return model + + def predict( + self, unfiltered_dataframe: DataFrame, dk: FreqaiDataKitchen, first: bool = False + ) -> Tuple[DataFrame, DataFrame]: + """ + Filter the prediction features data and predict with it. + :param: unfiltered_dataframe: Full dataframe for the current backtest period. 
+        :return:
+        :pred_df: dataframe containing the predictions
+        :do_predict: np.array of 1s and 0s to indicate places where freqai needed to remove
+        data (NaNs) or felt uncertain about data (PCA and DI index)
+        """
+
+        dk.find_features(unfiltered_dataframe)
+        filtered_dataframe, _ = dk.filter_features(
+            unfiltered_dataframe, dk.training_features_list, training_filter=False
+        )
+        filtered_dataframe = dk.normalize_data_from_metadata(filtered_dataframe)
+        dk.data_dictionary["prediction_features"] = filtered_dataframe
+
+        # optional additional data cleaning/analysis
+        self.data_cleaning_predict(dk, filtered_dataframe)
+
+        predictions = self.model.predict(dk.data_dictionary["prediction_features"])
+        pred_df = DataFrame(predictions, columns=dk.label_list)
+
+        # labels were normalized to [-1, 1] during training, so the raw predictions
+        # are mapped back to the original label range here
+        for label in dk.label_list:
+            pred_df[label] = (
+                (pred_df[label] + 1)
+                * (dk.data["labels_max"][label] - dk.data["labels_min"][label])
+                / 2
+            ) + dk.data["labels_min"][label]
+
+        return (pred_df, dk.do_predict)
diff --git a/freqtrade/freqai/prediction_models/CatboostPredictionMultiModel.py b/freqtrade/freqai/prediction_models/CatboostPredictionMultiModel.py
new file mode 100644
index 000000000..c4d92d7bb
--- /dev/null
+++ b/freqtrade/freqai/prediction_models/CatboostPredictionMultiModel.py
@@ -0,0 +1,133 @@
+import logging
+from typing import Any, Dict, Tuple
+
+from catboost import CatBoostRegressor  # , Pool
+from pandas import DataFrame
+from sklearn.multioutput import MultiOutputRegressor
+
+from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
+from freqtrade.freqai.freqai_interface import IFreqaiModel
+
+
+logger = logging.getLogger(__name__)
+
+
+class CatboostPredictionMultiModel(IFreqaiModel):
+    """
+    User created prediction model. The class needs to override three necessary
+    functions: predict(), train() and fit(). The class inherits IFreqaiModel, which
+    manages data loading, saving, and analysis through its own FreqaiDataKitchen.
+    """
+
+    def return_values(self, dataframe: DataFrame, dk: FreqaiDataKitchen) -> DataFrame:
+        """
+        User uses this function to add any additional return values to the dataframe.
+        e.g.
+        dataframe['volatility'] = dk.volatility_values
+        """
+
+        return dataframe
+
+    def train(
+        self, unfiltered_dataframe: DataFrame, pair: str, dk: FreqaiDataKitchen
+    ) -> Tuple[DataFrame, DataFrame]:
+        """
+        Filter the training data and train a model on it. Train makes heavy use of the
+        data kitchen (FreqaiDataKitchen) for storing, saving, loading, and analyzing the data.
+        :params:
+        :unfiltered_dataframe: Full dataframe for the current training period
+        :pair: the pair currently being trained on
+        :dk: FreqaiDataKitchen = Data management/analysis tool associated to present pair only
+        :returns:
+        :model: Trained model which can be used for inference (self.predict)
+        """
+
+        logger.info(f"--------------------Starting training {pair} --------------------")
+
+        # unfiltered_labels = self.make_labels(unfiltered_dataframe, dk)
+        # filter the features requested by the user in the configuration file and elegantly
+        # handle NaNs
+        features_filtered, labels_filtered = dk.filter_features(
+            unfiltered_dataframe,
+            dk.training_features_list,
+            dk.label_list,
+            training_filter=True,
+        )
+
+        # split data into train/test data.
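+        # (the split is presumably controlled by config["freqai"]["data_split_parameters"],
+        # e.g. test_size and random_state)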
+        data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered)
+        dk.fit_labels()  # fit labels to a Cauchy distribution so we know what to expect in strategy
+        # normalize all data based on train_dataset only
+        data_dictionary = dk.normalize_data(data_dictionary)
+
+        # optional additional data cleaning/analysis
+        self.data_cleaning_train(dk)
+
+        logger.info(
+            f'Training model on {len(dk.data_dictionary["train_features"].columns)} features'
+        )
+        logger.info(f'Training model on {len(data_dictionary["train_features"])} data points')
+
+        model = self.fit(data_dictionary)
+
+        logger.info(f"--------------------done training {pair}--------------------")
+
+        return model
+
+    def fit(self, data_dictionary: Dict) -> Any:
+        """
+        User sets up the training and test data to fit their desired model here
+        :params:
+        :data_dictionary: the dictionary constructed by the FreqaiDataKitchen to hold
+        all the training and test data/labels.
+        """
+
+        cbr = CatBoostRegressor(
+            allow_writing_files=False,
+            gpu_ram_part=0.5,
+            verbose=100,
+            early_stopping_rounds=400,
+            **self.model_training_parameters,
+        )
+
+        X = data_dictionary["train_features"]
+        y = data_dictionary["train_labels"]
+        # eval_set = (data_dictionary["test_features"], data_dictionary["test_labels"])
+        sample_weight = data_dictionary["train_weights"]
+
+        # MultiOutputRegressor fits one CatBoostRegressor per label, which allows
+        # several targets to be predicted from the same feature set
+        model = MultiOutputRegressor(estimator=cbr)
+        model.fit(X=X, y=y, sample_weight=sample_weight)  # , eval_set=eval_set)
+
+        return model
+
+    def predict(
+        self, unfiltered_dataframe: DataFrame, dk: FreqaiDataKitchen, first: bool = False
+    ) -> Tuple[DataFrame, DataFrame]:
+        """
+        Filter the prediction features data and predict with it.
+        :param: unfiltered_dataframe: Full dataframe for the current backtest period.
+        dk: FreqaiDataKitchen = Data management/analysis tool associated to present pair only
+        :return:
+        :pred_df: dataframe containing the predictions
+        :do_predict: np.array of 1s and 0s to indicate places where freqai needed to remove
+        data (NaNs) or felt uncertain about data (PCA and DI index)
+        """
+
+        dk.find_features(unfiltered_dataframe)
+        filtered_dataframe, _ = dk.filter_features(
+            unfiltered_dataframe, dk.training_features_list, training_filter=False
+        )
+        filtered_dataframe = dk.normalize_data_from_metadata(filtered_dataframe)
+        dk.data_dictionary["prediction_features"] = filtered_dataframe
+
+        # optional additional data cleaning/analysis
+        self.data_cleaning_predict(dk, filtered_dataframe)
+
+        predictions = self.model.predict(dk.data_dictionary["prediction_features"])
+        pred_df = DataFrame(predictions, columns=dk.label_list)
+
+        # map the normalized predictions back to the original label range
+        for label in dk.label_list:
+            pred_df[label] = (
+                (pred_df[label] + 1)
+                * (dk.data["labels_max"][label] - dk.data["labels_min"][label])
+                / 2
+            ) + dk.data["labels_min"][label]
+
+        return (pred_df, dk.do_predict)
diff --git a/freqtrade/freqai/prediction_models/LightGBMPredictionModel.py b/freqtrade/freqai/prediction_models/LightGBMPredictionModel.py
new file mode 100644
index 000000000..17700a97f
--- /dev/null
+++ b/freqtrade/freqai/prediction_models/LightGBMPredictionModel.py
@@ -0,0 +1,127 @@
+import logging
+from typing import Any, Dict, Tuple
+
+from lightgbm import LGBMRegressor
+from pandas import DataFrame
+
+from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
+from freqtrade.freqai.freqai_interface import IFreqaiModel
+
+
+logger = logging.getLogger(__name__)
+
+
+class LightGBMPredictionModel(IFreqaiModel):
+    """
+    User created prediction model. The class needs to override three necessary
+    functions: predict(), train() and fit(). The class inherits IFreqaiModel, which
+    manages data loading, saving, and analysis through its own FreqaiDataKitchen.
+    """
+
+    def return_values(self, dataframe: DataFrame, dk: FreqaiDataKitchen) -> DataFrame:
+        """
+        User uses this function to add any additional return values to the dataframe.
+        e.g.
+        dataframe['volatility'] = dk.volatility_values
+        """
+
+        return dataframe
+
+    def train(
+        self, unfiltered_dataframe: DataFrame, pair: str, dk: FreqaiDataKitchen
+    ) -> Tuple[DataFrame, DataFrame]:
+        """
+        Filter the training data and train a model on it. Train makes heavy use of the
+        data kitchen (FreqaiDataKitchen) for storing, saving, loading, and analyzing the data.
+        :params:
+        :unfiltered_dataframe: Full dataframe for the current training period
+        :pair: the pair currently being trained on
+        :dk: FreqaiDataKitchen = Data management/analysis tool associated to present pair only
+        :returns:
+        :model: Trained model which can be used for inference (self.predict)
+        """
+
+        logger.info(f"--------------------Starting training {pair} --------------------")
+
+        # unfiltered_labels = self.make_labels(unfiltered_dataframe, dk)
+        # filter the features requested by the user in the configuration file and elegantly
+        # handle NaNs
+        features_filtered, labels_filtered = dk.filter_features(
+            unfiltered_dataframe,
+            dk.training_features_list,
+            dk.label_list,
+            training_filter=True,
+        )
+
+        # split data into train/test data.
+        data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered)
+        dk.fit_labels()  # fit labels to a Cauchy distribution so we know what to expect in strategy
+        # normalize all data based on train_dataset only
+        data_dictionary = dk.normalize_data(data_dictionary)
+
+        # optional additional data cleaning/analysis
+        self.data_cleaning_train(dk)
+
+        logger.info(
+            f'Training model on {len(dk.data_dictionary["train_features"].columns)} features'
+        )
+        logger.info(f'Training model on {len(data_dictionary["train_features"])} data points')
+
+        model = self.fit(data_dictionary)
+
+        logger.info(f"--------------------done training {pair}--------------------")
+
+        return model
+
+    def fit(self, data_dictionary: Dict) -> Any:
+        """
+        Most regressors use the same function names and arguments, e.g. the user
+        can drop in LGBMRegressor in place of CatBoostRegressor and all data
+        management will be properly handled by Freqai.
+        :params:
+        :data_dictionary: the dictionary constructed by the FreqaiDataKitchen to hold
+        all the training and test data/labels.
+        """
+
+        eval_set = (data_dictionary["test_features"], data_dictionary["test_labels"])
+        X = data_dictionary["train_features"]
+        y = data_dictionary["train_labels"]
+
+        model = LGBMRegressor(seed=42, n_estimators=2000, verbosity=1, force_col_wise=True)
+        model.fit(X=X, y=y, eval_set=eval_set)
+
+        return model
+
+    def predict(
+        self, unfiltered_dataframe: DataFrame, dk: FreqaiDataKitchen
+    ) -> Tuple[DataFrame, DataFrame]:
+        """
+        Filter the prediction features data and predict with it.
+        :param: unfiltered_dataframe: Full dataframe for the current backtest period.
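+        dk: FreqaiDataKitchen = Data management/analysis tool associated to present pair only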
+        :return:
+        :predictions: np.array of predictions
+        :do_predict: np.array of 1s and 0s to indicate places where freqai needed to remove
+        data (NaNs) or felt uncertain about data (PCA and DI index)
+        """
+
+        # logger.info("--------------------Starting prediction--------------------")
+
+        original_feature_list = dk.find_features(unfiltered_dataframe)
+        filtered_dataframe, _ = dk.filter_features(
+            unfiltered_dataframe, original_feature_list, training_filter=False
+        )
+        filtered_dataframe = dk.normalize_data_from_metadata(filtered_dataframe)
+        dk.data_dictionary["prediction_features"] = filtered_dataframe
+
+        # optional additional data cleaning/analysis
+        self.data_cleaning_predict(dk, filtered_dataframe)
+
+        predictions = self.model.predict(dk.data_dictionary["prediction_features"])
+        pred_df = DataFrame(predictions, columns=dk.label_list)
+
+        # map the normalized predictions back to the original label range
+        for label in dk.label_list:
+            pred_df[label] = (
+                (pred_df[label] + 1)
+                * (dk.data["labels_max"][label] - dk.data["labels_min"][label])
+                / 2
+            ) + dk.data["labels_min"][label]
+
+        return (pred_df, dk.do_predict)
diff --git a/freqtrade/freqai/strategy_bridge.py b/freqtrade/freqai/strategy_bridge.py
new file mode 100644
index 000000000..bb43084a0
--- /dev/null
+++ b/freqtrade/freqai/strategy_bridge.py
@@ -0,0 +1,12 @@
+from freqtrade.resolvers.freqaimodel_resolver import FreqaiModelResolver
+
+
+class CustomModel:
+    """
+    A bridge between the user-defined IFreqaiModel class
+    and the strategy.
+    """
+
+    def __init__(self, config):
+
+        self.bridge = FreqaiModelResolver.load_freqaimodel(config)
diff --git a/freqtrade/optimize/backtesting.py b/freqtrade/optimize/backtesting.py
index 030d7bdf0..37078b85a 100755
--- a/freqtrade/optimize/backtesting.py
+++ b/freqtrade/optimize/backtesting.py
@@ -206,6 +206,11 @@ class Backtesting:
         """
         self.progress.init_step(BacktestState.DATALOAD, 1)
 
+        if self.config.get('freqai') is not None:
+            # freqai needs extra history for its first training, so extend the startup period
+            self.required_startup += int(self.config.get('freqai', {}).get('startup_candles', 1000))
+            logger.info(f'Increasing startup_candle_count for freqai to {self.required_startup}')
+            self.config['startup_candle_count'] = self.required_startup
+
         data = history.load_data(
             datadir=self.config['datadir'],
             pairs=self.pairlists.whitelist,
diff --git a/freqtrade/plugins/pairlist/pairlist_helpers.py b/freqtrade/plugins/pairlist/pairlist_helpers.py
index 1de27fcbd..23233481a 100644
--- a/freqtrade/plugins/pairlist/pairlist_helpers.py
+++ b/freqtrade/plugins/pairlist/pairlist_helpers.py
@@ -40,3 +40,14 @@ def expand_pairlist(wildcardpl: List[str], available_pairs: List[str],
         except re.error as err:
             raise ValueError(f"Wildcard error in {pair_wc}, {err}")
     return result
+
+
+def dynamic_expand_pairlist(config: dict, markets: list) -> List[str]:
+    # expand the configured pairs, adding any freqai corr_pairlist pairs
+    # that are not already in the whitelist
+    if config.get('freqai', {}):
+        full_pairs = config['pairs'] + [pair for pair in config['freqai']['corr_pairlist']
+                                        if pair not in config['pairs']]
+        expanded_pairs = expand_pairlist(full_pairs, markets)
+    else:
+        expanded_pairs = expand_pairlist(config['pairs'], markets)
+
+    return expanded_pairs
diff --git a/freqtrade/resolvers/exchange_resolver.py b/freqtrade/resolvers/exchange_resolver.py
index 4dfbf445b..c1ec8b69c 100644
--- a/freqtrade/resolvers/exchange_resolver.py
+++ b/freqtrade/resolvers/exchange_resolver.py
@@ -18,7 +18,8 @@ class ExchangeResolver(IResolver):
     object_type = Exchange
 
     @staticmethod
-    def load_exchange(exchange_name: str, config: dict, validate: bool = True) -> Exchange:
+    def load_exchange(exchange_name: str, config: dict, validate: bool = True,
+                      freqai: bool = False) -> Exchange:
         """
         Load the custom class from config parameter
         :param exchange_name: name of the Exchange to load
@@ -31,7 +32,8 @@ class ExchangeResolver(IResolver):
         try:
             exchange = ExchangeResolver._load_exchange(exchange_name,
                                                        kwargs={'config': config,
-                                                               'validate': validate})
+                                                               'validate': validate,
+                                                               'freqai': freqai})
         except ImportError:
             logger.info(
                 f"No {exchange_name} specific subclass found. Using the generic class instead.")
diff --git a/freqtrade/resolvers/freqaimodel_resolver.py b/freqtrade/resolvers/freqaimodel_resolver.py
new file mode 100644
index 000000000..e666b462c
--- /dev/null
+++ b/freqtrade/resolvers/freqaimodel_resolver.py
@@ -0,0 +1,50 @@
+# pragma pylint: disable=attribute-defined-outside-init
+
+"""
+This module loads a custom model for freqai
+"""
+import logging
+from pathlib import Path
+from typing import Dict
+
+from freqtrade.constants import USERPATH_FREQAIMODELS
+from freqtrade.exceptions import OperationalException
+from freqtrade.freqai.freqai_interface import IFreqaiModel
+from freqtrade.resolvers import IResolver
+
+
+logger = logging.getLogger(__name__)
+
+
+class FreqaiModelResolver(IResolver):
+    """
+    This class contains all the logic to load a custom IFreqaiModel class
+    """
+
+    object_type = IFreqaiModel
+    object_type_str = "FreqaiModel"
+    user_subdir = USERPATH_FREQAIMODELS
+    initial_search_path = Path(__file__).parent.parent.joinpath(
+        "freqai/prediction_models").resolve()
+
+    @staticmethod
+    def load_freqaimodel(config: Dict) -> IFreqaiModel:
+        """
+        Load the custom class from config parameter
+        :param config: configuration dictionary
+        """
+
+        freqaimodel_name = config.get("freqaimodel")
+        if not freqaimodel_name:
+            raise OperationalException(
+                "No freqaimodel set. Please use `--freqaimodel` to "
+                "specify the FreqaiModel class to use.\n"
+            )
+        freqaimodel = FreqaiModelResolver.load_object(
+            freqaimodel_name,
+            config,
+            kwargs={"config": config},
+            extra_dir=config.get("freqaimodel_path"),
+        )
+
+        return freqaimodel
diff --git a/freqtrade/strategy/interface.py b/freqtrade/strategy/interface.py
index d4ccfc5db..11cec1fae 100644
--- a/freqtrade/strategy/interface.py
+++ b/freqtrade/strategy/interface.py
@@ -546,6 +546,23 @@ class IStrategy(ABC, HyperStrategyMixin):
         """
         return None
 
+    def populate_any_indicators(self, basepair: str, pair: str, df: DataFrame, tf: str,
+                                informative: DataFrame = None, coin: str = "",
+                                set_generalized_indicators: bool = False) -> DataFrame:
+        """
+        Function designed to automatically generate, name, and merge features
+        from user-indicated timeframes in the configuration file. The user can add
+        additional features here, but must follow the naming convention.
+        Defined in IStrategy because Freqai needs to know it exists.
+        :params:
+        :pair: pair to be used as informative
+        :df: strategy dataframe which will receive merges from informatives
+        :tf: timeframe of the dataframe which will modify the feature names
+        :informative: the dataframe associated with the informative pair
+        :coin: the name of the coin which will modify the feature names.
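+        :set_generalized_indicators: whether to also set indicators/targets that are defined
+        only once (e.g. day of week) rather than once per pair and timeframe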
+ """ + return df + ### # END - Intended to be overridden by strategy ### diff --git a/freqtrade/templates/FreqaiExampleStrategy.py b/freqtrade/templates/FreqaiExampleStrategy.py new file mode 100644 index 000000000..1140ba383 --- /dev/null +++ b/freqtrade/templates/FreqaiExampleStrategy.py @@ -0,0 +1,342 @@ +import logging +from functools import reduce + +import pandas as pd +import talib.abstract as ta +from pandas import DataFrame +from technical import qtpylib + +from freqtrade.exchange import timeframe_to_prev_date +from freqtrade.freqai.strategy_bridge import CustomModel +from freqtrade.persistence import Trade +from freqtrade.strategy import DecimalParameter, IntParameter, merge_informative_pair +from freqtrade.strategy.interface import IStrategy + + +logger = logging.getLogger(__name__) + + +class FreqaiExampleStrategy(IStrategy): + """ + Example strategy showing how the user connects their own + IFreqaiModel to the strategy. Namely, the user uses: + self.model = CustomModel(self.config) + self.model.bridge.start(dataframe, metadata) + + to make predictions on their data. populate_any_indicators() automatically + generates the variety of features indicated by the user in the + canonical freqtrade configuration file under config['freqai']. + """ + + minimal_roi = {"0": 0.1, "240": -1} + + plot_config = { + "main_plot": {}, + "subplots": { + "prediction": {"prediction": {"color": "blue"}}, + "target_roi": { + "target_roi": {"color": "brown"}, + }, + "do_predict": { + "do_predict": {"color": "brown"}, + }, + }, + } + + process_only_new_candles = True + stoploss = -0.05 + use_exit_signal = True + startup_candle_count: int = 300 + can_short = False + + linear_roi_offset = DecimalParameter( + 0.00, 0.02, default=0.005, space="sell", optimize=False, load=True + ) + max_roi_time_long = IntParameter(0, 800, default=400, space="sell", optimize=False, load=True) + + def informative_pairs(self): + whitelist_pairs = self.dp.current_whitelist() + corr_pairs = self.config["freqai"]["corr_pairlist"] + informative_pairs = [] + for tf in self.config["freqai"]["timeframes"]: + for pair in whitelist_pairs: + informative_pairs.append((pair, tf)) + for pair in corr_pairs: + if pair in whitelist_pairs: + continue # avoid duplication + informative_pairs.append((pair, tf)) + return informative_pairs + + def bot_start(self): + self.model = CustomModel(self.config) + + def populate_any_indicators( + self, metadata, pair, df, tf, informative=None, coin="", set_generalized_indicators=False + ): + """ + Function designed to automatically generate, name and merge features + from user indicated timeframes in the configuration file. User controls the indicators + passed to the training/prediction by prepending indicators with `'%-' + coin ` + (see convention below). I.e. user should not prepend any supporting metrics + (e.g. bb_lowerband below) with % unless they explicitly want to pass that metric to the + model. + :params: + :pair: pair to be used as informative + :df: strategy dataframe which will receive merges from informatives + :tf: timeframe of the dataframe which will modify the feature names + :informative: the dataframe associated with the informative pair + :coin: the name of the coin which will modify the feature names. 
+ """ + + with self.model.bridge.lock: + if informative is None: + informative = self.dp.get_pair_dataframe(pair, tf) + + # first loop is automatically duplicating indicators for time periods + for t in self.freqai_info["feature_parameters"]["indicator_periods"]: + + t = int(t) + informative[f"%-{coin}rsi-period_{t}"] = ta.RSI(informative, timeperiod=t) + informative[f"%-{coin}mfi-period_{t}"] = ta.MFI(informative, timeperiod=t) + informative[f"%-{coin}adx-period_{t}"] = ta.ADX(informative, window=t) + informative[f"{coin}20sma-period_{t}"] = ta.SMA(informative, timeperiod=t) + informative[f"{coin}21ema-period_{t}"] = ta.EMA(informative, timeperiod=t) + informative[f"%-{coin}close_over_20sma-period_{t}"] = ( + informative["close"] / informative[f"{coin}20sma-period_{t}"] + ) + + informative[f"%-{coin}mfi-period_{t}"] = ta.MFI(informative, timeperiod=t) + + bollinger = qtpylib.bollinger_bands( + qtpylib.typical_price(informative), window=t, stds=2.2 + ) + informative[f"{coin}bb_lowerband-period_{t}"] = bollinger["lower"] + informative[f"{coin}bb_middleband-period_{t}"] = bollinger["mid"] + informative[f"{coin}bb_upperband-period_{t}"] = bollinger["upper"] + + informative[f"%-{coin}bb_width-period_{t}"] = ( + informative[f"{coin}bb_upperband-period_{t}"] + - informative[f"{coin}bb_lowerband-period_{t}"] + ) / informative[f"{coin}bb_middleband-period_{t}"] + informative[f"%-{coin}close-bb_lower-period_{t}"] = ( + informative["close"] / informative[f"{coin}bb_lowerband-period_{t}"] + ) + + informative[f"%-{coin}roc-period_{t}"] = ta.ROC(informative, timeperiod=t) + macd = ta.MACD(informative, timeperiod=t) + informative[f"%-{coin}macd-period_{t}"] = macd["macd"] + + informative[f"%-{coin}relative_volume-period_{t}"] = ( + informative["volume"] / informative["volume"].rolling(t).mean() + ) + + informative[f"%-{coin}pct-change"] = informative["close"].pct_change() + informative[f"%-{coin}raw_volume"] = informative["volume"] + informative[f"%-{coin}raw_price"] = informative["close"] + + indicators = [col for col in informative if col.startswith("%")] + # This loop duplicates and shifts all indicators to add a sense of recency to data + for n in range(self.freqai_info["feature_parameters"]["shift"] + 1): + if n == 0: + continue + informative_shift = informative[indicators].shift(n) + informative_shift = informative_shift.add_suffix("_shift-" + str(n)) + informative = pd.concat((informative, informative_shift), axis=1) + + df = merge_informative_pair(df, informative, self.config["timeframe"], tf, ffill=True) + skip_columns = [ + (s + "_" + tf) for s in ["date", "open", "high", "low", "close", "volume"] + ] + df = df.drop(columns=skip_columns) + + # Add generalized indicators here (because in live, it will call this + # function to populate indicators during training). 
+            # add them multiple times
+            if set_generalized_indicators:
+                df["%-day_of_week"] = (df["date"].dt.dayofweek + 1) / 7
+                df["%-hour_of_day"] = (df["date"].dt.hour + 1) / 25
+
+                # user adds targets here by prepending them with &- (see convention below)
+                # If the user wishes to use multiple targets, a multioutput prediction model
+                # must be used, such as freqai/prediction_models/CatboostPredictionMultiModel.py
+                df["&-s_close"] = (
+                    df["close"]
+                    .shift(-self.freqai_info["feature_parameters"]["period"])
+                    .rolling(self.freqai_info["feature_parameters"]["period"])
+                    .mean()
+                    / df["close"]
+                    - 1
+                )
+
+        return df
+
+    def populate_indicators(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
+
+        self.freqai_info = self.config["freqai"]
+        self.pair = metadata["pair"]
+        sgi = True
+        # the following loops are necessary for building the features
+        # indicated by the user in the configuration file.
+        # All indicators must be populated by populate_any_indicators() for live functionality
+        # to work correctly.
+        for tf in self.freqai_info["timeframes"]:
+            dataframe = self.populate_any_indicators(
+                metadata,
+                self.pair,
+                dataframe.copy(),
+                tf,
+                coin=self.pair.split("/")[0] + "-",
+                set_generalized_indicators=sgi,
+            )
+            sgi = False
+            for pair in self.freqai_info["corr_pairlist"]:
+                if metadata["pair"] in pair:
+                    continue  # do not include whitelisted pair twice if it is in corr_pairlist
+                dataframe = self.populate_any_indicators(
+                    metadata, pair, dataframe.copy(), tf, coin=pair.split("/")[0] + "-"
+                )
+
+        # the model returns the prediction for each label, a do_predict flag indicating
+        # whether the prediction should be accepted, and the target mean/std values from
+        # the labels used during each training period
+        dataframe = self.model.bridge.start(dataframe, metadata, self)
+
+        dataframe["target_roi"] = dataframe["&-s_close_mean"] + dataframe["&-s_close_std"] * 1.25
+        dataframe["sell_roi"] = dataframe["&-s_close_mean"] - dataframe["&-s_close_std"] * 1.25
+        return dataframe
+
+    def populate_entry_trend(self, df: DataFrame, metadata: dict) -> DataFrame:
+
+        enter_long_conditions = [df["do_predict"] == 1, df["&-s_close"] > df["target_roi"]]
+
+        if enter_long_conditions:
+            df.loc[
+                reduce(lambda x, y: x & y, enter_long_conditions), ["enter_long", "enter_tag"]
+            ] = (1, "long")
+
+        enter_short_conditions = [df["do_predict"] == 1, df["&-s_close"] < df["sell_roi"]]
+
+        if enter_short_conditions:
+            df.loc[
+                reduce(lambda x, y: x & y, enter_short_conditions), ["enter_short", "enter_tag"]
+            ] = (1, "short")
+
+        return df
+
+    def populate_exit_trend(self, df: DataFrame, metadata: dict) -> DataFrame:
+        exit_long_conditions = [df["do_predict"] == 1, df["&-s_close"] < df["sell_roi"] * 0.25]
+        if exit_long_conditions:
+            df.loc[reduce(lambda x, y: x & y, exit_long_conditions), "exit_long"] = 1
+
+        exit_short_conditions = [df["do_predict"] == 1, df["&-s_close"] > df["target_roi"] * 0.25]
+        if exit_short_conditions:
+            df.loc[reduce(lambda x, y: x & y, exit_short_conditions), "exit_short"] = 1
+
+        return df
+
+    def get_ticker_indicator(self):
+        return int(self.config["timeframe"][:-1])
+
+    def custom_exit(
+        self, pair: str, trade: Trade, current_time, current_rate, current_profit, **kwargs
+    ):
+
+        dataframe, _ = self.dp.get_analyzed_dataframe(pair=pair, timeframe=self.timeframe)
+
+        trade_date = timeframe_to_prev_date(self.config["timeframe"], trade.open_date_utc)
+        trade_candle = dataframe.loc[(dataframe["date"] == trade_date)]
+
+        if trade_candle.empty:
+            return None
+        trade_candle = trade_candle.squeeze()
+
+        follow_mode = self.config.get("freqai", {}).get("follow_mode", False)
+
+        if not follow_mode:
+            pair_dict = self.model.bridge.dd.pair_dict
+        else:
+            pair_dict = self.model.bridge.dd.follower_dict
+
+        entry_tag = trade.enter_tag
+
+        if (
+            "prediction" + entry_tag not in pair_dict[pair]
+            or pair_dict[pair]["prediction" + entry_tag] > 0
+        ):
+            with self.model.bridge.lock:
+                pair_dict[pair]["prediction" + entry_tag] = abs(trade_candle["&-s_close"])
+                if not follow_mode:
+                    self.model.bridge.dd.save_drawer_to_disk()
+                else:
+                    self.model.bridge.dd.save_follower_dict_to_disk()
+
+        roi_price = pair_dict[pair]["prediction" + entry_tag]
+        roi_time = self.max_roi_time_long.value
+
+        # linearly decay the expected ROI toward zero over roi_time minutes
+        roi_decay = roi_price * (
+            1 - ((current_time - trade.open_date_utc).seconds) / (roi_time * 60)
+        )
+        if roi_decay < 0:
+            roi_decay = self.linear_roi_offset.value
+        else:
+            roi_decay += self.linear_roi_offset.value
+
+        if current_profit > roi_decay:
+            return "roi_custom_win"
+
+        if current_profit < -roi_decay:
+            return "roi_custom_loss"
+
+    def confirm_trade_exit(
+        self,
+        pair: str,
+        trade: Trade,
+        order_type: str,
+        amount: float,
+        rate: float,
+        time_in_force: str,
+        exit_reason: str,
+        current_time,
+        **kwargs,
+    ) -> bool:
+
+        entry_tag = trade.enter_tag
+        follow_mode = self.config.get("freqai", {}).get("follow_mode", False)
+        if not follow_mode:
+            pair_dict = self.model.bridge.dd.pair_dict
+        else:
+            pair_dict = self.model.bridge.dd.follower_dict
+
+        with self.model.bridge.lock:
+            pair_dict[pair]["prediction" + entry_tag] = 0
+            if not follow_mode:
+                self.model.bridge.dd.save_drawer_to_disk()
+            else:
+                self.model.bridge.dd.save_follower_dict_to_disk()
+
+        return True
+
+    def confirm_trade_entry(
+        self,
+        pair: str,
+        order_type: str,
+        amount: float,
+        rate: float,
+        time_in_force: str,
+        current_time,
+        entry_tag,
+        side: str,
+        **kwargs,
+    ) -> bool:
+
+        df, _ = self.dp.get_analyzed_dataframe(pair, self.timeframe)
+        last_candle = df.iloc[-1].squeeze()
+
+        # reject entries that are more than 0.25% away from the last close
+        if side == "long":
+            if rate > (last_candle["close"] * (1 + 0.0025)):
+                return False
+        else:
+            if rate < (last_candle["close"] * (1 - 0.0025)):
+                return False
+
+        return True
diff --git a/mkdocs.yml b/mkdocs.yml
index a43322f78..18744e0d5 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -35,6 +35,7 @@ nav:
         - Edge Positioning: edge.md
         - Advanced Strategy: strategy-advanced.md
         - Advanced Hyperopt: advanced-hyperopt.md
+        - Freqai: freqai.md
         - Sandbox Testing: sandbox-testing.md
         - FAQ: faq.md
         - SQL Cheat-sheet: sql_cheatsheet.md
diff --git a/requirements-freqai.txt b/requirements-freqai.txt
new file mode 100644
index 000000000..a06a41b96
--- /dev/null
+++ b/requirements-freqai.txt
@@ -0,0 +1,9 @@
+# Include all requirements to run the bot.
+-r requirements.txt
+
+# Required for freqai
+scikit-learn==1.0.2
+scikit-optimize==0.9.0
+joblib==1.1.0
+catboost==1.0.4
+lightgbm==3.3.2
diff --git a/user_data/freqaimodels/.gitkeep b/user_data/freqaimodels/.gitkeep
new file mode 100644
index 000000000..e69de29bb