freqtrade_origin/freqtrade/freqai/utils.py

import logging
from datetime import datetime, timezone
# for plot_feature_importance
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

from freqtrade.configuration import TimeRange
from freqtrade.data.dataprovider import DataProvider
from freqtrade.data.history.history_utils import refresh_backtest_ohlcv_data
from freqtrade.exceptions import OperationalException
from freqtrade.exchange import timeframe_to_seconds
from freqtrade.exchange.exchange import market_is_active
from freqtrade.plugins.pairlist.pairlist_helpers import dynamic_expand_pairlist


logger = logging.getLogger(__name__)


def download_all_data_for_training(dp: DataProvider, config: dict) -> None:
    """
    Download the OHLCV data required for populating indicators and training the model.

    The exchange markets are filtered and expanded into the final pair list, the
    download window is obtained from `get_required_data_timerange`, and the
    result is passed on to `refresh_backtest_ohlcv_data`.
    :param dp: DataProvider instance attached to the strategy
    :param config: bot configuration dictionary; the "freqai" feature parameters and
                   "datadir" are read, as well as the optional "include_inactive",
                   "dataformat_ohlcv", "trading_mode" and "prepend_data" keys
    :raises OperationalException: if the DataProvider has no exchange attached
    """
    exchange = dp._exchange
    if exchange is None:
        raise OperationalException('No exchange object found.')

    # Inactive markets are only kept when the config explicitly asks for them.
    candidate_markets = []
    for pair_name, market in exchange.markets.items():
        if market_is_active(market) or config.get('include_inactive'):
            candidate_markets.append(pair_name)

    all_pairs = dynamic_expand_pairlist(config, candidate_markets)

    timerange = get_required_data_timerange(config)

    # Length of the download window expressed in whole days.
    window_seconds = timerange.stopts - timerange.startts
    new_pairs_days = int(window_seconds / 86400)

    download_kwargs = dict(
        pairs=all_pairs,
        timeframes=config["freqai"]["feature_parameters"].get("include_timeframes"),
        datadir=config["datadir"],
        timerange=timerange,
        new_pairs_days=new_pairs_days,
        erase=False,
        data_format=config.get("dataformat_ohlcv", "json"),
        trading_mode=config.get("trading_mode", "spot"),
        prepend=config.get("prepend_data", False),
    )
    refresh_backtest_ohlcv_data(exchange, **download_kwargs)


def get_required_data_timerange(
    config: dict
) -> TimeRange:
    """
    Compute the time range that the FreqAI auto data-download has to cover.

    The range ends at the current UTC time and reaches back by the configured
    "train_period_days" plus a startup buffer. The buffer is 1.5 times the larger
    of "startup_candle_count" and the biggest entry of "indicator_periods_candles",
    measured in candles of the largest timeframe in "include_timeframes".

    Side effect: config['startup_candle_count'] is overwritten with the computed
    number of startup candles.
    :param config: bot configuration dictionary containing the "freqai" section
    :return: TimeRange spanning the computed start and stop timestamps
    """
    now_ts = datetime.now(tz=timezone.utc).timestamp()

    timeframes = config["freqai"]["feature_parameters"].get("include_timeframes")

    # Largest timeframe in seconds; the leading 0 is the floor for an empty list.
    max_tf_seconds = max([0, *(timeframe_to_seconds(tf) for tf in timeframes)])

    startup_candles = config.get('startup_candle_count', 0)
    indicator_periods = config["freqai"]["feature_parameters"]["indicator_periods_candles"]

    # The 1.5 multiplier acts as a factor of safety on the required candle count.
    longest_indicator = max(indicator_periods)
    max_period = int(max(startup_candles, longest_indicator) * 1.5)
    config['startup_candle_count'] = max_period
    logger.info(f'FreqAI auto-downloader using {max_period} startup candles.')

    additional_seconds = max_period * max_tf_seconds
    train_seconds = config["freqai"].get("train_period_days", 0) * 86400

    startts = int(now_ts - train_seconds - additional_seconds)
    stopts = int(now_ts)

    return TimeRange('date', 'date', startts, stopts)


# Keep below for when we wish to download data of heterogeneous lengths for FreqAI.
# def download_all_data_for_training(dp: DataProvider, config: dict) -> None:
#     """
#     Called only once upon start of bot to download the necessary data for
#     populating indicators and training a FreqAI model.
#     :param timerange: TimeRange = The full data timerange for populating the indicators
#                                     and training the model.
#     :param dp: DataProvider instance attached to the strategy
#     """

#     if dp._exchange is not None:
#         markets = [p for p, m in dp._exchange.markets.items() if market_is_active(m)
#                    or config.get('include_inactive')]
#     else:
#         # This should not occur:
#         raise OperationalException('No exchange object found.')

#     all_pairs = dynamic_expand_pairlist(config, markets)

#     if not dp._exchange:
#         # Not realistic - this is only called in live mode.
#         raise OperationalException("Dataprovider did not have an exchange attached.")

#     time = datetime.now(tz=timezone.utc).timestamp()

#     for tf in config["freqai"]["feature_parameters"].get("include_timeframes"):
#         timerange = TimeRange()
#         timerange.startts = int(time)
#         timerange.stopts = int(time)
#         startup_candles = dp.get_required_startup(str(tf))
#         tf_seconds = timeframe_to_seconds(str(tf))
#         timerange.subtract_start(tf_seconds * startup_candles)
#         new_pairs_days = int((timerange.stopts - timerange.startts) / 86400)
#         # FIXME: now that we are looping on `refresh_backtest_ohlcv_data`, the function
#         # redownloads the funding rate for each pair.
#         refresh_backtest_ohlcv_data(
#             dp._exchange,
#             pairs=all_pairs,
#             timeframes=[tf],
#             datadir=config["datadir"],
#             timerange=timerange,
#             new_pairs_days=new_pairs_days,
#             erase=False,
#             data_format=config.get("dataformat_ohlcv", "json"),
#             trading_mode=config.get("trading_mode", "spot"),
#             prepend=config.get("prepend_data", False),
#         )


def plot_feature_importance(model, feature_names, pair, train_dir, count_max=25) -> None:
    """
    Plot the best and worst features by importance for a CatBoost or LightGBM model.
    Called once per sub-train.

    The image is written to
    `<parent of train_dir>/feature_importance/<base of pair>/<name of train_dir>.png`.

    Required: pip install kaleido

    Usage: plot_feature_importance(
        model=model,
        feature_names=dk.training_features_list,
        pair=pair,
        train_dir=dk.data_path)

    :param model: trained model; its class must come from `catboost.core` or
                  `lightgbm.sklearn`
    :param feature_names: names of the features, in the same order as the
                          importances reported by the model
    :param pair: pair the model was trained for; the part before the first "/"
                 names the output sub-directory
    :param train_dir: directory of the current sub-train (str or Path)
    :param count_max: maximum number of features shown in each of the two charts
    :raises NotImplementedError: if the model class is not supported
    """

    # Gather feature importance from model
    model_class = str(model.__class__)
    if "catboost.core" in model_class:
        fi = model.get_feature_importance()

    elif "lightgbm.sklearn" in model_class:
        fi = model.feature_importances_

    else:
        raise NotImplementedError(f"Cannot extract feature importance for {model.__class__}")

    # Data preparation
    fi_df = pd.DataFrame({
        "feature_names": np.array(feature_names),
        "feature_importance": np.array(fi)
    })
    # Rows are reversed so the order of the horizontal bars is flipped in the chart.
    fi_df_top = fi_df.nlargest(count_max, "feature_importance")[::-1]
    fi_df_worst = fi_df.nsmallest(count_max, "feature_importance")[::-1]

    # Plotting
    fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.5)
    fig.add_trace(
        go.Bar(
            x=fi_df_top["feature_importance"],
            y=fi_df_top["feature_names"],
            orientation='h', showlegend=False
        ), row=1, col=1
    )
    fig.add_trace(
        go.Bar(
            x=fi_df_worst["feature_importance"],
            y=fi_df_worst["feature_names"],
            orientation='h', showlegend=False
        ), row=1, col=2
    )
    fig.update_layout(
        title_text=f"Best and Worst Features {pair}",
        width=1000, height=600
    )

    # Create directory and save image.
    # pathlib is used instead of splitting on "/" so that paths with other
    # separators, a trailing slash, or no parent component are handled too.
    train_path = Path(train_dir)
    fi_dir = train_path.parent / "feature_importance" / pair.split('/')[0]
    fi_dir.mkdir(parents=True, exist_ok=True)
    plot_file = fi_dir / f"{train_path.name}.png"

    pio.write_image(fig, str(plot_file), format="png")

    logger.info(f"Freqai saving feature importance plot {plot_file}")