# Source: freqtrade_origin/freqtrade/data/converter/converter.py (Python, 303 lines, 10 KiB)
"""
Functions to convert data from one format to another
"""

import logging
from typing import Dict

import numpy as np
import pandas as pd
from pandas import DataFrame, to_datetime

from freqtrade.constants import DEFAULT_DATAFRAME_COLUMNS, Config
from freqtrade.enums import CandleType, TradingMode


# Module-level logger shared by every converter function in this file.
logger = logging.getLogger(__name__)


def ohlcv_to_dataframe(
    ohlcv: list,
    timeframe: str,
    pair: str,
    *,
    fill_missing: bool = True,
    drop_incomplete: bool = True,
) -> DataFrame:
    """
    Converts a list with candle (OHLCV) data (in format returned by ccxt.fetch_ohlcv)
    to a Dataframe
    :param ohlcv: list with candle (OHLCV) data, as returned by exchange.async_get_candle_history
    :param timeframe: timeframe (e.g. 5m). Used to fill up eventual missing data
    :param pair: Pair this data is for (used to warn if fillup was necessary)
    :param fill_missing: fill up missing candles with 0-volume candles
                         (see ohlcv_fill_up_missing_data for details)
    :param drop_incomplete: Drop the last candle of the dataframe, assuming it's incomplete
    :return: DataFrame
    """
    logger.debug(f"Converting candle (OHLCV) data to dataframe for pair {pair}.")
    df = DataFrame(ohlcv, columns=DEFAULT_DATAFRAME_COLUMNS)

    # The raw "date" values are interpreted as epoch milliseconds and become UTC datetimes.
    df["date"] = to_datetime(df["date"], unit="ms", utc=True)

    # Some exchanges return int values for Volume and even for OHLC.
    # Convert them since TA-LIB indicators used in the strategy assume floats
    # and fail with exception...
    df = df.astype(
        dtype={
            "open": "float",
            "high": "float",
            "low": "float",
            "close": "float",
            "volume": "float",
        }
    )
    return clean_ohlcv_dataframe(
        df, timeframe, pair, fill_missing=fill_missing, drop_incomplete=drop_incomplete
    )


def clean_ohlcv_dataframe(
    data: DataFrame, timeframe: str, pair: str, *, fill_missing: bool, drop_incomplete: bool
) -> DataFrame:
    """
    Cleanse a OHLCV dataframe by
      * Grouping it by date (removes duplicate tics)
      * dropping last candles if requested
      * Filling up missing data (if requested)
    :param data: DataFrame containing candle (OHLCV) data.
    :param timeframe: timeframe (e.g. 5m). Used to fill up eventual missing data
    :param pair: Pair this data is for (used to warn if fillup was necessary)
    :param fill_missing: fill up missing candles with 0-volume candles
                         (see ohlcv_fill_up_missing_data for details)
    :param drop_incomplete: Drop the last candle of the dataframe, assuming it's incomplete
    :return: DataFrame
    """
    # Group by date and aggregate to eliminate duplicate ticks. The result is a new
    # frame sorted by date, so the caller's dataframe is not modified below.
    data = data.groupby(by="date", as_index=False, sort=True).agg(
        {
            "open": "first",
            "high": "max",
            "low": "min",
            "close": "last",
            "volume": "max",
        }
    )

    # Eliminate the (potentially partial) most recent candle.
    if drop_incomplete:
        data = data.drop(data.tail(1).index)
        logger.debug("Dropping last candle")

    if fill_missing:
        return ohlcv_fill_up_missing_data(data, timeframe, pair)
    return data


def ohlcv_fill_up_missing_data(dataframe: DataFrame, timeframe: str, pair: str) -> DataFrame:
    """
    Fills up missing data with 0 volume rows,
    using the previous close as price for "open", "high", "low" and "close", volume is set to 0

    :param dataframe: DataFrame containing candle (OHLCV) data with a "date" column
    :param timeframe: timeframe (e.g. 5m), translated into the resample frequency
    :param pair: Pair this data is for (only used in the log message)
    :return: Resampled DataFrame with "date" restored as a regular column
    """
    # Imported inside the function rather than at module level
    # (presumably to avoid a circular import - TODO confirm).
    from freqtrade.exchange import timeframe_to_resample_freq

    ohlcv_dict = {"open": "first", "high": "max", "low": "min", "close": "last", "volume": "sum"}
    resample_interval = timeframe_to_resample_freq(timeframe)

    # Resample to create "NAN" values
    df = dataframe.resample(resample_interval, on="date").agg(ohlcv_dict)

    # Forwardfill close for missing columns
    df["close"] = df["close"].ffill()
    # Use close for "open, high, low"
    df.loc[:, ["open", "high", "low"]] = df[["open", "high", "low"]].fillna(
        value={
            "open": df["close"],
            "high": df["close"],
            "low": df["close"],
        }
    )
    df.reset_index(inplace=True)

    len_before = len(dataframe)
    len_after = len(df)
    if len_before == len_after:
        # Nothing was added, so there is nothing to report.
        return df

    pct_missing = (len_after - len_before) / len_before if len_before > 0 else 0
    message = (
        f"Missing data fillup for {pair}, {timeframe}: "
        f"before: {len_before} - after: {len_after} - {pct_missing:.2%}"
    )
    if pct_missing > 0.01:
        logger.info(message)
    else:
        # Don't be verbose if only a small amount is missing
        logger.debug(message)
    return df


def trim_dataframe(
    df: DataFrame, timerange, *, df_date_col: str = "date", startup_candles: int = 0
) -> DataFrame:
    """
    Trim dataframe based on given timerange
    :param df: Dataframe to trim
    :param timerange: timerange (use start and end date if available).
                      Read attributes: starttype, startdt, stoptype, stopdt
    :param df_date_col: Column in the dataframe to use as Date column
    :param startup_candles: When not 0, is used instead the timerange start date
    :return: trimmed dataframe
    """
    if startup_candles:
        # Trim candles instead of timeframe in case of given startup_candle count
        df = df.iloc[startup_candles:, :]
    elif timerange.starttype == "date":
        df = df.loc[df[df_date_col] >= timerange.startdt, :]

    # The stop date is applied regardless of how the start was trimmed.
    if timerange.stoptype == "date":
        df = df.loc[df[df_date_col] <= timerange.stopdt, :]
    return df


def trim_dataframes(
    preprocessed: Dict[str, DataFrame], timerange, startup_candles: int
) -> Dict[str, DataFrame]:
    """
    Trim startup period from analyzed dataframes
    :param preprocessed: Dict of pair: dataframe
    :param timerange: timerange (use start and end date if available)
    :param startup_candles: Startup-candles that should be removed
    :return: Dict of trimmed dataframes; pairs left without any rows are omitted
    """
    processed: Dict[str, DataFrame] = {}
    for pair, df in preprocessed.items():
        trimed_df = trim_dataframe(df, timerange, startup_candles=startup_candles)
        if trimed_df.empty:
            # Pairs with nothing left are skipped (with a warning) instead of returned empty.
            logger.warning(
                f"{pair} has no data left after adjusting for startup candles, skipping."
            )
            continue
        processed[pair] = trimed_df
    return processed


def order_book_to_dataframe(bids: list, asks: list) -> DataFrame:
    """
    TODO: This should get a dedicated test
    Gets order book list, returns dataframe with below format per suggested by creslin
    -------------------------------------------------------------------
     b_sum       b_size       bids       asks       a_size       a_sum
    -------------------------------------------------------------------
    :param bids: list of two-element rows, loaded as the columns "bids" and "b_size"
    :param asks: list of two-element rows, loaded as the columns "asks" and "a_size"
    :return: DataFrame with both sides next to each other, aligned on row position,
             including a cumulative size column per side (b_sum / a_sum)
    """
    bids_frame = DataFrame(bids, columns=["bids", "b_size"])
    # add cumulative sum column
    bids_frame["b_sum"] = bids_frame["b_size"].cumsum()

    asks_frame = DataFrame(asks, columns=["asks", "a_size"])
    # add cumulative sum column
    asks_frame["a_sum"] = asks_frame["a_size"].cumsum()

    # Column order of the result, mirrored around the bid/ask prices.
    column_order = ["b_sum", "b_size", "bids", "asks", "a_size", "a_sum"]
    columns = [
        bids_frame["b_sum"],
        bids_frame["b_size"],
        bids_frame["bids"],
        asks_frame["asks"],
        asks_frame["a_size"],
        asks_frame["a_sum"],
    ]
    return pd.concat(columns, axis=1, keys=column_order)


def convert_ohlcv_format(
    config: Config,
    convert_from: str,
    convert_to: str,
    erase: bool,
):
    """
    Convert OHLCV from one format to another
    :param config: Config dictionary. Read keys: "datadir" (required), and the optional
                   filters "pairs", "timeframes" and "candle_types"
    :param convert_from: Source format
    :param convert_to: Target format
    :param erase: Erase source data (does not apply if source and target format are identical)
    """
    from freqtrade.data.history import get_datahandler

    src = get_datahandler(config["datadir"], convert_from)
    trg = get_datahandler(config["datadir"], convert_to)

    # Only used for the log line below; the actual timeframe filter is applied further down.
    timeframes = config.get("timeframes", [config.get("timeframe")])
    logger.info(f"Converting candle (OHLCV) for timeframe {timeframes}")

    # Without an explicit "candle_types" setting, every CandleType member is converted.
    candle_types = [
        CandleType.from_string(ct)
        for ct in config.get("candle_types", [c.value for c in CandleType])
    ]
    logger.info(candle_types)

    # Collect what is available for both trading modes as (pair, timeframe, candle_type) entries.
    paircombs = src.ohlcv_get_available_data(config["datadir"], TradingMode.SPOT)
    paircombs.extend(src.ohlcv_get_available_data(config["datadir"], TradingMode.FUTURES))

    if "pairs" in config:
        # Filter pairs
        paircombs = [comb for comb in paircombs if comb[0] in config["pairs"]]

    if "timeframes" in config:
        paircombs = [comb for comb in paircombs if comb[1] in config["timeframes"]]
    paircombs = [comb for comb in paircombs if comb[2] in candle_types]

    paircombs = sorted(paircombs, key=lambda x: (x[0], x[1], x[2].value))

    formatted_paircombs = "\n".join(
        [f"{pair}, {timeframe}, {candle_type}" for pair, timeframe, candle_type in paircombs]
    )

    logger.info(
        f"Converting candle (OHLCV) data for the following pair combinations:\n"
        f"{formatted_paircombs}"
    )
    for pair, timeframe, candle_type in paircombs:
        # Load the data unmodified: no gap filling, no dropped candle, no startup trimming.
        data = src.ohlcv_load(
            pair=pair,
            timeframe=timeframe,
            timerange=None,
            fill_missing=False,
            drop_incomplete=False,
            startup_candles=0,
            candle_type=candle_type,
        )
        logger.info(f"Converting {len(data)} {timeframe} {candle_type} candles for {pair}")
        if len(data) > 0:
            trg.ohlcv_store(pair=pair, timeframe=timeframe, data=data, candle_type=candle_type)
            # Purge only after storing, and never when both formats are the same,
            # as that would delete the data that was just written.
            if erase and convert_from != convert_to:
                logger.info(f"Deleting source data for {pair} / {timeframe}")
                src.ohlcv_purge(pair=pair, timeframe=timeframe, candle_type=candle_type)


def reduce_dataframe_footprint(df: DataFrame) -> DataFrame:
    """
    Downcast float64 columns to float32 and int64 columns to int32 to save memory.
    The columns "open", "high", "low", "close" and "volume" are left untouched,
    as are columns of any other dtype.
    :param df: Dataframe to be converted to float/int 32s
    :return: Dataframe converted to float/int 32s
    """
    logger.debug(f"Memory usage of dataframe is {df.memory_usage().sum() / 1024**2:.2f} MB")

    untouched_columns = ("open", "high", "low", "close", "volume")
    # Collect the target dtypes in a separate mapping instead of modifying the
    # dtypes Series while iterating over it.
    downcast = {}
    for column, dtype in df.dtypes.items():
        if column in untouched_columns:
            continue
        if dtype == np.float64:
            downcast[column] = np.float32
        elif dtype == np.int64:
            # NOTE(review): values outside the int32 range would presumably not survive
            # this conversion - confirm callers only pass columns with small integers.
            downcast[column] = np.int32

    # Columns missing from the mapping keep their current dtype.
    df = df.astype(downcast)

    logger.debug(f"Memory usage after optimization is: {df.memory_usage().sum() / 1024**2:.2f} MB")
    return df