Source code for tensortrade.pipeline.transformers.catboost_feature_importance
# Copyright 2024 The TensorTrade-NG Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import typing
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from tensortrade.pipeline.transformers.abstract import AbstractTransformer
if typing.TYPE_CHECKING:
from typing import Optional
from pandas import DataFrame
class CatBoostFeatureImportanceTransformer(AbstractTransformer):
"""Transformer for selecting top features based on feature importance with a target variable, calculated by
CatBoostRegressor.
:param num_features: The number of top features to select. (Default = 20)
:type num_features: int
:param seed: The seed used for the feature importance score regression. (Default = 42)
:type seed: int
    :param iterations: CatBoostRegressor iterations. Should be at least 5 to 10 times the number of features. (Default = 1000)
:type iterations: int
    :param target_column: The name of the target column against which the feature importance should be calculated. (Default = 'close')
:type target_column: str
:param target_shift: The number of periods to shift the target column to create the prediction target. (Default = 3)
:type target_shift: int
:param task_type: The type of the CatBoostRegressor task, can be CPU or GPU. (Default = 'CPU')
:type task_type: str
    :param learning_rate: Learning rate used for the CatBoostRegressor. If None, it is chosen dynamically by CatBoost. (Default = None)
    :type learning_rate: float, optional
:param max_depth: Max depth used for the CatBoostRegressor. (Default = 8)
:type max_depth: int
"""
def __init__(self,
num_features: int = 20,
seed: int = 42,
*,
iterations: int = 1000,
target_column: str = 'close',
target_shift: int = 3,
task_type: str = 'CPU',
learning_rate: Optional[float] = None,
max_depth: int = 8):
self.num_features = num_features
self.seed = seed
self.iterations = iterations
self.target_column = target_column
self.target_shift = target_shift
self.task_type = task_type
self.learning_rate = learning_rate
self.max_depth = max_depth
def transform(self, df: DataFrame) -> DataFrame:
"""Transforms the input DataFrame by selecting the top features based on feature importance score with the
target variable, calculated with CatBoostRegressor
:param df: The input DataFrame containing the features and target column.
:type df: DataFrame
:return: A DataFrame reduced to the top features based on feature importance scores.
:rtype: DataFrame
"""
        # Create a working copy with a prediction target: the target column shifted target_shift periods into the future
test_df = df.copy()
test_df['target_predict'] = test_df[self.target_column].shift(-self.target_shift)
test_df.dropna(inplace=True)
# Create X and y for training
X = test_df.drop(columns=['target_predict'])
y = test_df['target_predict']
        # Split data chronologically into train and eval sets
X_train, X_test, y_train, y_test = train_test_split(
X,
y,
test_size=0.33,
shuffle=False,
random_state=self.seed
)
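        # Note: with shuffle=False the split above is purely chronological, so random_state has no effect here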
# Create catboost regressor
model = CatBoostRegressor(iterations=self.iterations,
learning_rate=self.learning_rate,
task_type=self.task_type,
max_depth=self.max_depth,
random_seed=self.seed,
loss_function='RMSE',
verbose=0)
# Create train and eval pools
train_pool = Pool(X_train, y_train)
eval_pool = Pool(X_test, y_test)
# Train model
model.fit(train_pool, eval_set=eval_pool)
# Get feature importance score
feature_importances = model.get_feature_importance(train_pool)
# create series with importance scores
importance_series = pd.Series(feature_importances, index=X_train.columns)
importance_series = importance_series.sort_values(ascending=False)
        # Get the names of the top features as a mutable list
        top_features = importance_series.head(self.num_features).index.tolist()
        # Ensure self.target_column is always kept; if it is missing, add it and drop the
        # lowest-ranked feature so the number of selected columns stays the same
        if self.target_column not in top_features:
            top_features.insert(0, self.target_column)
            top_features.pop()
# return only features in top feature list
return df[top_features]
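

# ---------------------------------------------------------------------------
# Illustrative usage sketch, assuming a pandas DataFrame with hypothetical
# feature columns ('close', 'volume', 'rsi', 'macd'); it shows one way the
# transformer could be applied to keep only the most important features.
# The synthetic data and parameter choices below are examples, not defaults.
if __name__ == '__main__':
    import numpy as np

    rng = np.random.default_rng(42)
    sample_df = pd.DataFrame({
        'close': 100.0 + rng.normal(0.0, 1.0, 500).cumsum(),  # hypothetical price series
        'volume': rng.integers(1_000, 10_000, 500),
        'rsi': rng.uniform(0.0, 100.0, 500),
        'macd': rng.normal(0.0, 1.0, 500),
    })

    transformer = CatBoostFeatureImportanceTransformer(
        num_features=2,
        iterations=200,           # reduced from the default 1000 to keep the example fast
        target_column='close',
        target_shift=3,
    )
    reduced_df = transformer.transform(sample_df)

    # The result keeps num_features columns and always retains the target column
    print(reduced_df.columns.tolist())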