Overview
This is where the “deep” part of the deep reinforcement learning framework comes in. Learning agents are where the math (read: magic) happens.
At each time step, the agent takes the observation from the environment as input, runs it through its underlying model (a neural network most of the time), and outputs the action to take. For example, the observation might be the previous `open`, `high`, `low`, and `close` prices from the exchange. The learning model would take these values as input and output a value corresponding to the action to take, such as `buy`, `sell`, or `hold`.
It is important to remember the learning model has no intuition of the prices or trades being represented by these values. Rather, the model is simply learning which values to output for specific input values or sequences of input values, to earn the highest reward.
In this example, we will be using the Stable Baselines library to provide learning agents to our trading scheme; however, the TensorTrade framework is compatible with many reinforcement learning libraries such as Tensorforce, Ray’s RLlib, OpenAI’s Baselines, Intel’s Coach, or anything from the TensorFlow line such as TF Agents.
It is possible that custom TensorTrade learning agents will be added to this framework in the future, though it will always be a goal of the framework to be interoperable with as many existing reinforcement learning libraries as possible, since there is so much concurrent growth in the space. But for now, Stable Baselines is simple and powerful enough for our needs.
Ray
The following is an example of how to train a strategy on `ray` using the `PPO` algorithm.
import ray
import numpy as np
from ray import tune
from ray.tune.registry import register_env
import tensortrade.env.default as default
from tensortrade.feed.core import DataFeed, Stream
from tensortrade.oms.instruments import Instrument
from tensortrade.oms.exchanges import Exchange
from tensortrade.oms.services.execution.simulated import execute_order
from tensortrade.oms.wallets import Wallet, Portfolio
# Instruments used by the example environment below: USD funds the cash
# wallet and is the instrument handed to Portfolio, while TTC is the asset
# held in the second wallet.
# NOTE(review): the integer argument is presumably the decimal precision of
# the instrument -- confirm against the Instrument constructor.
USD = Instrument("USD", 2, "U.S. Dollar")
TTC = Instrument("TTC", 8, "TensorTrade Coin")
def create_env(config):
    """Build the example trading environment on a synthetic sine-wave price.

    The price stream is ``50 * sin(3 * x) + 100`` sampled on ``[0, 2*pi)``
    in steps of ``2*pi / 1000``, so it oscillates between 50 and 150. The
    portfolio starts with 100000 USD and 0 TTC on a simulated exchange
    named "bitfinex".

    Args:
        config: Mapping supplied by the caller (RLlib passes ``env_config``
            here). Must contain the key ``"window_size"``, which is
            forwarded to ``default.create``.

    Returns:
        The environment object returned by ``default.create``.

    Raises:
        KeyError: If ``config`` has no ``"window_size"`` entry.
    """
    # Synthetic price series named after the traded pair.
    x = np.arange(0, 2 * np.pi, 2 * np.pi / 1000)
    p = Stream.source(50 * np.sin(3 * x) + 100, dtype="float").rename("USD-TTC")

    # Simulated exchange that quotes the synthetic price stream.
    bitfinex = Exchange("bitfinex", service=execute_order)(p)

    cash = Wallet(bitfinex, 100000 * USD)
    asset = Wallet(bitfinex, 0 * TTC)
    portfolio = Portfolio(USD, [cash, asset])

    # Observation features: raw price, three rolling means and the log return.
    feed = DataFeed([
        p,
        p.rolling(window=10).mean().rename("fast"),
        p.rolling(window=50).mean().rename("medium"),
        p.rolling(window=100).mean().rename("slow"),
        p.log().diff().fillna(0).rename("lr"),
    ])

    # Reach the reward scheme through the imported `default` alias:
    # `import tensortrade.env.default as default` binds only `default`, so
    # the bare name `tensortrade` would raise NameError here.
    reward_scheme = default.rewards.PBR(price=p)

    # The action scheme is attached to the reward scheme so the latter is
    # notified of the actions taken.
    action_scheme = default.actions.BSH(
        cash=cash,
        asset=asset,
    ).attach(reward_scheme)

    env = default.create(
        feed=feed,
        portfolio=portfolio,
        action_scheme=action_scheme,
        reward_scheme=reward_scheme,
        window_size=config["window_size"],
        max_allowed_loss=0.6,
    )
    return env
# Expose the environment factory to RLlib under the name "TradingEnv".
register_env("TradingEnv", create_env)

# Stopping condition handed to Tune.
stop_criteria = {"episode_reward_mean": 500}

# Learning-rate schedule entries (presumably [timestep, learning rate]
# pairs -- confirm against the RLlib documentation).
lr_schedule = [
    [0, 1e-1],
    [100, 1e-2],
    [1_000, 1e-3],
    [10_000, 1e-4],
    [100_000, 1e-5],
    [1_000_000, 1e-6],
    [10_000_000, 1e-7],
]

# Trainer configuration; "env_config" is what create_env receives as `config`.
ppo_config = {
    "env": "TradingEnv",
    "env_config": {"window_size": 25},
    "log_level": "DEBUG",
    "framework": "torch",
    "ignore_worker_failures": True,
    "num_workers": 1,
    "num_gpus": 0,
    "clip_rewards": True,
    "lr": 8e-6,
    "lr_schedule": lr_schedule,
    "gamma": 0,
    "observation_filter": "MeanStdFilter",
    "lambda": 0.72,
    "vf_loss_coeff": 0.5,
    "entropy_coeff": 0.01,
}

# Train with PPO and keep a checkpoint from the end of the run.
analysis = tune.run(
    "PPO",
    stop=stop_criteria,
    config=ppo_config,
    checkpoint_at_end=True,
)
And then to restore the agent just use the following code.
import ray.rllib.agents.ppo as ppo
# Locate the checkpoint of the best trial, ranked by mean episode reward.
best_trial = analysis.get_best_trial("episode_reward_mean")
checkpoints = analysis.get_trial_checkpoints_paths(
    trial=best_trial,
    metric="episode_reward_mean",
)
first_checkpoint = checkpoints[0]
checkpoint_path = first_checkpoint[0]

# Rebuild the trainer with the same settings that were used for training,
# then load the saved state into it.
restore_config = {
    "env_config": {"window_size": 25},
    "framework": "torch",
    "log_level": "DEBUG",
    "ignore_worker_failures": True,
    "num_workers": 1,
    "num_gpus": 0,
    "clip_rewards": True,
    "lr": 8e-6,
    "lr_schedule": [
        [0, 1e-1],
        [100, 1e-2],
        [1_000, 1e-3],
        [10_000, 1e-4],
        [100_000, 1e-5],
        [1_000_000, 1e-6],
        [10_000_000, 1e-7],
    ],
    "gamma": 0,
    "observation_filter": "MeanStdFilter",
    "lambda": 0.72,
    "vf_loss_coeff": 0.5,
    "entropy_coeff": 0.01,
}
agent = ppo.PPOTrainer(env="TradingEnv", config=restore_config)
agent.restore(checkpoint_path)
Stable Baselines
from stable_baselines3 import PPO
# NOTE: `env` is not defined in this snippet; it must already refer to a
# constructed trading environment before this code runs.
model = PPO('MlpPolicy', env, verbose=1)

# Train the PPO model for 10,000 timesteps.
model.learn(total_timesteps=10_000)