Examples
Complete working scripts for common training scenarios
These examples cover the four most common gym workflows. Each script is self-contained — copy, fill in your API key, and run.
Example 1 — Random Agent (Sanity Check)
Before training, verify the environment works with a random agent. If this script runs without error, your platform connection and API key are working correctly.
import gymnasium as gym
import tradeready_gym
env = gym.make(
    "TradeReady-BTC-v0",
    api_key="ak_live_...",
    starting_balance=10000,
    timeframe="1h",
    start_time="2025-01-01T00:00:00Z",
    end_time="2025-02-01T00:00:00Z",
    track_training=False,  # skip tracking for this test
)
obs, info = env.reset()
total_reward = 0.0
steps = 0
while True:
    action = env.action_space.sample()  # random action: 0, 1, or 2
    obs, reward, terminated, truncated, info = env.step(action)
    total_reward += reward
    steps += 1
    if terminated or truncated:
        break
print(f"Steps: {steps}")
print(f"Total reward: {total_reward:.4f}")
print(f"Final equity: ${info['equity']:.2f}")
env.close()
Expected output: the script completes without error and prints a final equity value. The random agent will typically perform poorly (near break-even or slightly negative) — that is expected.
Example 2 — PPO Training with Stable-Baselines3
Train a PPO agent on BTC/USDT with a Sharpe-based reward. This is the recommended starting point for RL training.
import gymnasium as gym
import tradeready_gym
from tradeready_gym.rewards import SharpeReward
from tradeready_gym.wrappers import NormalizationWrapper
from stable_baselines3 import PPO
API_KEY = "ak_live_..."
# Training environment
train_env = gym.make(
    "TradeReady-BTC-Continuous-v0",
    api_key=API_KEY,
    starting_balance=10000,
    timeframe="1h",
    lookback_window=50,
    observation_features=[
        "ohlcv", "rsi_14", "macd", "bollinger",
        "adx", "atr", "balance", "position",
    ],
    reward_function=SharpeReward(window=50),
    start_time="2025-01-01T00:00:00Z",
    end_time="2025-06-01T00:00:00Z",
    track_training=True,
    strategy_label="ppo_sharpe_v1",
)
train_env = NormalizationWrapper(train_env)
# Train
model = PPO(
    "MlpPolicy",
    train_env,
    verbose=1,
    learning_rate=3e-4,
    n_steps=2048,
    batch_size=64,
    n_epochs=10,
    gamma=0.99,
    gae_lambda=0.95,
)
model.learn(total_timesteps=200_000)
model.save("ppo_btc_sharpe_v1")
train_env.close()
print("Training complete. Check /training in the dashboard for learning curves.")
Install Stable-Baselines3 with: pip install stable-baselines3 torch
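Stable-Baselines3 can also evaluate the policy periodically during training via its EvalCallback. Below is a minimal sketch, assuming a separate hold-out window; the June dates, callback settings, and save path are illustrative choices, not part of the example above. Pass the callback to model.learn in place of the plain call shown earlier.
from stable_baselines3.common.callbacks import EvalCallback

# Separate hold-out environment for periodic evaluation (illustrative window)
holdout_env = gym.make(
    "TradeReady-BTC-Continuous-v0",
    api_key=API_KEY,
    start_time="2025-06-01T00:00:00Z",
    end_time="2025-07-01T00:00:00Z",
    track_training=False,
)
holdout_env = NormalizationWrapper(holdout_env)

eval_callback = EvalCallback(
    holdout_env,
    eval_freq=10_000,        # evaluate every 10k training steps
    n_eval_episodes=1,       # one pass over the hold-out window
    deterministic=True,
    best_model_save_path="./best_model",  # keeps the best checkpoint seen so far
)
model.learn(total_timesteps=200_000, callback=eval_callback)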
Evaluating the Trained Model
After training, evaluate on a held-out period:
eval_env = gym.make(
    "TradeReady-BTC-Continuous-v0",
    api_key=API_KEY,
    start_time="2025-07-01T00:00:00Z",
    end_time="2025-09-01T00:00:00Z",
    track_training=False,
)
eval_env = NormalizationWrapper(eval_env)
model = PPO.load("ppo_btc_sharpe_v1")
obs, info = eval_env.reset()
while True:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = eval_env.step(action)
    if terminated or truncated:
        break
print(f"Eval ROI: {info.get('roi_pct', 0):.2f}%")
eval_env.close()
Example 3 — Custom Reward Function
Build a reward that combines log returns with a drawdown penalty. This encourages consistent gains while limiting exposure to large losses.
import gymnasium as gym
import tradeready_gym
from tradeready_gym.rewards import CustomReward
from tradeready_gym.wrappers import NormalizationWrapper
from stable_baselines3 import PPO
import math
class LogReturnWithDrawdownPenalty(CustomReward):
    """
    Reward = log return - penalty * current drawdown fraction.
    More stable gradients than raw PnL, with capital preservation.
    """

    def __init__(self, penalty: float = 0.3):
        self.penalty = penalty
        self._peak = 0.0

    def compute(self, prev_equity: float, curr_equity: float, info: dict) -> float:
        # Log return for stable gradients
        if prev_equity <= 0 or curr_equity <= 0:
            return 0.0
        log_return = math.log(curr_equity / prev_equity)
        # Drawdown penalty
        self._peak = max(self._peak, curr_equity)
        drawdown = (self._peak - curr_equity) / self._peak if self._peak > 0 else 0.0
        return log_return - self.penalty * drawdown

    def reset(self) -> None:
        self._peak = 0.0
env = gym.make(
    "TradeReady-BTC-Continuous-v0",
    api_key="ak_live_...",
    reward_function=LogReturnWithDrawdownPenalty(penalty=0.3),
    start_time="2025-01-01T00:00:00Z",
    end_time="2025-06-01T00:00:00Z",
    track_training=True,
    strategy_label="log_drawdown_v1",
)
env = NormalizationWrapper(env)
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=100_000)
model.save("ppo_log_drawdown")
env.close()
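To verify the reward behaves as intended, you can call compute directly with synthetic equity values. The numbers below are arbitrary, not produced by the environment:
# Quick sanity check of the reward logic with made-up equity values
rf = LogReturnWithDrawdownPenalty(penalty=0.3)
rf.reset()
print(rf.compute(10_000, 10_100, {}))  # small gain, no drawdown: ~ +0.010
print(rf.compute(10_100, 9_900, {}))   # loss plus ~2% drawdown penalty: ~ -0.026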
Example 4 — Portfolio Allocation
Train an agent to allocate across BTC, ETH, and SOL simultaneously. The agent outputs three target weights that sum to at most 1.0.
import gymnasium as gym
import tradeready_gym
from tradeready_gym.rewards import SortinoReward
from tradeready_gym.wrappers import NormalizationWrapper
from stable_baselines3 import PPO
env = gym.make(
    "TradeReady-Portfolio-v0",
    api_key="ak_live_...",
    pairs=["BTCUSDT", "ETHUSDT", "SOLUSDT"],
    starting_balance=50000,
    timeframe="1h",
    lookback_window=30,
    observation_features=["ohlcv", "rsi_14", "macd", "balance", "position"],
    reward_function=SortinoReward(window=30),
    start_time="2025-01-01T00:00:00Z",
    end_time="2025-06-01T00:00:00Z",
    track_training=True,
    strategy_label="portfolio_sortino_v1",
)
env = NormalizationWrapper(env)
# action_space = Box(0.0, 1.0, shape=(3,))
# [BTC_weight, ETH_weight, SOL_weight]
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=150_000)
model.save("ppo_portfolio_v1")
env.close()
# Quick test of the saved model
test_env = gym.make(
    "TradeReady-Portfolio-v0",
    api_key="ak_live_...",
    pairs=["BTCUSDT", "ETHUSDT", "SOLUSDT"],
    start_time="2025-07-01T00:00:00Z",
    end_time="2025-09-01T00:00:00Z",
    track_training=False,
)
test_env = NormalizationWrapper(test_env)
model = PPO.load("ppo_portfolio_v1")
obs, info = test_env.reset()
while True:
    action, _ = model.predict(obs, deterministic=True)
    obs, _, terminated, truncated, info = test_env.step(action)
    if terminated or truncated:
        break
print(f"Test equity: ${info['equity']:.2f}")
test_env.close()
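Because the action components follow the order of the pairs list (see the action_space comment above), the last predicted action can be read back as the agent's final target allocation:
# Read the last predicted action back as target portfolio weights
for pair, weight in zip(["BTCUSDT", "ETHUSDT", "SOLUSDT"], action):
    print(f"  {pair}: {weight:.2%} of equity")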
Full Strategy Improvement Workflow
Combining rule-based strategy testing with RL training:
from agentexchange import AgentExchangeClient
import gymnasium as gym
import tradeready_gym
from tradeready_gym.rewards import SharpeReward
from tradeready_gym.wrappers import NormalizationWrapper
from stable_baselines3 import PPO
import time
client = AgentExchangeClient(api_key="ak_live_...")
# Step 1: Create and test a rule-based strategy
strategy = client.create_strategy(
    name="My BTC Strategy",
    definition={
        "pairs": ["BTCUSDT"],
        "timeframe": "1h",
        "entry_conditions": {"rsi_below": 30, "macd_cross_above": True, "adx_above": 25},
        "exit_conditions": {"stop_loss_pct": 3, "take_profit_pct": 8, "trailing_stop_pct": 2},
        "position_size_pct": 10,
        "max_positions": 2,
    },
)
sid = strategy["strategy_id"]
test = client.run_test(sid, version=1, episodes=20,
                       date_range={"start": "2025-01-01", "end": "2025-07-01"})
while True:
    status = client.get_test_status(sid, test["test_run_id"])
    if status["status"] in ("completed", "failed"):
        break
    time.sleep(5)
results = client.get_test_results(sid, test["test_run_id"])
print(f"Rule-based avg ROI: {results['results']['avg_roi_pct']}%")
print(f"Recommendations: {results['recommendations']}")
# Step 2: Train an RL agent on the same period
env = gym.make(
    "TradeReady-BTC-Continuous-v0",
    api_key="ak_live_...",
    reward_function=SharpeReward(window=50),
    start_time="2025-01-01T00:00:00Z",
    end_time="2025-07-01T00:00:00Z",
    track_training=True,
    strategy_label="rl_vs_rule_based",
)
env = NormalizationWrapper(env)
model = PPO("MlpPolicy", env, verbose=0)
model.learn(total_timesteps=100_000)
model.save("ppo_btc")
env.close()
print("Compare rule-based vs RL in /training dashboard")
Next Steps
- Training Tracking — view learning curves in the dashboard
- Strategy Testing — test rule-based strategies for comparison
- Backtesting — manual backtesting for strategy validation