
Examples

Complete working scripts for common training scenarios


These examples cover the four most common gym workflows. Each script is self-contained — copy, fill in your API key, and run.
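
All scripts assume the gym package is installed. If it is distributed under its import name, pip install tradeready-gym gymnasium should work, but check your platform's install instructions for the exact package name.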


Example 1 — Random Agent (Sanity Check)

Before training, verify the environment works with a random agent. If this script runs without error, your platform connection and API key are working correctly.

import gymnasium as gym
import tradeready_gym

env = gym.make(
    "TradeReady-BTC-v0",
    api_key="ak_live_...",
    starting_balance=10000,
    timeframe="1h",
    start_time="2025-01-01T00:00:00Z",
    end_time="2025-02-01T00:00:00Z",
    track_training=False,  # skip tracking for this test
)

obs, info = env.reset()
total_reward = 0.0
steps = 0

while True:
    action = env.action_space.sample()  # random action: 0, 1, or 2
    obs, reward, terminated, truncated, info = env.step(action)
    total_reward += reward
    steps += 1

    if terminated or truncated:
        break

print(f"Steps: {steps}")
print(f"Total reward: {total_reward:.4f}")
print(f"Final equity: ${info['equity']:.2f}")
env.close()

Expected output: the script completes without error and prints a final equity value. A random agent typically lands near break-even or slightly negative; poor performance here is normal.
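
A single random rollout is noisy, so a more useful baseline is the average over several seeded runs. A minimal sketch reusing the same environment settings as above:

env = gym.make(
    "TradeReady-BTC-v0",
    api_key="ak_live_...",
    start_time="2025-01-01T00:00:00Z",
    end_time="2025-02-01T00:00:00Z",
    track_training=False,
)

equities = []
for seed in range(5):
    obs, info = env.reset(seed=seed)
    env.action_space.seed(seed)  # reproducible but distinct action sequence per rollout
    while True:
        obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
        if terminated or truncated:
            break
    equities.append(info["equity"])

env.close()
print(f"Random baseline equity (mean of 5 runs): ${sum(equities) / len(equities):.2f}")

Any trained agent should comfortably beat this number on the same data range.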


Example 2 — PPO Training with Stable-Baselines3

Train a PPO agent on BTC/USDT with a Sharpe-based reward. This is the recommended starting point for RL training.

import gymnasium as gym
import tradeready_gym
from tradeready_gym.rewards import SharpeReward
from tradeready_gym.wrappers import NormalizationWrapper
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback
import os

API_KEY = "ak_live_..."

# Training environment
train_env = gym.make(
    "TradeReady-BTC-Continuous-v0",
    api_key=API_KEY,
    starting_balance=10000,
    timeframe="1h",
    lookback_window=50,
    observation_features=[
        "ohlcv", "rsi_14", "macd", "bollinger",
        "adx", "atr", "balance", "position",
    ],
    reward_function=SharpeReward(window=50),
    start_time="2025-01-01T00:00:00Z",
    end_time="2025-06-01T00:00:00Z",
    track_training=True,
    strategy_label="ppo_sharpe_v1",
)
train_env = NormalizationWrapper(train_env)

# Train
model = PPO(
    "MlpPolicy",
    train_env,
    verbose=1,
    learning_rate=3e-4,
    n_steps=2048,
    batch_size=64,
    n_epochs=10,
    gamma=0.99,
    gae_lambda=0.95,
)
model.learn(total_timesteps=200_000)
model.save("ppo_btc_sharpe_v1")
train_env.close()

print("Training complete. Check /training in the dashboard for learning curves.")

Install Stable-Baselines3 with: pip install stable-baselines3 torch
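
The EvalCallback imported above can be used to periodically run the current policy on a separate validation environment and keep the best checkpoint. A minimal sketch (the one-month validation window after the training range is an arbitrary choice); pass the callback to model.learn in place of the bare call above:

eval_env = NormalizationWrapper(gym.make(
    "TradeReady-BTC-Continuous-v0",
    api_key=API_KEY,
    timeframe="1h",
    lookback_window=50,
    observation_features=[
        "ohlcv", "rsi_14", "macd", "bollinger",
        "adx", "atr", "balance", "position",
    ],
    start_time="2025-06-01T00:00:00Z",
    end_time="2025-07-01T00:00:00Z",
    track_training=False,
))

eval_callback = EvalCallback(
    eval_env,
    best_model_save_path="./checkpoints",  # best-performing checkpoint saved here
    eval_freq=10_000,                      # evaluate every 10k training steps
    n_eval_episodes=3,
    deterministic=True,
)
model.learn(total_timesteps=200_000, callback=eval_callback)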

Evaluating the Trained Model

After training, evaluate on a held-out period:

eval_env = gym.make(
    "TradeReady-BTC-Continuous-v0",
    api_key=API_KEY,
    timeframe="1h",
    lookback_window=50,
    observation_features=[
        "ohlcv", "rsi_14", "macd", "bollinger",
        "adx", "atr", "balance", "position",
    ],  # must match the training env so observation shapes line up
    start_time="2025-07-01T00:00:00Z",
    end_time="2025-09-01T00:00:00Z",
    track_training=False,
)
eval_env = NormalizationWrapper(eval_env)

model = PPO.load("ppo_btc_sharpe_v1")
obs, info = eval_env.reset()

while True:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = eval_env.step(action)
    if terminated or truncated:
        break

print(f"Eval ROI: {info.get('roi_pct', 0):.2f}%")
eval_env.close()
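
If you want more than a single ROI number, record the equity reported in info at each step and derive metrics locally. A variant of the evaluation loop above (run it in place of that loop, before closing the env) that computes maximum drawdown from the equity curve:

obs, info = eval_env.reset()
equity_curve = []

while True:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = eval_env.step(action)
    equity_curve.append(info["equity"])
    if terminated or truncated:
        break

# Max drawdown: largest peak-to-trough drop in the recorded curve
peak = equity_curve[0]
max_dd = 0.0
for eq in equity_curve:
    peak = max(peak, eq)
    max_dd = max(max_dd, (peak - eq) / peak)

print(f"Eval ROI: {info.get('roi_pct', 0):.2f}%")
print(f"Max drawdown: {max_dd:.2%}")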

Example 3 — Custom Reward Function

Build a reward that combines log returns with a drawdown penalty. This encourages consistent gains while limiting exposure to large losses.

import gymnasium as gym
import tradeready_gym
from tradeready_gym.rewards import CustomReward
from tradeready_gym.wrappers import NormalizationWrapper
from stable_baselines3 import PPO
import math

class LogReturnWithDrawdownPenalty(CustomReward):
    """
    Reward = log return - penalty * current drawdown fraction.
    More stable gradients than raw PnL, with capital preservation.
    """
    def __init__(self, penalty: float = 0.3):
        self.penalty = penalty
        self._peak = 0.0

    def compute(self, prev_equity: float, curr_equity: float, info: dict) -> float:
        # Log return for stable gradients
        if prev_equity <= 0 or curr_equity <= 0:
            return 0.0
        log_return = math.log(curr_equity / prev_equity)

        # Drawdown penalty
        self._peak = max(self._peak, curr_equity)
        drawdown = (self._peak - curr_equity) / self._peak if self._peak > 0 else 0.0

        return log_return - self.penalty * drawdown

    def reset(self) -> None:
        self._peak = 0.0


env = gym.make(
    "TradeReady-BTC-Continuous-v0",
    api_key="ak_live_...",
    reward_function=LogReturnWithDrawdownPenalty(penalty=0.3),
    start_time="2025-01-01T00:00:00Z",
    end_time="2025-06-01T00:00:00Z",
    track_training=True,
    strategy_label="log_drawdown_v1",
)
env = NormalizationWrapper(env)

model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=100_000)
model.save("ppo_log_drawdown")
env.close()
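
Because compute works on plain floats, a custom reward can be sanity-checked in isolation before committing to a long training run. A quick check that the drawdown penalty fires as intended:

r = LogReturnWithDrawdownPenalty(penalty=0.3)
r.reset()

# Equity rises 10000 -> 11000: positive log return, no drawdown yet
print(r.compute(10_000, 11_000, {}))  # ~ +0.0953

# Equity falls to 10450: drawdown = 550 / 11000 = 0.05,
# so reward ~ log(10450/11000) - 0.3 * 0.05 = -0.0513 - 0.0150 = -0.0663
print(r.compute(11_000, 10_450, {}))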

Example 4 — Portfolio Allocation

Train an agent to allocate across BTC, ETH, and SOL simultaneously. The agent outputs one target weight per asset, and the environment constrains the combined allocation so the weights sum to at most 1.0 (it never deploys more than 100% of equity).

import gymnasium as gym
import tradeready_gym
from tradeready_gym.rewards import SortinoReward
from tradeready_gym.wrappers import NormalizationWrapper
from stable_baselines3 import PPO

env = gym.make(
    "TradeReady-Portfolio-v0",
    api_key="ak_live_...",
    pairs=["BTCUSDT", "ETHUSDT", "SOLUSDT"],
    starting_balance=50000,
    timeframe="1h",
    lookback_window=30,
    observation_features=["ohlcv", "rsi_14", "macd", "balance", "position"],
    reward_function=SortinoReward(window=30),
    start_time="2025-01-01T00:00:00Z",
    end_time="2025-06-01T00:00:00Z",
    track_training=True,
    strategy_label="portfolio_sortino_v1",
)
env = NormalizationWrapper(env)

# action_space = Box(0.0, 1.0, shape=(3,))
# [BTC_weight, ETH_weight, SOL_weight]
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=150_000)
model.save("ppo_portfolio_v1")
env.close()

# Quick test of the saved model
test_env = gym.make(
    "TradeReady-Portfolio-v0",
    api_key="ak_live_...",
    pairs=["BTCUSDT", "ETHUSDT", "SOLUSDT"],
    timeframe="1h",
    lookback_window=30,
    observation_features=["ohlcv", "rsi_14", "macd", "balance", "position"],  # match training
    start_time="2025-07-01T00:00:00Z",
    end_time="2025-09-01T00:00:00Z",
    track_training=False,
)
test_env = NormalizationWrapper(test_env)
model = PPO.load("ppo_portfolio_v1")

obs, info = test_env.reset()
while True:
    action, _ = model.predict(obs, deterministic=True)
    obs, _, terminated, truncated, info = test_env.step(action)
    if terminated or truncated:
        break

print(f"Test equity: ${info['equity']:.2f}")
test_env.close()
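
Each action is the weight vector described in the comment above, so the policy can be queried directly to see its target allocation at any observation. For example, re-querying it at the final observation of the test run:

# Target allocation at the last observation, one weight per pair
weights, _ = model.predict(obs, deterministic=True)
for pair, w in zip(["BTCUSDT", "ETHUSDT", "SOLUSDT"], weights):
    print(f"{pair}: target weight {w:.1%}")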

Full Strategy Improvement Workflow

This workflow combines rule-based strategy testing with RL training: test a hand-written strategy first, then train an RL agent on the same period and compare the two.

from agentexchange import AgentExchangeClient
import gymnasium as gym
import tradeready_gym
from tradeready_gym.rewards import SharpeReward
from tradeready_gym.wrappers import NormalizationWrapper
from stable_baselines3 import PPO
import time

client = AgentExchangeClient(api_key="ak_live_...")

# Step 1: Create and test a rule-based strategy
strategy = client.create_strategy(
    name="My BTC Strategy",
    definition={
        "pairs": ["BTCUSDT"],
        "timeframe": "1h",
        "entry_conditions": {"rsi_below": 30, "macd_cross_above": True, "adx_above": 25},
        "exit_conditions": {"stop_loss_pct": 3, "take_profit_pct": 8, "trailing_stop_pct": 2},
        "position_size_pct": 10,
        "max_positions": 2
    }
)
sid = strategy["strategy_id"]

test = client.run_test(sid, version=1, episodes=20,
    date_range={"start": "2025-01-01", "end": "2025-07-01"})

while True:
    status = client.get_test_status(sid, test["test_run_id"])
    if status["status"] in ("completed", "failed"):
        break
    time.sleep(5)

if status["status"] == "failed":
    raise RuntimeError("Test run failed; check the dashboard for details.")

results = client.get_test_results(sid, test["test_run_id"])
print(f"Rule-based avg ROI: {results['results']['avg_roi_pct']}%")
print(f"Recommendations: {results['recommendations']}")

# Step 2: Train an RL agent on the same period
env = gym.make(
    "TradeReady-BTC-Continuous-v0",
    api_key="ak_live_...",
    reward_function=SharpeReward(window=50),
    start_time="2025-01-01T00:00:00Z",
    end_time="2025-07-01T00:00:00Z",
    track_training=True,
    strategy_label="rl_vs_rule_based",
)
env = NormalizationWrapper(env)

model = PPO("MlpPolicy", env, verbose=0)
model.learn(total_timesteps=100_000)
model.save("ppo_btc")
env.close()

print("Compare rule-based vs RL in /training dashboard")
