import random
from collections import deque

import numpy as np


class SelfImprovingAgent:
    """Tabular Q-learning agent with an experience-replay buffer."""

    def __init__(self, action_space, state_space, learning_rate=0.1, discount_factor=0.95):
        self.action_space = action_space
        self.state_space = state_space
        self.lr = learning_rate
        self.gamma = discount_factor
        # One Q-value per (state, action) pair, initialized to zero.
        self.q_table = np.zeros((state_space, action_space))
        # Replay buffer; the oldest transitions are evicted past 10,000 entries.
        self.experience_memory = deque(maxlen=10000)
    def choose_action(self, state, epsilon=0.1):
        # Epsilon-greedy: explore with probability epsilon, otherwise exploit
        # the current Q-value estimates.
        if random.random() < epsilon:
            return random.randint(0, self.action_space - 1)
        return np.argmax(self.q_table[state])
    def learn(self, state, action, reward, next_state):
        # One-step Q-learning update:
        #   Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
        # No terminal masking is needed here: terminal states are never
        # updated as current states, so their Q-values stay zero.
        predict = self.q_table[state, action]
        target = reward + self.gamma * np.max(self.q_table[next_state])
        self.q_table[state, action] += self.lr * (target - predict)
        # Store the transition so self_improve() can replay it later.
        self.experience_memory.append((state, action, reward, next_state))
    def self_improve(self, batch_size=32):
        # Experience replay: re-apply the Q-update to a random batch of past
        # transitions, extracting extra learning from stored experience.
        if len(self.experience_memory) < batch_size:
            return
        batch = random.sample(self.experience_memory, batch_size)
        for state, action, reward, next_state in batch:
            predict = self.q_table[state, action]
            target = reward + self.gamma * np.max(self.q_table[next_state])
            self.q_table[state, action] += self.lr * (target - predict)
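    # A small helper sketch (an illustrative addition, not used by the
    # training loop itself): read off the greedy policy that the Q-table
    # currently encodes, one action per state.
    def greedy_policy(self):
        return np.argmax(self.q_table, axis=1)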

# Simulated environment for demonstration
class SimpleEnv:
    """A 10-state corridor: action 1 steps forward (+10), action 0 stalls (-1)."""

    def __init__(self):
        self.state_space = 10
        self.action_space = 2
        self.state = 0

    def reset(self):
        self.state = 0
        return self.state

    def step(self, action):
        # Simple rule: action 1 moves the state forward, action 0 stays put.
        if action == 1 and self.state < self.state_space - 1:
            self.state += 1
            reward = 10
        else:
            reward = -1
        done = self.state == self.state_space - 1
        return self.state, reward, done
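
# A quick, illustrative sanity check of the dynamics defined above (not part
# of training): one forward step earns +10, stalling costs -1.
_env = SimpleEnv()
_env.reset()
assert _env.step(1) == (1, 10, False)  # moved to state 1, not yet done
assert _env.step(0) == (1, -1, False)  # stayed in state 1, small penalty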

# Training loop
agent = SelfImprovingAgent(action_space=2, state_space=10)
env = SimpleEnv()
episodes = 100
total_rewards = []

for episode in range(episodes):
    state = env.reset()
    done = False
    total_reward = 0
    # Exploration decays across episodes, from 0.2 down to a floor of 0.01;
    # it is constant within an episode, so compute it once here.
    epsilon = max(0.01, 0.2 * (0.995 ** episode))
    while not done:
        action = agent.choose_action(state, epsilon)
        next_state, reward, done = env.step(action)
        agent.learn(state, action, reward, next_state)
        state = next_state
        total_reward += reward
    total_rewards.append(total_reward)
    # After each episode, replay a batch of stored transitions.
    agent.self_improve(batch_size=32)

avg_train_reward = np.mean(total_rewards[-20:])
print(f"Recent average train reward (last 20 episodes): {avg_train_reward:.2f}")

# Multi-episode evaluation (greedy policy, no exploration)
num_eval = 50
max_steps = 100  # guard: a policy that stalls forever would otherwise loop
total_correct = 0
total_steps = 0
eval_rewards = []

for _ in range(num_eval):
    state = env.reset()
    done = False
    corr = 0
    steps = 0
    ep_r = 0
    while not done and steps < max_steps:
        action = agent.choose_action(state, epsilon=0)
        next_state, reward, done = env.step(action)
        if action == 1:
            corr += 1  # "correct" here means the forward action
        ep_r += reward
        steps += 1
        state = next_state
    total_correct += corr
    total_steps += steps
    eval_rewards.append(ep_r)

accuracy = (total_correct / total_steps * 100) if total_steps > 0 else 0
avg_eval_reward = np.mean(eval_rewards)
print(f"Agent accuracy: {accuracy:.2f}%")
print(f"Average eval reward per episode: {avg_eval_reward:.2f}")