import time
import random
# Simulate AI agent response with latency and cost tracking
class Agent:
    """Simulated AI agent that reports a latency and a cost for every request.

    Answers are cached per query. The first time a query is seen it is
    charged the full latency and cost; repeats are served from the cache at
    the cache-hit latency and at zero cost.
    """

    def __init__(self, base_latency_ms=1200, cost_per_request=0.005, *,
                 cache_latency_ms=50):
        """Configure the simulated agent.

        Args:
            base_latency_ms: Simulated latency, in milliseconds, of a request
                that is not in the cache.
            cost_per_request: Cost charged for a request that is not in the
                cache.
            cache_latency_ms: Simulated latency, in milliseconds, of a request
                served from the cache.
        """
        self.base_latency_ms = base_latency_ms
        self.cost_per_request = cost_per_request
        self.cache_latency_ms = cache_latency_ms
        # Maps each query seen so far to the response generated for it.
        self.cache = {}

    def respond(self, query):
        """Answer ``query``, sleeping for the simulated latency.

        Args:
            query: The query to answer; it is used as a dictionary key, so it
                must be hashable.

        Returns:
            A ``(response, latency, cost)`` tuple, where ``latency`` is in
            milliseconds and ``cost`` is ``0`` for a cache hit.
        """
        if query in self.cache:
            # Cached responses are faster and free.
            latency = self.cache_latency_ms
            cost = 0
            response = self.cache[query]
        else:
            latency = self.base_latency_ms
            cost = self.cost_per_request
            response = f"Answer to '{query}'"
            self.cache[query] = response
        # time.sleep() takes seconds; latencies are tracked in milliseconds.
        time.sleep(latency / 1000)
        return response, latency, cost
# Benchmark function
def benchmark(agent, queries):
    """Run every query through ``agent`` and summarise latency and cost.

    Args:
        agent: Object whose ``respond(query)`` method returns a
            ``(response, latency, cost)`` tuple.
        queries: Iterable of queries. It is consumed exactly once, so a
            generator or other one-shot iterator is accepted.

    Returns:
        A ``(avg_latency, cost_per_1000)`` tuple: the mean latency per query
        (in whatever unit the agent reports) and the total cost scaled to
        1000 queries. An empty workload yields ``(0.0, 0.0)``.
    """
    total_latency = 0
    total_cost = 0
    # Count while iterating instead of calling len(), so iterables without
    # a length also work.
    query_count = 0
    for query in queries:
        _, latency, cost = agent.respond(query)
        total_latency += latency
        total_cost += cost
        query_count += 1
    if query_count == 0:
        # Nothing was measured; avoid dividing by zero.
        return 0.0, 0.0
    avg_latency = total_latency / query_count
    cost_per_1000 = total_cost * (1000 / query_count)
    return avg_latency, cost_per_1000
# Baseline: an agent built with the default latency and cost settings.
original_agent = Agent()
# Workload of 1000 queries with only three distinct strings, so every
# request after the first occurrence of each string is a cache hit.
queries = 250 * [
    "What is AI?",
    "Define machine learning.",
    "What is AI?",
    "Explain latency.",
]
# Measure the baseline agent on that workload.
orig_latency, orig_cost = benchmark(agent=original_agent, queries=queries)
# Optimized agent: simulated batching gives uncached requests a lower latency and cost; caching works as in Agent
class OptimizedAgent(Agent):
    """Agent whose uncached requests are faster and cheaper via simulated batching.

    Queries are answered in batches. Within a batch, every cache miss is
    charged the batch latency and batch cost, and every cache hit is charged
    the cache-hit latency at zero cost. A single ``respond`` call is handled
    as a batch of one.
    """

    def __init__(self, base_latency_ms=1200, cost_per_request=0.005, *,
                 cache_latency_ms=50, batch_latency_ms=600,
                 batch_cost_per_request=0.003):
        """Configure the simulated batching agent.

        Args:
            base_latency_ms: Passed through to ``Agent``.
            cost_per_request: Passed through to ``Agent``.
            cache_latency_ms: Simulated latency, in milliseconds, of a query
                served from the cache.
            batch_latency_ms: Simulated latency, in milliseconds, of a query
                that is not in the cache.
            batch_cost_per_request: Cost charged for a query that is not in
                the cache.
        """
        super().__init__(base_latency_ms, cost_per_request)
        self.cache_latency_ms = cache_latency_ms
        self.batch_latency_ms = batch_latency_ms
        self.batch_cost_per_request = batch_cost_per_request

    def respond_batch(self, batch_queries):
        """Answer every query in ``batch_queries`` with one simulated wait.

        Args:
            batch_queries: Iterable of hashable queries. A query repeated
                within the batch is a cache hit from its second occurrence on.

        Returns:
            A ``(responses, batch_latency, batch_cost)`` tuple: the responses
            in input order, the summed latency in milliseconds, and the
            summed cost.
        """
        responses = []
        batch_latency = 0
        batch_cost = 0
        for query in batch_queries:
            if query in self.cache:
                latency = self.cache_latency_ms
                cost = 0
                response = self.cache[query]
            else:
                latency = self.batch_latency_ms
                cost = self.batch_cost_per_request
                response = f"Answer to '{query}'"
                self.cache[query] = response
            batch_latency += latency
            batch_cost += cost
            responses.append(response)
        # One sleep for the whole batch; time.sleep() takes seconds.
        time.sleep(batch_latency / 1000)
        return responses, batch_latency, batch_cost

    def respond(self, query):
        """Answer a single ``query`` by running it as a batch of one.

        Returns:
            A ``(response, latency, cost)`` tuple, the same shape as
            ``Agent.respond``.
        """
        responses, latency, cost = self.respond_batch([query])
        return responses[0], latency, cost
# Run the same workload through the optimized agent.
optimized_agent = OptimizedAgent()
opt_latency, opt_cost = benchmark(agent=optimized_agent, queries=queries)

# Report both runs for comparison.
print(f"Original avg latency: {orig_latency} ms, cost per 1000: ${orig_cost:.2f}")
print(f"Optimized avg latency: {opt_latency} ms, cost per 1000: ${opt_cost:.2f}")