import time
from collections import OrderedDict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
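
# Two-level response cache for LLM calls: an exact-match LRU cache keyed on the
# raw query text, plus an approximate lookup that compares TF-IDF embeddings of
# queries with cosine similarity and reuses answers above a similarity threshold.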

class LRUCache:
    def __init__(self, capacity=1000):
        self.cache = OrderedDict()
        self.capacity = capacity

    def get(self, key):
        if key not in self.cache:
            return None
        self.cache.move_to_end(key)
        return self.cache[key]

    def put(self, key, value):
        if key in self.cache:
            self.cache.move_to_end(key)
        self.cache[key] = value
        if len(self.cache) > self.capacity:
            self.cache.popitem(last=False)
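
# Illustrative LRU behavior (not part of the pipeline below): with capacity=2,
# putting "a" and "b", reading "a", then putting "c" evicts "b", because "a"
# was touched more recently than "b".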

class LLMWithCache:
    def __init__(self, llm_function, cache_capacity=1000, similarity_threshold=0.8):
        self.llm = llm_function
        self.cache = LRUCache(cache_capacity)
        self.queries = []
        self.embeddings = np.empty((0, 0))
        self.similarity_threshold = similarity_threshold

    def embed_query(self, query):
        # Refit the vectorizer on all previously seen queries plus the new one,
        # so every stored embedding lives in the same vocabulary space. This is
        # O(number of queries) per call, which is fine for a demo.
        all_queries = self.queries + [query]
        vectorizer = TfidfVectorizer().fit(all_queries)
        embeddings = vectorizer.transform(all_queries).toarray()
        self.embeddings = embeddings[:-1]
        return embeddings[-1].reshape(1, -1)

    def find_similar_query(self, query_embedding):
        if len(self.queries) == 0:
            return None
        similarities = cosine_similarity(query_embedding, self.embeddings)[0]
        max_idx = np.argmax(similarities)
        if similarities[max_idx] >= self.similarity_threshold:
            return self.queries[max_idx]
        return None
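
    # Note: TF-IDF vectors are non-negative, so the cosine similarity above
    # falls in [0, 1]; a threshold of 0.8 therefore requires substantial word
    # overlap between the two queries.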

    def query(self, query_text):
        # 1. Exact-match cache
        cached_answer = self.cache.get(query_text)
        if cached_answer is not None:
            return cached_answer, True

        # 2. Approximate cache: look for a previously seen query that is
        #    similar enough to reuse its answer
        query_emb = self.embed_query(query_text)
        similar_query = self.find_similar_query(query_emb)
        if similar_query is not None:
            cached_answer = self.cache.get(similar_query)
            if cached_answer is not None:
                # Also store the answer under the new phrasing for faster future access
                self.cache.put(query_text, cached_answer)
                self.queries.append(query_text)
                self.embeddings = np.vstack([self.embeddings, query_emb])
                return cached_answer, True

        # 3. Cache miss: call the LLM and remember the result
        answer = self.llm(query_text)
        self.cache.put(query_text, answer)
        self.queries.append(query_text)
        if self.embeddings.size == 0:
            self.embeddings = query_emb
        else:
            self.embeddings = np.vstack([self.embeddings, query_emb])
        return answer, False
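
# With TF-IDF embeddings, only queries that share vocabulary can match in the
# approximate cache; paraphrases like "What is AI?" and "Define artificial
# intelligence." will not. A production variant might swap embed_query for a
# pretrained sentence-embedding model to catch such paraphrases.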

# Dummy LLM function simulating a slow model call
def dummy_llm(query):
    time.sleep(5)  # simulate a slow response
    return f"Answer to '{query}'"
# Testing the caching system
llm_cache = LLMWithCache(dummy_llm)
queries = ["What is AI?", "Define artificial intelligence.", "What is AI?", "Explain machine learning.", "Explain machine learning."]
results = []
start_time = time.time()
for q in queries:
    answer, from_cache = llm_cache.query(q)
    results.append((q, answer, from_cache))
end_time = time.time()
average_response_time = (end_time - start_time) / len(queries)
cache_hits = sum(1 for _, _, hit in results if hit)
cache_hit_rate = cache_hits / len(queries) * 100
print(f"Average response time: {average_response_time:.2f} seconds")
print(f"Cache hit rate: {cache_hit_rate:.1f}%")
for q, a, hit in results:
    print(f"Query: {q} | From cache: {hit} | Answer: {a}")
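
# Expected outcome with the query list above: the two verbatim repeats hit the
# exact cache, so the hit rate should be 40% and only three dummy_llm calls are
# made (roughly 15 seconds total, ~3 seconds average). "Define artificial
# intelligence." shares no tokens with "What is AI?", so the TF-IDF similarity
# check does not catch that paraphrase.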