import random
class AIAgent:
    """Simulated AI agent emitting a boolean decision with a confidence score."""

    def __init__(self, confidence_threshold=0.7):
        # Decisions whose confidence falls below this value are meant to be
        # escalated to a human reviewer by the surrounding workflow.
        self.confidence_threshold = confidence_threshold

    def make_decision(self, data):
        """Return ``(decision, confidence)`` for *data*.

        Simulates a 90%-accurate model with calibrated confidence: correct
        (True) answers carry high confidence in [0.75, 1.0], wrong (False)
        answers carry low confidence in [0.0, 0.6]. *data* is unused in the
        simulation.
        """
        hit = random.random() < 0.9
        if hit:
            return True, random.uniform(0.75, 1.0)
        return False, random.uniform(0.0, 0.6)
class HumanApprovalWorkflow:
    """Routes low-confidence AI decisions to a human approver."""

    def __init__(self, agent, approval_function):
        self.agent = agent
        # Callable (data, ai_decision) -> bool; True means the human
        # endorses the AI's decision, False means it should be flipped.
        self.approval_function = approval_function

    def process_task(self, data):
        """Run one task through the agent, escalating if under-confident.

        Returns a 4-tuple:
            (final_decision, human_reviewed, flagged, original_ai_decision)
        where ``human_reviewed`` is True exactly when the task was flagged
        for review.
        """
        decision, confidence = self.agent.make_decision(data)
        needs_review = confidence < self.agent.confidence_threshold

        if not needs_review:
            # Confident enough: act autonomously on the AI's decision.
            return decision, False, needs_review, decision

        # Escalated: an approval keeps the AI decision, a rejection flips it.
        if self.approval_function(data, decision):
            final = decision
        else:
            final = not decision
        return final, True, needs_review, decision
# Simulated human reviewer: the simulation's ground truth is always True,
# so the reviewer approves exactly when the AI also answered True.
def human_approval(data, ai_decision):
    """Return True iff *ai_decision* matches the simulated ground truth."""
    expected = True  # constant ground truth of the simulation
    return ai_decision == expected
# Comprehensive evaluation of the human-in-the-loop workflow.
def evaluate_workflow(workflow, num_tasks=1000, ground_truth=True):
    """Run *num_tasks* simulated tasks through *workflow* and compute metrics.

    Args:
        workflow: object exposing ``process_task(data)`` returning
            ``(final_decision, human_reviewed, flagged, original_ai_decision)``.
        num_tasks: number of simulated tasks to run.
        ground_truth: the correct decision for every simulated task.
            Default True matches the original hard-coded simulation; the
            parameter lets callers evaluate against a different constant.

    Returns:
        dict of percentage metrics: ``autonomous_accuracy``,
        ``flagged_accuracy``, ``flagged_ratio``, ``autonomy_ratio``,
        ``overall_accuracy``, ``error_recall`` (share of originally-wrong
        AI decisions that got flagged for review), and
        ``autonomous_error_rate``. Ratios with an empty denominator are 0.
    """
    autonomous_correct = autonomous_total = 0
    flagged_correct = flagged_total = 0
    wrong_original_total = 0
    flagged_wrong_original = 0
    autonomous_wrong_original = 0
    overall_correct = 0

    for _ in range(num_tasks):
        # The simulation carries no real payload, so data is None.
        final_dec, human_approved, flagged, orig_dec = workflow.process_task(None)

        # Track where the raw AI decision was wrong, split by routing,
        # to measure how well low confidence predicts errors.
        if orig_dec != ground_truth:
            wrong_original_total += 1
            if flagged:
                flagged_wrong_original += 1
            else:
                autonomous_wrong_original += 1

        # Accuracy of the final (possibly human-corrected) decision.
        correct = final_dec == ground_truth  # bool; += treats it as 0/1
        if flagged:
            flagged_total += 1
            flagged_correct += correct
        else:
            autonomous_total += 1
            autonomous_correct += correct
        overall_correct += correct

    def pct(numerator, denominator):
        # Guarded percentage: 0 when the denominator is empty.
        return (numerator / denominator * 100) if denominator else 0

    return {
        'autonomous_accuracy': pct(autonomous_correct, autonomous_total),
        'flagged_accuracy': pct(flagged_correct, flagged_total),
        'flagged_ratio': flagged_total / num_tasks * 100,
        'autonomy_ratio': autonomous_total / num_tasks * 100,
        'overall_accuracy': overall_correct / num_tasks * 100,
        'error_recall': pct(flagged_wrong_original, wrong_original_total),
        'autonomous_error_rate': pct(autonomous_wrong_original, autonomous_total),
    }
# Setup and run the simulation.
def main():
    """Build the agent/workflow pair, evaluate it, and print the metrics."""
    agent = AIAgent(confidence_threshold=0.7)
    workflow = HumanApprovalWorkflow(agent, human_approval)
    metrics = evaluate_workflow(workflow)
    print(metrics)


# Guard the script entry so importing this module has no side effects.
if __name__ == "__main__":
    main()