import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
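# Walkthrough: train a baseline classifier, measure its harmful-output rate,
# apply a threshold-based safety filter, then rebalance the training data and
# compare harmful rate, accuracy, and a group-bias score at each stage.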
# Simulated dataset: labels depend on features 0 and 1; a 'harmful' flag
# marks rows where feature 2 exceeds 0.8 (standing in for harmful outputs)
np.random.seed(42) # make the synthetic data reproducible
X = np.random.rand(1000, 10)
y = (X[:, 0] + X[:, 1] > 1).astype(int) # simple linearly separable label
harmful_flags = (X[:, 2] > 0.8).astype(int) # simulate harmful outputs
# Split data
X_train, X_val, y_train, y_val, harmful_train, harmful_val = train_test_split(
X, y, harmful_flags, test_size=0.2, random_state=42
)
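# Passing all three arrays to train_test_split keeps X, y, and the harmful
# flags aligned row-for-row across the train and validation splits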
# Baseline model: logistic regression
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)
# Predict with the baseline model
y_pred = model.predict(X_val)
# Harmful output rate before the safety filter: fraction of validation rows
# where the model emits a positive output on a harmful-flagged input
harmful_output_rate_before = np.logical_and(y_pred == 1, harmful_val == 1).mean()
# Safety filter: block outputs when feature 2 > 0.7 (stricter than the 0.8
# threshold that defines 'harmful', so every harmful row is caught)
safe_mask = X_val[:, 2] <= 0.7
# Apply the filter: blocked rows are forced to the safe output 0
y_pred_safe = np.where(safe_mask, y_pred, 0)
# Harmful output rate after filtering, measured against the harmful flags
# (zero here by construction, since the 0.7 cut covers the 0.8 harmful region)
harmful_output_rate_after = np.logical_and(y_pred_safe == 1, harmful_val == 1).mean()
# Calculate accuracy before and after
accuracy_before = accuracy_score(y_val, y_pred)
accuracy_after = accuracy_score(y_val, y_pred_safe)
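# Forcing blocked rows to 0 can flip correct positive predictions to 0, so
# accuracy_after may dip below accuracy_before: the filter trades accuracy
# for a lower harmful-output rate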
# Bias mitigation: balance the training data across the harmful flag
safe_indices = np.where(harmful_train == 0)[0]
harmful_indices = np.where(harmful_train == 1)[0]
# Downsample the larger group to the size of the smaller one; sampling more
# items than exist with replace=False would raise a ValueError
np.random.seed(42)
n_per_group = min(len(safe_indices), len(harmful_indices))
safe_downsampled = np.random.choice(safe_indices, size=n_per_group, replace=False)
harmful_downsampled = np.random.choice(harmful_indices, size=n_per_group, replace=False)
balanced_indices = np.concatenate([safe_downsampled, harmful_downsampled])
X_train_balanced = X_train[balanced_indices]
y_train_balanced = y_train[balanced_indices]
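# Note: balancing by downsampling shrinks the training set to 2 * n_per_group
# rows, which can itself cost some accuracy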
# Retrain model on balanced data
model_balanced = LogisticRegression(max_iter=200)
model_balanced.fit(X_train_balanced, y_train_balanced)
# Predict with balanced model
y_pred_balanced = model_balanced.predict(X_val)
# Apply safety filter again
y_pred_balanced_safe = np.where(safe_mask, y_pred_balanced, 0)
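# The filter is applied post hoc to model outputs, so the same safe_mask
# composes with any retrained model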
# Metrics after bias mitigation plus the safety filter
harmful_output_rate_final = np.logical_and(y_pred_balanced_safe == 1, harmful_val == 1).mean()
accuracy_final = accuracy_score(y_val, y_pred_balanced_safe)
# Bias score: demographic-parity-style gap in positive-output rate between
# the harmful-flagged and unflagged groups on the validation set
flagged = harmful_val == 1
bias_score_before = abs(y_pred[flagged].mean() - y_pred[~flagged].mean())
bias_score_after = abs(y_pred_balanced_safe[flagged].mean() - y_pred_balanced_safe[~flagged].mean())
print(f"Harmful output rate before: {harmful_output_rate_before:.2f}")
print(f"Harmful output rate after safety filter: {harmful_output_rate_after:.2f}")
print(f"Harmful output rate final: {harmful_output_rate_final:.2f}")
print(f"Accuracy before: {accuracy_before:.2f}")
print(f"Accuracy after safety filter: {accuracy_after:.2f}")
print(f"Accuracy final: {accuracy_final:.2f}")
print(f"Bias score before: {bias_score_before:.2f}")
print(f"Bias score after: {bias_score_after:.2f}")