import unicodedata
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Sample data with Unicode characters.
# Each entry pairs an accented phrase with its binary class label,
# keeping text and label visibly associated on one line.
_samples = [
    ('Café is nice', 1),
    ('naïve approach', 0),
    ('Pokémon is popular', 1),
    ('façade of the building', 0),
    ('coöperate with others', 1),
    ('smörgåsbord is Swedish', 0),
    ('touché move', 1),
    ('résumé writing', 0),
    ('São Paulo city', 1),
    ('niño playing', 0),
]
texts = [phrase for phrase, _ in _samples]
labels = [label for _, label in _samples]
# Unicode normalization function
def normalize_text(text, form='NFC'):
    """Return *text* normalized to the given Unicode normal form.

    Normalization collapses visually identical strings (e.g. a
    precomposed 'é' vs. 'e' + combining acute) into one canonical
    byte representation, so downstream tokenization treats them as
    the same token.

    Args:
        text: Input string, possibly containing decomposed characters.
        form: Unicode normal form — 'NFC', 'NFD', 'NFKC', or 'NFKD'.
            Defaults to 'NFC' (composed characters), matching the
            original behavior.

    Returns:
        The normalized string.
    """
    return unicodedata.normalize(form, text)
# Apply normalization so visually identical strings share one
# canonical representation before vectorization.
texts_normalized = [normalize_text(t) for t in texts]

# Split data. With only 10 samples and a 50/50 label balance,
# stratifying keeps both classes represented in the 3-sample
# validation set; an unstratified split can leave the validation
# set dominated by one class, making its accuracy meaningless.
X_train, X_val, y_train, y_val = train_test_split(
    texts_normalized,
    labels,
    test_size=0.3,
    random_state=42,
    stratify=labels,
)

# Use CountVectorizer with default tokenizer (Unicode-aware): its
# default token pattern matches word characters including accented
# letters, so tokens like 'café' are kept intact (lowercased).
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

# Train logistic regression on the bag-of-words counts.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

# Predict and evaluate on both splits to expose over/under-fitting.
train_preds = model.predict(X_train_vec)
val_preds = model.predict(X_val_vec)
train_acc = accuracy_score(y_train, train_preds) * 100
val_acc = accuracy_score(y_val, val_preds) * 100
print(f'Training accuracy: {train_acc:.2f}%')
print(f'Validation accuracy: {val_acc:.2f}%')