# This example trains a simple named-entity recognition (NER) model with a conditional random field (CRF) on a tiny dataset, then predicts entity labels for a new sentence.
from sklearn_crfsuite import CRF
# Toy training corpus: a list of sentences, each a list of (word, label)
# pairs. Labels follow the B-/I-/O pattern seen below.
train_sents = [
    [
        ('John', 'B-PER'),
        ('lives', 'O'),
        ('in', 'O'),
        ('New', 'B-LOC'),
        ('York', 'I-LOC'),
    ],
]
# Feature extractor for each word
def word2features(sent, i):
    """Build the feature dictionary for the token at position ``i``.

    Only the token's own surface form is inspected; neighbouring tokens
    and the label stored next to the word are not used.

    Args:
        sent: Sequence of items whose first element is the word string
            (``(word, label)`` pairs in this file).
        i: Index of the token within ``sent``.

    Returns:
        A dict holding the lower-cased word plus three boolean shape
        flags (all upper-case, title-case, all digits).
    """
    token = sent[i][0]
    lowered = token.lower()
    all_caps = token.isupper()
    title_case = token.istitle()
    numeric = token.isdigit()
    return {
        'word.lower()': lowered,
        'word.isupper()': all_caps,
        'word.istitle()': title_case,
        'word.isdigit()': numeric,
    }
def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order.

    Every position is delegated to ``word2features``.
    """
    features = []
    for position in range(len(sent)):
        features.append(word2features(sent, position))
    return features
def sent2labels(sent):
    """Return the label of every ``(token, label)`` pair in ``sent``, in order."""
    tags = []
    for _word, tag in sent:
        tags.append(tag)
    return tags
# Convert each training sentence into parallel feature and label sequences.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))
# Train CRF model ('lbfgs' algorithm, at most 100 iterations)
crf = CRF(max_iterations=100, algorithm='lbfgs')
crf.fit(X_train, y_train)
# Test sentence. The 'O' labels are placeholders that give the tokens the same
# (word, label) shape as the training data; feature extraction reads only the
# word.
test_sent = [('Mary', 'O'), ('moved', 'O'), ('to', 'O'), ('Los', 'O'), ('Angeles', 'O')]
# Reuse the helper applied to the training sentences so that training and
# test data go through a single feature-extraction path.
X_test = sent2features(test_sent)
# Predict entity labels: the single sentence is wrapped in a list for
# predict(), and the first (only) result is taken back out.
predicted = crf.predict([X_test])[0]
# Print results, one "word: label" line per token
for (word, _), label in zip(test_sent, predicted):
    print(f"{word}: {label}")