import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
import random
# Start from a blank English pipeline.
nlp = spacy.blank('en')

# Reuse the NER component if the pipeline already has one; otherwise add it.
ner = nlp.get_pipe('ner') if 'ner' in nlp.pipe_names else nlp.add_pipe('ner')

# Entity types the recognizer is taught to predict.
LABELS = ['PRODUCT', 'ORG']
for entity_label in LABELS:
    ner.add_label(entity_label)

# Training examples, each a (text, annotations) pair where annotations is
# {'entities': [(start, end, label), ...]} and start/end are character offsets
# into the text.
TRAIN_DATA = [
    (
        'Apple releases new iPhone',
        {'entities': [(0, 5, 'ORG'), (19, 25, 'PRODUCT')]},
    ),
    (
        'Google launches Pixel',
        {'entities': [(0, 6, 'ORG'), (16, 21, 'PRODUCT')]},
    ),
    (
        'Microsoft updates Windows',
        {'entities': [(0, 9, 'ORG'), (18, 25, 'PRODUCT')]},
    ),
]
# Train only the NER component: every other pipe is disabled for the
# duration of the `with` block and restored afterwards.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
# select_pipes() is the spaCy v3 replacement for the deprecated
# disable_pipes(); an empty `disable` list is accepted.
with nlp.select_pipes(disable=other_pipes):
    # initialize() prepares the pipeline for training and returns the
    # optimizer that is passed to nlp.update() below.
    optimizer = nlp.initialize()
    # Set dropout to 0.3 to reduce overfitting
    dropout = 0.3
    # Lower learning rate. The spaCy v3 (thinc 8) optimizer names this
    # attribute `learn_rate`; the v2-era `alpha` attribute does not exist
    # on it, so assigning `optimizer.alpha` fails instead of taking effect.
    optimizer.learn_rate = 0.001
    for epoch in range(20):  # Reduced epochs from 50 to 20
        # Present the examples in a different order every epoch.
        random.shuffle(TRAIN_DATA)
        # Per-epoch loss accumulator, filled in by nlp.update().
        losses = {}
        batches = minibatch(TRAIN_DATA, size=compounding(2.0, 4.0, 1.5))
        for batch in batches:
            examples = [
                Example.from_dict(nlp.make_doc(text), annotations)
                for text, annotations in batch
            ]
            nlp.update(examples, drop=dropout, losses=losses, sgd=optimizer)
        # Early stopping simulation: stop once the epoch's NER loss is low
        # enough. A missing 'ner' entry defaults to infinity so that "no
        # loss recorded" is never mistaken for "converged".
        if losses.get('ner', float('inf')) < 0.01:
            break
# Held-out examples used only for scoring; same (text, annotations) format
# as the training data.
VALIDATION_DATA = [
    (
        'Apple unveils new MacBook',
        {'entities': [(0, 5, 'ORG'), (18, 25, 'PRODUCT')]},
    ),
    (
        'Google announces Android update',
        {'entities': [(0, 6, 'ORG'), (17, 24, 'PRODUCT')]},
    ),
]

# Counts are pooled over all validation texts before the metrics are
# computed.
correct = 0
total_pred = 0
total_true = 0
for text, annotations in VALIDATION_DATA:
    gold_spans = annotations['entities']
    predicted_spans = [
        (span.start_char, span.end_char, span.label_)
        for span in nlp(text).ents
    ]
    total_pred += len(predicted_spans)
    total_true += len(gold_spans)
    # A prediction is correct only on an exact (start, end, label) match.
    correct += sum(1 for span in predicted_spans if span in gold_spans)

# Each metric falls back to 0 when its denominator would be zero.
precision = 0
if total_pred > 0:
    precision = correct / total_pred

recall = 0
if total_true > 0:
    recall = correct / total_true

f1 = 0
if (precision + recall) > 0:
    f1 = 2 * precision * recall / (precision + recall)

print(f'Validation Precision: {precision:.2f}')
print(f'Validation Recall: {recall:.2f}')
print(f'Validation F1-score: {f1:.2f}')