import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
import random
# Build an empty English pipeline and make sure it carries an NER component.
nlp = spacy.blank('en')
ner = nlp.get_pipe('ner') if 'ner' in nlp.pipe_names else nlp.add_pipe('ner')

# Entity types the model will be trained to recognize.
labels = ['PERSON', 'ORG', 'GPE']
for entity_label in labels:
    ner.add_label(entity_label)
# Toy training set: (text, {"entities": [(start, end, label), ...]}) pairs.
# Offsets are character indices into the text; `end` is exclusive.
# BUG FIX: the "United Kingdom" span was (31, 44), which slices to
# "ted Kingdom." — a misaligned span spaCy cannot map onto tokens.
# Correct offsets are (28, 42).
TRAIN_DATA = [
    ("Apple is looking at buying U.K. startup for $1 billion",
     {'entities': [(0, 5, 'ORG'), (27, 31, 'GPE')]}),
    ("San Francisco considers banning sidewalk delivery robots",
     {'entities': [(0, 13, 'GPE')]}),
    ("London is a big city in the United Kingdom.",
     {'entities': [(0, 6, 'GPE'), (28, 42, 'GPE')]})
]
# Train only the NER component: disable every other pipe (a blank model has
# none, but this keeps the script safe if pipes are added) so their weights
# are left untouched during the update steps.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    # BUG FIX: in spaCy v3 the thinc optimizer exposes its learning rate as
    # `learn_rate`; the previous `optimizer.alpha = 0.001` silently set an
    # unused attribute, so the intended lower learning rate never applied.
    optimizer.learn_rate = 0.001
    for itn in range(30):
        random.shuffle(TRAIN_DATA)
        losses = {}
        # Batch size compounds from 4 up to 16 over the course of training.
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 16.0, 1.001))
        for batch in batches:
            examples = []
            for text, annotations in batch:
                # Example pairs a reference doc (gold entities) with a
                # prediction doc built from the raw text.
                doc = nlp.make_doc(text)
                examples.append(Example.from_dict(doc, annotations))
            nlp.update(
                examples,
                drop=0.3,  # dropout to reduce overfitting on the tiny dataset
                sgd=optimizer,
                losses=losses
            )
        if itn % 5 == 0:
            print(f"Iteration {itn}, Losses: {losses}")
# Held-out examples for a quick span-level evaluation after training.
# BUG FIX: both GPE spans were misaligned — (31, 39) sliced "in New Y"
# instead of "New York" (correct: 34-42) and (38, 45) sliced " Seattl"
# instead of "Seattle" (correct: 39-46). With the old offsets these
# entities could never be scored as correct.
VALIDATION_DATA = [
    ("Google is opening a new office in New York",
     {'entities': [(0, 6, 'ORG'), (34, 42, 'GPE')]}),
    ("Amazon plans to hire more employees in Seattle",
     {'entities': [(0, 6, 'ORG'), (39, 46, 'GPE')]})
]
# Micro-averaged, span-level evaluation: a predicted entity counts as
# correct only when its start, end, and label all match a gold annotation.
correct = 0
total_pred = 0
total_true = 0
for text, annotations in VALIDATION_DATA:
    doc = nlp(text)
    predicted = {(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents}
    gold = set(annotations.get('entities'))
    correct += len(predicted & gold)
    total_pred += len(predicted)
    total_true += len(gold)

# Guard against division by zero when there are no predictions / no gold spans.
precision = correct / total_pred if total_pred else 0
recall = correct / total_true if total_true else 0
f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
print(f"Validation Precision: {precision:.2f}")
print(f"Validation Recall: {recall:.2f}")
print(f"Validation F1 Score: {f1:.2f}")