import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Toy corpus: ten short sentences paired with alternating binary labels.
texts = [
    'the cat sat on the mat',
    'dogs are great pets',
    'the quick brown fox jumps',
    'lorem ipsum dolor sit amet',
    'machine learning is fun',
    'natural language processing',
    'deep learning models',
    'artificial intelligence',
    'data science and ai',
    'python programming language'
]
# Labels alternate 0/1, one per sentence.
labels = [0, 1] * 5
# Helper: emit one left-context string per word position in each text.
def create_context_windows(texts, window_size):
    """Build left-context strings for every word position in every text.

    For each word at position ``pos`` in a text, the emitted string is the
    space-joined run of up to ``window_size`` words immediately preceding it
    (so the first word of each text yields an empty string).  Output order
    follows the texts and, within a text, the word positions.
    """
    contexts = []
    for text in texts:
        tokens = text.split()
        contexts.extend(
            # Clip the window start at 0 so short prefixes are kept whole.
            ' '.join(tokens[max(0, pos - window_size):pos])
            for pos in range(len(tokens))
        )
    return contexts
# Use a left-context window of 10 words (raised from the earlier 5).
window_size = 10
context_texts = create_context_windows(texts, window_size)

# create_context_windows emits one row per word, so each sentence's label
# must be repeated once per word to stay aligned with context_texts.
expanded_labels = [
    label
    for label, text in zip(labels, texts)
    for _ in text.split()
]

# Bag-of-words features over the context strings; empty contexts (the first
# word of each sentence) simply become all-zero rows.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(context_texts)
y = np.array(expanded_labels)
# Reserve 20% of the per-word context rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a logistic-regression classifier on the sparse count features;
# 200 iterations gives the solver room to converge on this tiny dataset.
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Score both splits, reported as percentages.
train_acc = 100 * accuracy_score(y_train, model.predict(X_train))
val_acc = 100 * accuracy_score(y_val, model.predict(X_val))
print(f'Training accuracy: {train_acc:.2f}%')
print(f'Validation accuracy: {val_acc:.2f}%')