Introduction
A Support Vector Machine (SVM) helps us sort text into groups by finding the boundary (a line, or more generally a hyperplane) that separates the categories with the widest possible margin.
Jump into concepts and practice - no test required
# Minimal text-classification pipeline: TF-IDF features fed to a linear SVM.
# NOTE: train_texts, train_labels and test_texts are not defined in this
# snippet; they must be provided by the surrounding code.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

# Convert text to numbers
# The vectorizer is fitted on the training texts only and then reused,
# unchanged, to transform the test texts.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# Create SVM model
model = SVC(kernel='linear')

# Train model
model.fit(X_train, train_labels)

# Predict new data
predictions = model.predict(X_test)
model = SVC(kernel='linear')
vectorizer = TfidfVectorizer(stop_words='english')
predictions = model.predict(X_test)
# Complete example: sentiment classification of short movie reviews using
# TF-IDF features and a linear-kernel SVM.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Sample text data
train_texts = [
    'I love this movie',
    'This film was terrible',
    'Amazing story and great acting',
    'Worst movie ever',
    'I enjoyed the film a lot',
]
train_labels = [1, 0, 1, 0, 1]  # 1=positive, 0=negative

test_texts = [
    'I hate this movie',
    'What a fantastic film',
]


def main():
    """Train a linear SVM on the sample texts and print test results.

    Vectorizes ``train_texts`` and ``test_texts`` with TF-IDF, fits an
    ``SVC(kernel='linear')`` on the training data, then prints the
    predicted labels for the test texts and their accuracy against a
    hard-coded list of assumed true labels.
    """
    # Convert text to numbers
    # The vectorizer is fitted on the training texts only; the test texts
    # are transformed with that same fitted vectorizer.
    vectorizer = TfidfVectorizer(stop_words='english')
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)

    # Create and train SVM model
    model = SVC(kernel='linear')
    model.fit(X_train, train_labels)

    # Predict test data
    predictions = model.predict(X_test)

    # Show predictions
    print('Predictions:', predictions.tolist())

    # For demonstration, assume true labels for test
    true_labels = [0, 1]
    accuracy = accuracy_score(true_labels, predictions)
    print(f'Accuracy: {accuracy:.2f}')


if __name__ == '__main__':
    main()
CountVectorizer() or TfidfVectorizer() to transform text into numbers -> Option A
print(predicted_labels)?
# Example: train a LinearSVC on four short texts and predict labels for two
# new texts.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

texts = ["I love cats", "Dogs are great", "Cats are cute", "I hate dogs"]
labels = [1, 0, 1, 0]

# Turn the training texts into a TF-IDF feature matrix.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

model = LinearSVC()
model.fit(X, labels)

# New texts must go through the same fitted vectorizer (transform, not
# fit_transform) before being passed to the model.
new_texts = ["I love dogs", "Cats are great"]
X_new = vectorizer.transform(new_texts)
predicted_labels = model.predict(X_new)
ValueError: could not convert string to float. What is the most likely cause?