is-this-bible / data_creation /creat_model_bible.py
NHLOCAL's picture
first create
58d4ef5
raw
history blame
2.35 kB
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib
"""
# Download the Hebrew stopwords (if not already downloaded)
nltk.download('stopwords')
# Function to remove punctuation and special characters from text
def remove_punctuation(text):
return re.sub(r'[^\w\s]', '', text)
# Function to remove custom stop words from text
def remove_custom_stopwords(text):
hebrew_stopwords = {'אני', 'אתה', 'את', 'אנחנו', 'אתם', 'אתן', 'הם', 'הן'} # Add your custom Hebrew stopwords here
return ' '.join(word for word in text.split() if word not in hebrew_stopwords)
# Remove punctuation and custom stop words from the text data
data['text'] = data['text'].apply(remove_punctuation)
data['text'] = data['text'].apply(remove_custom_stopwords)
"""
# Train, evaluate and persist the SVM text classifier.
#
# Pipeline: load the labelled CSV -> split into train/test -> fit TF-IDF on the
# training text only -> train a linear SVM -> report metrics on the held-out
# test set -> dump the classifier and the vectorizer with joblib.

# Load the dataset (read as UTF-8); the 'text' and 'label' columns are used below.
data = pd.read_csv('bible_data.csv', encoding='utf-8')

# Separate features (text) and labels (0 or 1)
X = data['text']
y = data['label']

# Split the RAW text before fitting anything. Fitting the vectorizer on the
# full dataset would let test-set vocabulary and IDF statistics leak into the
# features, which makes the reported metrics optimistic.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=47
)

# TF-IDF vectorizer using NLTK's word_tokenize as the tokenizer.
# token_pattern=None states explicitly that the default regex is unused when a
# custom tokenizer is supplied (scikit-learn otherwise emits a warning).
# NOTE: word_tokenize needs the NLTK tokenizer data to be installed locally.
vectorizer = TfidfVectorizer(
    tokenizer=word_tokenize,
    token_pattern=None,
    lowercase=True,
)

# Learn vocabulary/IDF from the training text only, then apply the same
# fitted transformation to the test text.
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Create a Support Vector Machine (SVM) classifier; probability=True enables
# predict_proba on the saved model.
classifier = SVC(kernel='linear', C=0.5, probability=True)

# Train the SVM classifier on the training data
classifier.fit(X_train, y_train)

# Evaluate the model on the held-out test data
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Save the trained model and the fitted vectorizer; both files are needed at
# inference time because new text must go through the same vectorizer.
model_filename = "is_this_bible_model.pkl"
vectorizer_filename = "is_this_bible_vectorizer.pkl"
joblib.dump(classifier, model_filename)
joblib.dump(vectorizer, vectorizer_filename)