Spaces:

NHLOCAL
/

is-this-bible

Running

App Files Files Community

is-this-bible / data_creation /creat_model_bible.py

NHLOCAL

first create

58d4ef5 8 months ago

raw

history blame

2.35 kB

	import pandas as pd
	import re
	import nltk
	from nltk.tokenize import word_tokenize
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.svm import SVC
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
	import joblib


	"""
	# Download the Hebrew stopwords (if not already downloaded)
	nltk.download('stopwords')

	# Function to remove punctuation and special characters from text
	def remove_punctuation(text):
	return re.sub(r'[^\w\s]', '', text)

	# Function to remove custom stop words from text
	def remove_custom_stopwords(text):
	hebrew_stopwords = {'אני', 'אתה', 'את', 'אנחנו', 'אתם', 'אתן', 'הם', 'הן'} # Add your custom Hebrew stopwords here
	return ' '.join(word for word in text.split() if word not in hebrew_stopwords)

	# Remove punctuation and custom stop words from the text data
	data['text'] = data['text'].apply(remove_punctuation)
	data['text'] = data['text'].apply(remove_custom_stopwords)
	"""

	# Load the dataset (assuming it is in UTF-8 encoding)
	data = pd.read_csv('bible_data.csv', encoding='utf-8')



	# Separate features (text) and labels (0 or 1)
	X = data['text']
	y = data['label']

	# Create a TF-IDF vectorizer with Hebrew tokenizer
	vectorizer = TfidfVectorizer(tokenizer=word_tokenize, lowercase=True)

	# Fit and transform the data with TF-IDF vectorizer
	X_tfidf = vectorizer.fit_transform(X)

	# Split data into training and test sets
	X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=47)

	# Create a Support Vector Machine (SVM) classifier
	classifier = SVC(kernel='linear', C=0.5, probability=True)

	# Train the SVM classifier on the training data
	classifier.fit(X_train, y_train)

	# Evaluate the model on the test data
	y_pred = classifier.predict(X_test)
	accuracy = accuracy_score(y_test, y_pred)
	precision = precision_score(y_test, y_pred)
	recall = recall_score(y_test, y_pred)
	f1 = f1_score(y_test, y_pred)

	print("Accuracy:", accuracy)
	print("Precision:", precision)
	print("Recall:", recall)
	print("F1 Score:", f1)

	# Save the trained model and vectorizer to files
	model_filename = "is_this_bible_model.pkl"
	vectorizer_filename = "is_this_bible_vectorizer.pkl"
	joblib.dump(classifier, model_filename)
	joblib.dump(vectorizer, vectorizer_filename)