"""Preprocessing utilities for an LSTM fake-news detector with GloVe embeddings."""
import pickle
import re

import numpy as np
import spacy
# Tokenizer is the class of the pickled object loaded in load_tokenizer().
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

# Load spaCy's English model (install with: python -m spacy download en_core_web_sm).
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Strip agency boilerplate, then lemmatize with spaCy, keeping named entities verbatim."""
    # Remove datelines like "WASHINGTON (Reuters) -" or a bare "(Reuters)".
    text = re.sub(
        r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))", "", text
    )
    # Remove credits like "Featured image via author name / image place."
    text = re.sub(r"Featured image via .+?\.($|\s)", "", text)
    # Process text with spaCy.
    doc = nlp(text)
    lemmatized_text = []
    for token in doc:
        if token.ent_type_:
            # Preserve named entities in their original form.
            lemmatized_text.append(token.text)
        elif token.is_alpha and not token.is_stop:
            # Lemmatize remaining tokens; drop stopwords, punctuation, and numbers.
            lemmatized_text.append(token.lemma_.lower())
    return " ".join(lemmatized_text)

def load_tokenizer(tokenizer_path):
    """Load a fitted Keras Tokenizer that was saved with pickle."""
    with open(tokenizer_path, "rb") as handle:
        tokenizer = pickle.load(handle)
    return tokenizer

def prepare_data(texts, tokenizer, max_length=500):
    """Convert texts to integer sequences and pad/truncate them to max_length."""
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_length)
    return padded

def load_glove_embeddings(glove_file, word_index, embedding_dim=100):
    """Build an embedding matrix for word_index from a pretrained GloVe text file."""
    # Parse the GloVe file: each line holds one word followed by its vector.
    embeddings_index = {}
    with open(glove_file, encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs
    # Row 0 is reserved for padding; words missing from GloVe stay all-zero.
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
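
# Minimal usage sketch of the full pipeline. "tokenizer.pickle" and
# "glove.6B.100d.txt" below are illustrative file names, not artifacts shipped
# with this module; point them at the tokenizer and GloVe vectors from your
# own training run.
if __name__ == "__main__":
    sample_texts = [
        "WASHINGTON (Reuters) - Lawmakers debated the new bill on Tuesday.",
    ]
    cleaned = [preprocess_text(t) for t in sample_texts]

    tokenizer = load_tokenizer("tokenizer.pickle")
    padded = prepare_data(cleaned, tokenizer, max_length=500)
    print("Padded input shape:", padded.shape)  # (len(sample_texts), 500)

    # GloVe-initialised weights for the Embedding layer of the LSTM model.
    embedding_matrix = load_glove_embeddings(
        "glove.6B.100d.txt", tokenizer.word_index, embedding_dim=100
    )
    print("Embedding matrix shape:", embedding_matrix.shape)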