"""Preprocessing utilities for an LSTM fake-news detector with GloVe embeddings."""
import pickle
import re

import numpy as np
import spacy
# Tokenizer is the class of the pickled object loaded in load_tokenizer().
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

# Load spaCy's English model (install with: python -m spacy download en_core_web_sm).
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Strip agency boilerplate, then lemmatize with spaCy, keeping named entities verbatim."""
    # Remove datelines like "WASHINGTON (Reuters) -" or a bare "(Reuters)".
    text = re.sub(
        r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))", "", text
    )
    # Remove credits like "Featured image via author name / image place."
    text = re.sub(r"Featured image via .+?\.($|\s)", "", text)
    # Process text with spaCy.
    doc = nlp(text)
    lemmatized_text = []
    for token in doc:
        if token.ent_type_:
            # Preserve named entities in their original form.
            lemmatized_text.append(token.text)
        elif token.is_alpha and not token.is_stop:
            # Lemmatize remaining tokens; drop stopwords, punctuation, and numbers.
            lemmatized_text.append(token.lemma_.lower())
    return " ".join(lemmatized_text)

def load_tokenizer(tokenizer_path):
    """Load a fitted Keras Tokenizer that was saved with pickle."""
    with open(tokenizer_path, "rb") as handle:
        tokenizer = pickle.load(handle)
    return tokenizer

def prepare_data(texts, tokenizer, max_length=500):
    """Convert texts to integer sequences and pad/truncate them to max_length."""
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_length)
    return padded

def load_glove_embeddings(glove_file, word_index, embedding_dim=100):
    """Build an embedding matrix for word_index from a pretrained GloVe text file."""
    # Parse the GloVe file: each line holds one word followed by its vector.
    embeddings_index = {}
    with open(glove_file, encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs
    # Row 0 is reserved for padding; words missing from GloVe stay all-zero.
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
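
# Minimal usage sketch of the full pipeline. "tokenizer.pickle" and
# "glove.6B.100d.txt" below are illustrative file names, not artifacts shipped
# with this module; point them at the tokenizer and GloVe vectors from your
# own training run.
if __name__ == "__main__":
    sample_texts = [
        "WASHINGTON (Reuters) - Lawmakers debated the new bill on Tuesday.",
    ]
    cleaned = [preprocess_text(t) for t in sample_texts]

    tokenizer = load_tokenizer("tokenizer.pickle")
    padded = prepare_data(cleaned, tokenizer, max_length=500)
    print("Padded input shape:", padded.shape)  # (len(sample_texts), 500)

    # GloVe-initialised weights for the Embedding layer of the LSTM model.
    embedding_matrix = load_glove_embeddings(
        "glove.6B.100d.txt", tokenizer.word_index, embedding_dim=100
    )
    print("Embedding matrix shape:", embedding_matrix.shape)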