# fake-news-detector-LSTM / preprocessing.py
import re
import spacy
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import pickle

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Strip wire-service boilerplate, then lemmatize the text with spaCy."""
    # Remove dateline patterns like "COUNTRY or STATE NAME (Reuters) -" or just "(Reuters)"
    text = re.sub(
        r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))", "", text
    )
    # Remove credit lines like "Featured image via author name / image place."
    text = re.sub(r"Featured image via .+?\.($|\s)", "", text)
    # Process text with spaCy
    doc = nlp(text)
    lemmatized_text = []
    for token in doc:
        # Preserve named entities in their original form
        if token.ent_type_:
            lemmatized_text.append(token.text)
        # Lemmatize the remaining tokens, dropping stop words and
        # non-alphabetic tokens
        elif token.is_alpha and not token.is_stop:
            lemmatized_text.append(token.lemma_.lower())
    return " ".join(lemmatized_text)

def load_tokenizer(tokenizer_path):
    """Load a fitted Keras Tokenizer from a pickle file."""
    with open(tokenizer_path, "rb") as handle:
        tokenizer = pickle.load(handle)
    return tokenizer
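
# Note: pickle re-imports the Tokenizer class while loading, so the Keras
# packages used when the tokenizer was saved must be installed at load time
# (the otherwise-unused Tokenizer import above makes that dependency explicit).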

def prepare_data(texts, tokenizer, max_length=500):
    """Convert texts to integer sequences and pad/truncate them to max_length."""
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_length)
    return padded
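
# Minimal usage sketch, assuming a hypothetical "tokenizer.pickle" artifact and
# that the default max_length of 500 matches what the model was trained with:
if __name__ == "__main__":
    tokenizer = load_tokenizer("tokenizer.pickle")  # hypothetical path
    sample = "WASHINGTON (Reuters) - Lawmakers are debating the new bill."
    batch = prepare_data([preprocess_text(sample)], tokenizer)
    print(batch.shape)  # (1, 500): one padded sequence of token ids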