import re | |
import spacy | |
from keras.preprocessing.text import Tokenizer | |
from keras_preprocessing.sequence import pad_sequences | |
import pickle | |
# Load spaCy's small English pipeline once at import time; reused by
# preprocess_text below for tokenization, lemmatization and NER.
nlp = spacy.load("en_core_web_sm")
# Compiled once at import time so each preprocess_text call pays only a
# cheap attribute lookup instead of going through re's module-level cache.
# Matches a dateline like "WASHINGTON (Reuters) -" (one or more all-caps
# words) or a bare "(Reuters)".
_DATELINE_RE = re.compile(
    r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))"
)
# Matches trailing credits like "Featured image via author / place." up to
# the first period followed by end-of-string or whitespace.
_FEATURED_IMAGE_RE = re.compile(r"Featured image via .+?\.($|\s)")


def preprocess_text(text):
    """Clean a news article and return a lemmatized token string.

    Strips Reuters datelines and "Featured image via ..." credits, then
    runs the module-level spaCy pipeline (``nlp``) over the result.
    Tokens that belong to a named entity are kept verbatim; every other
    alphabetic, non-stopword token is replaced by its lowercased lemma.

    Args:
        text: Raw article text (str).

    Returns:
        A single space-joined string of the kept tokens; empty string if
        nothing survives the filtering.
    """
    text = _DATELINE_RE.sub("", text)
    text = _FEATURED_IMAGE_RE.sub("", text)

    doc = nlp(text)
    kept = []
    for token in doc:
        if token.ent_type_:
            # Preserve named entities in their original surface form so
            # e.g. proper nouns are not lowercased or lemmatized away.
            kept.append(token.text)
        elif token.is_alpha and not token.is_stop:
            kept.append(token.lemma_.lower())
    return " ".join(kept)
def load_tokenizer(tokenizer_path):
    """Restore a fitted Keras Tokenizer from a pickle file.

    Args:
        tokenizer_path: Path to the pickled tokenizer.

    Returns:
        The unpickled object (expected to be a Keras Tokenizer).

    NOTE(review): ``pickle.load`` executes arbitrary code during
    deserialization — only load tokenizer files from trusted sources.
    """
    with open(tokenizer_path, "rb") as handle:
        return pickle.load(handle)
def prepare_data(texts, tokenizer, max_length=500):
    """Turn raw texts into fixed-length integer sequences for the model.

    Args:
        texts: Iterable of raw text strings.
        tokenizer: Fitted Keras Tokenizer used to map words to indices.
        max_length: Target sequence length; shorter sequences are padded
            and longer ones truncated by ``pad_sequences``.

    Returns:
        The padded 2-D array produced by ``pad_sequences``.
    """
    encoded = tokenizer.texts_to_sequences(texts)
    return pad_sequences(encoded, maxlen=max_length)