import re
import pickle

import spacy
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")


def preprocess_text(text):
    # Remove patterns like "COUNTRY or STATE NAME (Reuters) -" or just "(Reuters)"
    text = re.sub(
        r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))", "", text
    )
    # Remove patterns like "Featured image via author name / image place"
    text = re.sub(r"Featured image via .+?\.($|\s)", "", text)

    # Process text with spaCy
    doc = nlp(text)
    lemmatized_text = []
    for token in doc:
        # Preserve named entities in their original form
        if token.ent_type_:
            lemmatized_text.append(token.text)
        # Lemmatize other tokens and exclude stop words / non-alpha tokens
        elif token.is_alpha and not token.is_stop:
            lemmatized_text.append(token.lemma_.lower())
    return " ".join(lemmatized_text)


def load_tokenizer(tokenizer_path):
    with open(tokenizer_path, "rb") as handle:
        tokenizer = pickle.load(handle)
    return tokenizer


def prepare_data(texts, tokenizer, max_length=500):
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_length)
    return padded
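

# Example usage (a minimal sketch): the sample text and tokenizer filename below
# are illustrative placeholders, not part of the original pipeline; swap in your
# own article text and the pickle produced when the Tokenizer was fitted.
if __name__ == "__main__":
    sample = (
        "WASHINGTON (Reuters) - Lawmakers debated the new budget proposal on Tuesday. "
        "Featured image via Example Photographer / Example Agency."
    )

    # Strip the Reuters byline and image credit, then lemmatize with spaCy
    cleaned = preprocess_text(sample)
    print(cleaned)

    # Assumes a Tokenizer previously fitted on the training corpus and pickled
    # to "tokenizer.pickle" (hypothetical path).
    tokenizer = load_tokenizer("tokenizer.pickle")
    padded = prepare_data([cleaned], tokenizer)
    print(padded.shape)  # (1, 500) with the default max_length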