import re
import spacy
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import pickle
import numpy as np

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")


def preprocess_text(text):
    # Remove patterns like "COUNTRY or STATE NAME (Reuters) -" or just "(Reuters)"
    text = re.sub(
        r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))", "", text
    )
    # Remove patterns like "Featured image via author name / image place"
    text = re.sub(r"Featured image via .+?\.($|\s)", "", text)

    # Process text with spaCy
    doc = nlp(text)
    lemmatized_text = []
    for token in doc:
        # Preserve named entities in their original form
        if token.ent_type_:
            lemmatized_text.append(token.text)
        # Lemmatize other tokens and exclude non-alpha tokens if necessary
        elif token.is_alpha and not token.is_stop:
            lemmatized_text.append(token.lemma_.lower())
    return " ".join(lemmatized_text)


def load_tokenizer(tokenizer_path):
    with open(tokenizer_path, "rb") as handle:
        tokenizer = pickle.load(handle)
    return tokenizer


def prepare_data(texts, tokenizer, max_length=500):
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_length)
    return padded


def load_glove_embeddings(glove_file, word_index, embedding_dim=100):
    embeddings_index = {}
    with open(glove_file, encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs

    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
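

# --- Usage sketch (not part of the original pipeline) ---
# A minimal example of how these helpers could be chained together for
# inference-time preparation. The paths "tokenizer.pickle" and
# "glove.6B.100d.txt" are assumptions for illustration; substitute the
# artifacts actually produced during training.
if __name__ == "__main__":
    sample_texts = [
        "WASHINGTON (Reuters) - Lawmakers debated the new budget proposal on Tuesday."
    ]

    # Clean and lemmatize the raw text, keeping named entities intact
    cleaned = [preprocess_text(t) for t in sample_texts]

    # Load the fitted tokenizer and convert text to padded integer sequences
    tokenizer = load_tokenizer("tokenizer.pickle")  # hypothetical path
    padded = prepare_data(cleaned, tokenizer, max_length=500)

    # Build an embedding matrix aligned with the tokenizer's vocabulary
    embedding_matrix = load_glove_embeddings(
        "glove.6B.100d.txt", tokenizer.word_index, embedding_dim=100  # hypothetical path
    )

    print("Padded input shape:", padded.shape)
    print("Embedding matrix shape:", embedding_matrix.shape)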