import re
import spacy
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import pickle

# Prefer GPU for spaCy if one is available (returns True when a GPU was activated)
gpu_available = spacy.prefer_gpu()
print("GPU is available:", gpu_available)

# Load spaCy's English model
nlp = spacy.load('en_core_web_sm')


def preprocess_text(text):
    # Remove patterns like "COUNTRY or STATE NAME (Reuters) -"
    text = re.sub(r'\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-', '', text)
    # Remove patterns like "Featured image via author name / image place"
    text = re.sub(r'Featured image via .+ / .+', '', text)

    # Process text with spaCy
    doc = nlp(text)

    # Improved lemmatization
    lemmatized_text = []
    for token in doc:
        # Preserve named entities in their original form
        if token.ent_type_:
            lemmatized_text.append(token.text)
        # Lemmatize other tokens, keeping only alphabetic non-stopword tokens
        elif token.is_alpha and not token.is_stop:
            lemmatized_text.append(token.lemma_.lower())

    return ' '.join(lemmatized_text)


def load_tokenizer(tokenizer_path):
    # Load a previously fitted Keras Tokenizer from a pickle file
    with open(tokenizer_path, 'rb') as handle:
        tokenizer = pickle.load(handle)
    return tokenizer


def prepare_data(texts, tokenizer, max_length=500):
    # Convert texts to integer sequences and pad/truncate them to a fixed length
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_length)
    return padded
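
# --- Hypothetical usage sketch (not part of the original script) ---
# Minimal example of chaining the helpers above. The tokenizer path
# 'tokenizer.pickle' and the sample sentence are placeholders for
# illustration only; adjust them to your own artifacts.
if __name__ == '__main__':
    sample = "WASHINGTON (Reuters) - Lawmakers met on Tuesday to discuss the budget."

    # Strip the Reuters byline and lemmatize the remaining text
    cleaned = preprocess_text(sample)
    print(cleaned)

    # Load a previously fitted tokenizer and build a padded input batch
    tokenizer = load_tokenizer('tokenizer.pickle')  # placeholder path
    batch = prepare_data([cleaned], tokenizer)
    print(batch.shape)  # (1, 500) with the default max_length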