File size: 1,978 Bytes
6f9bfc0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import re
import spacy
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import pickle
import numpy as np


# Load spaCy's English model
# (small pipeline "en_core_web_sm"; must be installed beforehand via
# `python -m spacy download en_core_web_sm`, otherwise spacy.load raises OSError)
nlp = spacy.load("en_core_web_sm")


def preprocess_text(text):
    """Clean a news-article string and return a lemmatized version of it.

    First strips Reuters datelines/bylines and "Featured image via ..."
    credit sentences, then runs the text through spaCy, keeping named
    entities verbatim and lemmatizing the remaining alphabetic,
    non-stop-word tokens (lowercased).

    Args:
        text: raw article text.

    Returns:
        A single space-joined string of the kept tokens.
    """
    # Drop dateline prefixes like "WASHINGTON (Reuters) -" or a bare "(Reuters)"
    text = re.sub(
        r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))", "", text
    )

    # Drop trailing image-credit sentences
    text = re.sub(r"Featured image via .+?\.($|\s)", "", text)

    doc = nlp(text)

    # Entities keep their surface form; everything else is kept only if it is
    # alphabetic and not a stop word, in lemmatized lowercase form.
    pieces = [
        tok.text if tok.ent_type_ else tok.lemma_.lower()
        for tok in doc
        if tok.ent_type_ or (tok.is_alpha and not tok.is_stop)
    ]
    return " ".join(pieces)


def load_tokenizer(tokenizer_path):
    """Deserialize and return a pickled Keras tokenizer.

    Args:
        tokenizer_path: path to a file written with pickle.dump.

    Returns:
        The unpickled object (expected to be a keras Tokenizer).

    NOTE(security): pickle.load can execute arbitrary code — only load
    tokenizer files from trusted sources.
    """
    with open(tokenizer_path, "rb") as fh:
        return pickle.load(fh)


def prepare_data(texts, tokenizer, max_length=500):
    """Convert raw texts into padded integer sequences for the model.

    Args:
        texts: iterable of strings.
        tokenizer: a fitted Keras Tokenizer.
        max_length: every sequence is padded/truncated to this length.

    Returns:
        A 2-D array of shape (len(texts), max_length).
    """
    encoded = tokenizer.texts_to_sequences(texts)
    return pad_sequences(encoded, maxlen=max_length)


def load_glove_embeddings(glove_file, word_index, embedding_dim=100):
    """Build an embedding matrix for *word_index* from a GloVe text file.

    Args:
        glove_file: path to a GloVe file with one "word v1 v2 ... vD" per line.
        word_index: dict mapping word -> integer row index (1-based, as
            produced by Keras ``Tokenizer.word_index``).
        embedding_dim: dimensionality D of the vectors in the file.

    Returns:
        ``np.ndarray`` of shape ``(len(word_index) + 1, embedding_dim)``.
        Row 0 and rows for words absent from the GloVe file remain zero.
    """
    embeddings_index = {}
    with open(glove_file, encoding="utf8") as f:
        for line in f:
            # BUGFIX: a plain line.split() breaks on GloVe releases whose
            # tokens contain spaces (e.g. glove.840B.300d), raising
            # ValueError in np.asarray. rsplit with maxsplit=embedding_dim
            # keeps the (possibly multi-token) word intact.
            values = line.rstrip().rsplit(" ", embedding_dim)
            if len(values) != embedding_dim + 1:
                continue  # skip blank/malformed lines instead of crashing
            word = values[0]
            embeddings_index[word] = np.asarray(values[1:], dtype="float32")

    # Row 0 is reserved (Keras indices start at 1); OOV rows stay all-zero.
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector

    return embedding_matrix