|
import re |
|
import spacy |
|
from keras.preprocessing.text import Tokenizer |
|
from keras_preprocessing.sequence import pad_sequences |
|
import pickle |
|
import numpy as np |
|
|
|
|
|
|
|
nlp = spacy.load("en_core_web_sm") |
|
|
|
|
|
def preprocess_text(text):
    """Strip news-wire boilerplate from *text* and lemmatize it with spaCy.

    Cleaning steps:
      1. Remove Reuters datelines ("WASHINGTON (Reuters) -") and bare
         "(Reuters)" markers.
      2. Remove "Featured image via ..." photo credits.
      3. Tokenize with the module-level spaCy pipeline; keep named-entity
         tokens verbatim, and keep other alphabetic non-stopword tokens as
         their lowercased lemmas. Everything else is dropped.

    Returns the kept tokens joined by single spaces.
    """
    text = re.sub(
        r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))", "", text
    )
    text = re.sub(r"Featured image via .+?\.($|\s)", "", text)

    doc = nlp(text)

    # Entity tokens are preserved as-is so names like "Trump" or "NATO"
    # survive; all other kept tokens are normalized to lowercase lemmas.
    kept = [
        tok.text if tok.ent_type_ else tok.lemma_.lower()
        for tok in doc
        if tok.ent_type_ or (tok.is_alpha and not tok.is_stop)
    ]
    return " ".join(kept)
|
|
|
|
|
def load_tokenizer(tokenizer_path):
    """Deserialize a previously saved (pickled) Keras tokenizer from disk.

    NOTE(review): pickle.load executes arbitrary code during
    deserialization — only load tokenizer files from trusted sources.
    """
    with open(tokenizer_path, "rb") as fh:
        return pickle.load(fh)
|
|
|
|
|
def prepare_data(texts, tokenizer, max_length=500):
    """Turn raw texts into fixed-length integer sequences for the model.

    Args:
        texts: iterable of strings to encode.
        tokenizer: a fitted Keras ``Tokenizer``.
        max_length: target sequence length; shorter sequences are padded
            and longer ones truncated by ``pad_sequences`` defaults.

    Returns:
        A 2-D array of shape ``(len(texts), max_length)``.
    """
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_length)
|
|
|
|
|
def load_glove_embeddings(glove_file, word_index, embedding_dim=100):
    """Build an embedding matrix aligned with a tokenizer's word index.

    Args:
        glove_file: path to a GloVe text file (``token v1 v2 ... vD`` per line).
        word_index: dict mapping word -> integer index (1-based, as produced
            by a Keras ``Tokenizer``).
        embedding_dim: dimensionality D of the vectors in *glove_file*.

    Returns:
        A ``(len(word_index) + 1, embedding_dim)`` float array whose row ``i``
        is the GloVe vector for the word with index ``i``; rows for words not
        found in the GloVe file (and row 0) stay all-zero.
    """
    embeddings_index = {}
    with open(glove_file, encoding="utf8") as f:
        for line in f:
            values = line.rstrip().split(" ")
            # Some GloVe releases (e.g. glove.840B.300d) contain tokens with
            # embedded spaces, so ``values[0]`` / ``values[1:]`` is wrong for
            # those lines. Take the LAST embedding_dim fields as the vector
            # and everything before them as the token.
            if len(values) <= embedding_dim:
                continue  # malformed or truncated line; skip rather than crash
            word = " ".join(values[:-embedding_dim])
            try:
                coefs = np.asarray(values[-embedding_dim:], dtype="float32")
            except ValueError:
                continue  # non-numeric tail; skip the line
            embeddings_index[word] = coefs

    # Row 0 is reserved (Keras word indices start at 1); unseen words keep
    # their zero rows.
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix
|
|