import pickle
import re

import spacy
from keras.preprocessing.text import Tokenizer  # class of the object stored in the pickled tokenizer file
from keras.preprocessing.sequence import pad_sequences


# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")


def preprocess_text(text):
    """Strip news-agency boilerplate from text, then return a lemmatized version."""
    # Remove patterns like "COUNTRY or STATE NAME (Reuters) -" or just "(Reuters)"
    text = re.sub(
        r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))", "", text
    )
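    # e.g. "WASHINGTON (Reuters) - Officials said ..." -> " Officials said ..."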

    # Remove patterns like "Featured image via author name / image place"
    text = re.sub(r"Featured image via .+?\.($|\s)", "", text)

    # Process text with spaCy
    doc = nlp(text)

    lemmatized_text = []
    for token in doc:
        # Preserve named entities in their original form
        if token.ent_type_:
            lemmatized_text.append(token.text)
        # Lemmatize the remaining tokens, dropping stop words and non-alphabetic tokens
        elif token.is_alpha and not token.is_stop:
            lemmatized_text.append(token.lemma_.lower())

    return " ".join(lemmatized_text)


def load_tokenizer(tokenizer_path):
    """Load a fitted Keras Tokenizer that was serialized with pickle."""
    with open(tokenizer_path, "rb") as handle:
        tokenizer = pickle.load(handle)
    return tokenizer


def prepare_data(texts, tokenizer, max_length=500):
    """Convert texts to integer sequences and pad them to a fixed length."""
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_length)
    return padded
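

# Usage sketch: a minimal example of chaining the helpers above. The tokenizer
# path and the sample text are assumptions for illustration; point the path at
# the pickle file that was saved when the tokenizer was fitted.
if __name__ == "__main__":
    sample = (
        "WASHINGTON (Reuters) - Lawmakers met on Tuesday to debate the bill. "
        "Featured image via Jane Doe / Flickr."
    )
    cleaned = preprocess_text(sample)
    print(cleaned)

    tokenizer = load_tokenizer("tokenizer.pickle")  # hypothetical path
    batch = prepare_data([cleaned], tokenizer, max_length=500)
    print(batch.shape)  # -> (1, 500)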