import traceback
import logging
from typing import List, Any, Optional, Tuple

from obsei.payload import TextPayload
from obsei.preprocessor.base_preprocessor import (
    BaseTextPreprocessor,
    BaseTextProcessorConfig,
)
from obsei.preprocessor.text_cleaning_function import (
    TextCleaningFunction,
    ToLowerCase,
    RemoveWhiteSpaceAndEmptyToken,
    RemovePunctuation,
    RemoveSpecialChars,
    DecodeUnicode,
    RemoveDateTime,
    ReplaceDomainKeywords,
    TokenStemming,
    RemoveStopWords,
)
from obsei.preprocessor.text_tokenizer import BaseTextTokenizer, NLTKTextTokenizer

cleaner_logger: logging.Logger = logging.getLogger(__name__)


class TextCleanerConfig(BaseTextProcessorConfig):
    """Configuration for `TextCleaner`.

    When `cleaning_functions` is not supplied, a default pipeline of cleaning
    steps is built in `__init__` from the other fields (stop-word language,
    custom stop words and domain keyword replacements).
    """

    cleaning_functions: Optional[List[TextCleaningFunction]] = None
    stop_words_language: Optional[str] = "english"
    stop_words: Optional[List[str]] = None
    domain_keywords: Optional[Tuple[str, str]] = None
    disable_tokenization: bool = False

    def __init__(self, **data: Any):
        super().__init__(**data)

        # Default cleaning pipeline, applied in order to the token list
        if not self.cleaning_functions:
            self.cleaning_functions = [
                ToLowerCase(),
                RemoveWhiteSpaceAndEmptyToken(),
                RemovePunctuation(),
                RemoveSpecialChars(),
                DecodeUnicode(),
                RemoveDateTime(),
                ReplaceDomainKeywords(domain_keywords=self.domain_keywords),
                TokenStemming(),
                RemoveStopWords(
                    language=self.stop_words_language, stop_words=self.stop_words
                ),
                RemoveWhiteSpaceAndEmptyToken(),
            ]


class TextCleaner(BaseTextPreprocessor):
    """Preprocessor that tokenizes each payload's text and applies the
    configured cleaning functions to the resulting tokens."""

    text_tokenizer: Optional[BaseTextTokenizer] = None

    def __init__(self, **data: Any):
        super().__init__(**data)
        # Fall back to the NLTK-based tokenizer when none is supplied
        self.text_tokenizer = self.text_tokenizer or NLTKTextTokenizer()

    def preprocess_input(  # type: ignore[override]
        self,
        input_list: List[TextPayload],
        config: TextCleanerConfig,
        **kwargs: Any,
    ) -> List[TextPayload]:
        """Clean `processed_text` of each payload in place and return the list."""
        if config.cleaning_functions is None:
            return input_list

        for input_data in input_list:
            # Treat the whole text as a single token when tokenization is
            # disabled or no tokenizer is available; otherwise tokenize first.
            if self.text_tokenizer is None or config.disable_tokenization:
                tokens = [input_data.processed_text]
            else:
                tokens = self.text_tokenizer.tokenize_text(
                    input_data.processed_text
                )

            # Apply each cleaning function in order; a failing function is
            # logged and skipped so the remaining steps still run.
            for cleaning_function in config.cleaning_functions:
                try:
                    tokens = cleaning_function.execute(tokens)
                except Exception as ex:
                    cleaner_logger.warning(
                        f"Cleaning function {type(cleaning_function).__name__} "
                        f"raised an exception: {ex}"
                    )
                    traceback.print_exc()

            input_data.processed_text = " ".join(tokens)

        return input_list
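

# Illustrative usage sketch, not part of the library API: it assumes the NLTK
# "punkt" and "stopwords" data are available locally and that TextPayload
# accepts `processed_text` as used above.
if __name__ == "__main__":
    cleaner = TextCleaner()
    cleaner_config = TextCleanerConfig(stop_words_language="english")

    sample = [
        TextPayload(processed_text="  Hello WORLD!! This is   a sample, noisy TEXT. ")
    ]
    cleaned = cleaner.preprocess_input(input_list=sample, config=cleaner_config)

    # Tokens are lower-cased, stripped of punctuation and stop words,
    # stemmed, and re-joined with single spaces.
    print(cleaned[0].processed_text)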