import logging
import traceback
from typing import Any, List, Optional, Tuple

from obsei.payload import TextPayload
from obsei.preprocessor.base_preprocessor import (
    BaseTextPreprocessor,
    BaseTextProcessorConfig,
)
from obsei.preprocessor.text_cleaning_function import (
    DecodeUnicode,
    RemoveDateTime,
    RemovePunctuation,
    RemoveSpecialChars,
    RemoveStopWords,
    RemoveWhiteSpaceAndEmptyToken,
    ReplaceDomainKeywords,
    TextCleaningFunction,
    ToLowerCase,
    TokenStemming,
)
from obsei.preprocessor.text_tokenizer import BaseTextTokenizer, NLTKTextTokenizer

cleaner_logger: logging.Logger = logging.getLogger(__name__)


class TextCleanerConfig(BaseTextProcessorConfig):
    cleaning_functions: Optional[List[TextCleaningFunction]] = None
    stop_words_language: Optional[str] = "english"
    stop_words: Optional[List[str]] = None
    domain_keywords: Optional[Tuple[str, str]] = None
    disable_tokenization: bool = False

    def __init__(self, **data: Any):
        super().__init__(**data)

        # Fall back to the default cleaning pipeline when none is supplied
        if not self.cleaning_functions:
            self.cleaning_functions = [
                ToLowerCase(),
                RemoveWhiteSpaceAndEmptyToken(),
                RemovePunctuation(),
                RemoveSpecialChars(),
                DecodeUnicode(),
                RemoveDateTime(),
                ReplaceDomainKeywords(domain_keywords=self.domain_keywords),
                TokenStemming(),
                RemoveStopWords(
                    language=self.stop_words_language, stop_words=self.stop_words
                ),
                RemoveWhiteSpaceAndEmptyToken(),
            ]


class TextCleaner(BaseTextPreprocessor):
    text_tokenizer: Optional[BaseTextTokenizer] = None

    def __init__(self, **data: Any):
        super().__init__(**data)
        self.text_tokenizer = self.text_tokenizer or NLTKTextTokenizer()

    def preprocess_input(  # type: ignore[override]
        self,
        input_list: List[TextPayload],
        config: TextCleanerConfig,
        **kwargs: Any,
    ) -> List[TextPayload]:
        if config.cleaning_functions is None:
            return input_list

        for input_data in input_list:
            # Tokenize the text unless tokenization is disabled or no tokenizer
            # is configured; in that case treat the whole text as a single token
            if self.text_tokenizer is None or config.disable_tokenization:
                tokens = [input_data.processed_text]
            else:
                tokens = self.text_tokenizer.tokenize_text(
                    input_data.processed_text
                )

            # Apply each cleaning function in order; log and continue on failure
            for cleaning_function in config.cleaning_functions:
                try:
                    tokens = cleaning_function.execute(tokens)
                except Exception as ex:
                    cleaner_logger.warning(f"Received exception: {ex}")
                    traceback.print_exc()

            input_data.processed_text = " ".join(tokens)

        return input_list
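

# Minimal usage sketch (illustrative, not part of the original module). It
# assumes obsei is installed and the NLTK tokenizer data (e.g. "punkt") is
# available, and constructs TextPayload with its processed_text field as above.
if __name__ == "__main__":
    sample_payloads = [
        TextPayload(processed_text="OBSEI is an    AWESOME low-code tool!!!"),
    ]

    cleaner = TextCleaner()
    cleaner_config = TextCleanerConfig(stop_words_language="english")

    for cleaned in cleaner.preprocess_input(
        input_list=sample_payloads, config=cleaner_config
    ):
        print(cleaned.processed_text)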