import pandas as pd
import utils
from sklearn.feature_extraction.text import CountVectorizer

logs = utils.prepare_logging(__file__)

TEXT = "text"
TOKENIZED_TEXT = "tokenized_text"


class Tokenize:
    """Tokenizes the text column of a Hugging Face dataset using a CountVectorizer tokenizer."""

    def __init__(self, text_dset, feature=TEXT, tok_feature=TOKENIZED_TEXT,
                 lowercase=True):
        self.text_dset = text_dset
        self.feature = feature
        self.tok_feature = tok_feature
        self.lowercase = lowercase
        # Word-boundary token pattern; build_tokenizer() only splits text,
        # so lowercasing is handled explicitly in do_tokenization.
        self.cvec = CountVectorizer(token_pattern=r"(?u)\b\w+\b",
                                    lowercase=lowercase)
        self.tokenized_dset = self.do_tokenization()

    def do_tokenization(self):
        """
        Tokenizes a Hugging Face dataset in the self.feature field.
        :return: Hugging Face Dataset with tokenized text in self.tok_feature.
        """
        sent_tokenizer = self.cvec.build_tokenizer()

        def tokenize_batch(examples):
            # The tokenizer from build_tokenizer() does not lowercase,
            # so apply lowercasing here when requested.
            texts = examples[self.feature]
            if self.lowercase:
                texts = [text.lower() for text in texts]
            return {self.tok_feature: [tuple(sent_tokenizer(text)) for text in texts]}

        tokenized_dset = self.text_dset.map(
            tokenize_batch,
            batched=True
        )
        logs.info("Tokenized the dataset.")
        return tokenized_dset

    def get(self):
        return self.tokenized_dset

    def get_df(self):
        return pd.DataFrame(self.tokenized_dset)
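

# Usage sketch (illustrative only, not part of the module): assumes a Hugging Face
# dataset with a "text" column; the example texts below are hypothetical.
if __name__ == "__main__":
    from datasets import Dataset

    text_dset = Dataset.from_dict(
        {TEXT: ["The quick brown fox.", "Jumps over the lazy dog!"]}
    )
    tokenizer = Tokenize(text_dset)
    # Each row of the tokenized dataset now carries its word tokens
    # in the "tokenized_text" column.
    print(tokenizer.get_df()[[TEXT, TOKENIZED_TEXT]])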