Spaces:
Runtime error
Runtime error
File size: 825 Bytes
a50f42c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
from typing import List
import spacy
from util.process_data import Token, Sample, SampleList
class Tokenizer:
    """Tokenize samples with a spaCy pipeline and store the tokens on them."""

    def __init__(self, spacy_model: str):
        """Load the spaCy pipeline used for tokenization.

        Args:
            spacy_model: Identifier passed unchanged to ``spacy.load``.
        """
        self.__spacy_model = spacy.load(spacy_model)

    def run(self, sample_list: SampleList):
        """Tokenize every sample in ``sample_list.samples`` in place.

        Args:
            sample_list: Container whose ``samples`` each expose a ``text``
                attribute; each sample receives a new ``tokens`` list.
        """
        self.__tokenize(sample_list.samples, self.__spacy_model)

    def __tokenize(self, samples: List[Sample], spacy_model):
        """Assign ``sample.tokens`` for each sample from the spaCy tokenization.

        Each produced ``Token`` records the token text and its ``start``/``end``
        character offsets. Trailing tokens containing a newline or a space are
        dropped; a sample with no remaining tokens ends up with an empty list.

        Args:
            samples: Samples to tokenize; they are mutated in place.
            spacy_model: Loaded spaCy pipeline providing ``pipe``.
        """
        # Non-breaking spaces are swapped for plain spaces before tokenizing.
        # The replacement is one character for one character, so the offsets
        # computed below still index into the original ``sample.text``.
        texts = [sample.text.replace('\xa0', ' ') for sample in samples]

        for sample, doc in zip(samples, spacy_model.pipe(texts)):
            tokens = [
                Token(
                    text=tok.text,
                    start=tok.idx,
                    end=tok.idx + len(tok.text),
                )
                for tok in doc
            ]
            # Strip trailing whitespace/newline tokens. The emptiness check
            # prevents an IndexError when the text is empty or consists only
            # of whitespace tokens.
            while tokens and ('\n' in tokens[-1].text or ' ' in tokens[-1].text):
                tokens.pop()
            sample.tokens = tokens
|