NLP-Legal-Texts / util / tokenizer.py
from typing import List
import spacy
from util.process_data import Token, Sample, SampleList


class Tokenizer:

    def __init__(self, spacy_model: str):
        self.__spacy_model = spacy.load(spacy_model)

    def run(self, sample_list: SampleList):
        self.__tokenize(sample_list.samples, self.__spacy_model)

    def __tokenize(self, samples: List[Sample], spacy_model):
        # Replace non-breaking spaces, then tokenize all texts in a single
        # pipe() call so spaCy can batch the documents.
        doc_pipe = spacy_model.pipe([sample.text.replace('\xa0', ' ') for sample in samples])
        for sample, doc in zip(samples, doc_pipe):
            # Map each spaCy token to a Token carrying character offsets.
            sample.tokens = [Token(
                text=x.text,
                start=x.idx,
                end=x.idx + len(x.text)
            ) for x in doc]
            # Drop trailing whitespace-only tokens; the emptiness check
            # guards against an IndexError on all-whitespace samples.
            while sample.tokens and ('\n' in sample.tokens[-1].text or ' ' in sample.tokens[-1].text):
                sample.tokens = sample.tokens[:-1]
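

# Minimal usage sketch. Assumptions: Sample and SampleList (from
# util.process_data) accept these keyword arguments (e.g. as pydantic models
# or dataclasses), and "de_core_news_sm" stands in for whichever spaCy model
# the demonstrator actually loads.
if __name__ == "__main__":
    tokenizer = Tokenizer("de_core_news_sm")  # model name is an assumption
    sample_list = SampleList(samples=[Sample(text="§ 1 Dies ist ein Beispiel.")])
    tokenizer.run(sample_list)
    for token in sample_list.samples[0].tokens:
        print(token.text, token.start, token.end)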