nesv042 committed on
Commit
71e8994
1 Parent(s): 5fbcdbd

Fix jumanpp.apply_to_sentence returning an empty list for sentences longer than ~1700 characters

Browse files
tokenization_deberta_v2_jumanpp_fast.py CHANGED
@@ -62,4 +62,7 @@ class JumanppPreTokenizer:
62
 
63
  def jumanpp_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
64
  offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_sentence(str(normalized_string)).morphemes]
 
 
 
65
  return [normalized_string[offset[0]:offset[1]] for offset in offsets]
 
62
 
63
  def jumanpp_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
64
  offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_sentence(str(normalized_string)).morphemes]
65
+ if not offsets:
66
+ doc = rhoknp.Document.from_raw_text(str(normalized_string))
67
+ offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_document(doc).morphemes]
68
  return [normalized_string[offset[0]:offset[1]] for offset in offsets]