nobu-g tealgreen0503 committed on
Commit
06e4a39
·
1 Parent(s): 89991d1

Fix handling of long text. (#3)

Browse files

- Fix exception handling in JumanppTokenizer and JumanppPreTokenizer (514804743e0d9464e02cb292293a7e98990293b8)


Co-authored-by: KENGO SHIMIZU <tealgreen0503@users.noreply.huggingface.co>

tokenization_deberta_v2_jumanpp.py CHANGED
@@ -28,9 +28,9 @@ class JumanppTokenizer:
28
  self.jumanpp = rhoknp.Jumanpp()
29
 
30
  def tokenize(self, text: str) -> str:
31
- morphemes = self.jumanpp.apply_to_sentence(text).morphemes
32
- if not morphemes:
 
33
  doc = self.rhoknp.Document.from_raw_text(text)
34
  morphemes = self.jumanpp.apply_to_document(doc).morphemes
35
  return " ".join([morpheme.surf for morpheme in morphemes])
36
-
 
28
  self.jumanpp = rhoknp.Jumanpp()
29
 
30
  def tokenize(self, text: str) -> str:
31
+ try:
32
+ morphemes = self.jumanpp.apply_to_sentence(text).morphemes
33
+ except RuntimeError:
34
  doc = self.rhoknp.Document.from_raw_text(text)
35
  morphemes = self.jumanpp.apply_to_document(doc).morphemes
36
  return " ".join([morpheme.surf for morpheme in morphemes])
 
tokenization_deberta_v2_jumanpp_fast.py CHANGED
@@ -62,8 +62,9 @@ class JumanppPreTokenizer:
62
  pretok.split(self.jumanpp_split)
63
 
64
  def jumanpp_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
65
- offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_sentence(str(normalized_string)).morphemes]
66
- if not offsets:
 
67
  doc = self.rhoknp.Document.from_raw_text(str(normalized_string))
68
  offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_document(doc).morphemes]
69
  return [normalized_string[offset[0]:offset[1]] for offset in offsets]
 
62
  pretok.split(self.jumanpp_split)
63
 
64
  def jumanpp_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
65
+ try:
66
+ offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_sentence(str(normalized_string)).morphemes]
67
+ except RuntimeError:
68
  doc = self.rhoknp.Document.from_raw_text(str(normalized_string))
69
  offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_document(doc).morphemes]
70
  return [normalized_string[offset[0]:offset[1]] for offset in offsets]