nesv042 committed on
Commit
3d4a29a
1 Parent(s): 072cc83

Adding rhoknp package reference

Browse files
tokenization_deberta_v2_jumanpp_fast.py CHANGED
@@ -55,6 +55,7 @@ class JumanppPreTokenizer:
55
  "You need to install rhoknp to use JumanppPreTokenizer. "
56
  "See https://github.com/ku-nlp/rhoknp for installation."
57
  )
 
58
  self.jumanpp = rhoknp.Jumanpp()
59
 
60
  def pre_tokenize(self, pretok: PreTokenizedString):
@@ -63,6 +64,6 @@ class JumanppPreTokenizer:
63
  def jumanpp_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
64
  offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_sentence(str(normalized_string)).morphemes]
65
  if not offsets:
66
- doc = rhoknp.Document.from_raw_text(str(normalized_string))
67
  offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_document(doc).morphemes]
68
  return [normalized_string[offset[0]:offset[1]] for offset in offsets]
 
55
  "You need to install rhoknp to use JumanppPreTokenizer. "
56
  "See https://github.com/ku-nlp/rhoknp for installation."
57
  )
58
+ self.rhoknp = rhoknp
59
  self.jumanpp = rhoknp.Jumanpp()
60
 
61
  def pre_tokenize(self, pretok: PreTokenizedString):
 
64
  def jumanpp_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
65
  offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_sentence(str(normalized_string)).morphemes]
66
  if not offsets:
67
+ doc = self.rhoknp.Document.from_raw_text(str(normalized_string))
68
  offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_document(doc).morphemes]
69
  return [normalized_string[offset[0]:offset[1]] for offset in offsets]