gabriel-p committed on
Commit
9ca4613
1 Parent(s): b0605aa

Update TrueCaser

Browse files
Files changed (1) hide show
  1. TrueCaser.py +8 -1
TrueCaser.py CHANGED
@@ -1,5 +1,6 @@
1
  import math
2
  import pickle
 
3
  import string
4
 
5
  from nltk.tokenize import word_tokenize
@@ -79,10 +80,16 @@ class TrueCaser(object):
79
  def first_token_case(raw):
80
  return raw.capitalize()
81
 
 
 
 
 
82
  def get_true_case(self, sentence, out_of_vocabulary_token_option="title"):
83
  tokens = word_tokenize(sentence)
84
  tokens_true_case = self.get_true_case_from_tokens(tokens, out_of_vocabulary_token_option)
85
- return self.detknzr.detokenize(tokens_true_case)
 
 
86
 
87
  def get_true_case_from_tokens(self, tokens, out_of_vocabulary_token_option="title"):
88
  tokens_true_case = []
 
1
  import math
2
  import pickle
3
+ import re
4
  import string
5
 
6
  from nltk.tokenize import word_tokenize
 
80
  def first_token_case(raw):
81
  return raw.capitalize()
82
 
83
+ @staticmethod
84
+ def upper_replacement(match):
85
+ return '. ' + match.group(0)[-1].upper()
86
+
87
  def get_true_case(self, sentence, out_of_vocabulary_token_option="title"):
88
  tokens = word_tokenize(sentence)
89
  tokens_true_case = self.get_true_case_from_tokens(tokens, out_of_vocabulary_token_option)
90
+ text = self.detknzr.detokenize(tokens_true_case)
91
+ text = re.sub(r' \. .', self.upper_replacement, text)
92
+ return text
93
 
94
  def get_true_case_from_tokens(self, tokens, out_of_vocabulary_token_option="title"):
95
  tokens_true_case = []