Epoching committed on
Commit
961cf08
1 Parent(s): 6318714

Update DiT_Extractor/sentence_extractor.py

Browse files
DiT_Extractor/sentence_extractor.py CHANGED
@@ -95,6 +95,11 @@ def sentence_extract(document):
95
  for sentence in sentences:
96
  t += len(sentence)
97
  if t <= max_tokens:
 
 
 
 
 
98
  word_section += sentence
99
  else:
100
  word_sections.append(word_section)
 
95
  for sentence in sentences:
96
  t += len(sentence)
97
  if t <= max_tokens:
98
+ # update character indices from concatenating sentences
99
+ if len(word_section) > 0:
100
+ last_word_obj = word_section[-1]
101
+ _, (_, char_idx_offset), _ = last_word_obj
102
+ sentence = [(w, (sc+char_idx_offset+1, ec+char_idx_offset+1), bbox) for w, (sc, ec), bbox in sentence]
103
  word_section += sentence
104
  else:
105
  word_sections.append(word_section)