nguyenvulebinh committed on
Commit
4020980
1 Parent(s): 065a297

add chunk merging

Browse files
Files changed (1) hide show
  1. utils.py +11 -8
utils.py CHANGED
@@ -139,20 +139,23 @@ def merge_two_chunk(chunk_1, chunk_2, overlap, debug=False):
139
  idx_list_1.extend([idx] * len(extract_phrase_word(phrase)))
140
  for idx, phrase in enumerate(list_2):
141
  idx_list_2.extend([idx] * len(extract_phrase_word(phrase)))
 
142
  for idx, (idx_1, idx_2) in enumerate(zip(idx_list_1, idx_list_2)):
143
  if list_1[idx_1].startswith('<delete>') or list_2[idx_2].startswith('<delete>'):
144
  continue
145
- elif is_equal(list_1[idx_1], list_2[idx_2]) and '1_{}'.format(idx_1) not in mark_term_complete and '2_{}'.format(idx_2) not in mark_term_complete:
146
- if idx <= overlap//2:
147
- combine_phrases.append(list_1[idx_1])
148
- mark_term_complete.append('1_{}'.format(idx_1))
149
- else:
150
- combine_phrases.append(list_2[idx_2])
151
- mark_term_complete.append('2_{}'.format(idx_2))
 
 
152
  else:
153
  combine_phrases.append(list_raw[idx])
154
  mark_term_complete.extend(['1_{}'.format(idx_1), '2_{}'.format(idx_2)])
155
-
156
  return combine_phrases
157
 
158
  remain_1, compete_1 = extract_compete_region(chunk_1, is_head=True)
 
139
  idx_list_1.extend([idx] * len(extract_phrase_word(phrase)))
140
  for idx, phrase in enumerate(list_2):
141
  idx_list_2.extend([idx] * len(extract_phrase_word(phrase)))
142
+ # print(idx_list_1, idx_list_2)
143
  for idx, (idx_1, idx_2) in enumerate(zip(idx_list_1, idx_list_2)):
144
  if list_1[idx_1].startswith('<delete>') or list_2[idx_2].startswith('<delete>'):
145
  continue
146
+ elif is_equal(list_1[idx_1], list_2[idx_2]):
147
+ # print(list_1[idx_1])
148
+ if '1_{}'.format(idx_1) not in mark_term_complete and '2_{}'.format(idx_2) not in mark_term_complete:
149
+ if idx <= overlap//2:
150
+ combine_phrases.append(list_1[idx_1])
151
+ mark_term_complete.append('1_{}'.format(idx_1))
152
+ else:
153
+ combine_phrases.append(list_2[idx_2])
154
+ mark_term_complete.append('2_{}'.format(idx_2))
155
  else:
156
  combine_phrases.append(list_raw[idx])
157
  mark_term_complete.extend(['1_{}'.format(idx_1), '2_{}'.format(idx_2)])
158
+ # print(mark_term_complete)
159
  return combine_phrases
160
 
161
  remain_1, compete_1 = extract_compete_region(chunk_1, is_head=True)