Spaces:
Runtime error
Runtime error
nguyenvulebinh
committed on
Commit
•
4020980
1
Parent(s):
065a297
add chunk merging
Browse files
utils.py
CHANGED
@@ -139,20 +139,23 @@ def merge_two_chunk(chunk_1, chunk_2, overlap, debug=False):
|
|
139 |
idx_list_1.extend([idx] * len(extract_phrase_word(phrase)))
|
140 |
for idx, phrase in enumerate(list_2):
|
141 |
idx_list_2.extend([idx] * len(extract_phrase_word(phrase)))
|
|
|
142 |
for idx, (idx_1, idx_2) in enumerate(zip(idx_list_1, idx_list_2)):
|
143 |
if list_1[idx_1].startswith('<delete>') or list_2[idx_2].startswith('<delete>'):
|
144 |
continue
|
145 |
-
elif is_equal(list_1[idx_1], list_2[idx_2])
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
|
|
|
|
152 |
else:
|
153 |
combine_phrases.append(list_raw[idx])
|
154 |
mark_term_complete.extend(['1_{}'.format(idx_1), '2_{}'.format(idx_2)])
|
155 |
-
|
156 |
return combine_phrases
|
157 |
|
158 |
remain_1, compete_1 = extract_compete_region(chunk_1, is_head=True)
|
|
|
139 |
idx_list_1.extend([idx] * len(extract_phrase_word(phrase)))
|
140 |
for idx, phrase in enumerate(list_2):
|
141 |
idx_list_2.extend([idx] * len(extract_phrase_word(phrase)))
|
142 |
+
# print(idx_list_1, idx_list_2)
|
143 |
for idx, (idx_1, idx_2) in enumerate(zip(idx_list_1, idx_list_2)):
|
144 |
if list_1[idx_1].startswith('<delete>') or list_2[idx_2].startswith('<delete>'):
|
145 |
continue
|
146 |
+
elif is_equal(list_1[idx_1], list_2[idx_2]):
|
147 |
+
# print(list_1[idx_1])
|
148 |
+
if '1_{}'.format(idx_1) not in mark_term_complete and '2_{}'.format(idx_2) not in mark_term_complete:
|
149 |
+
if idx <= overlap//2:
|
150 |
+
combine_phrases.append(list_1[idx_1])
|
151 |
+
mark_term_complete.append('1_{}'.format(idx_1))
|
152 |
+
else:
|
153 |
+
combine_phrases.append(list_2[idx_2])
|
154 |
+
mark_term_complete.append('2_{}'.format(idx_2))
|
155 |
else:
|
156 |
combine_phrases.append(list_raw[idx])
|
157 |
mark_term_complete.extend(['1_{}'.format(idx_1), '2_{}'.format(idx_2)])
|
158 |
+
# print(mark_term_complete)
|
159 |
return combine_phrases
|
160 |
|
161 |
remain_1, compete_1 = extract_compete_region(chunk_1, is_head=True)
|