# File size: 848 Bytes
# 5806e12
import re

from segmentation import segment_batchalign
# Sample transcript rows used as the test input. The script below labels the
# last word of each row as a ground-truth boundary.
chunk = [
    "once a horse met elephant and then they saw a ball",
    "in a pool and then the horse tried to swim",
    "and get the ball they might be the same",
]
# Compiled once at import time: matches every character that is neither a
# word character (letter, digit, underscore) nor whitespace.
_NON_WORD_RE = re.compile(r"[^\w\s]")


def clean_text(text: str) -> str:
    """Return *text* lower-cased with punctuation removed.

    Every character that is not a word character or whitespace is dropped,
    then leading and trailing whitespace is stripped. Interior whitespace is
    left untouched.

    >>> clean_text("  Hello, World! ")
    'hello world'
    """
    return _NON_WORD_RE.sub("", text.lower()).strip()
# Flatten the rows into one word stream plus a parallel ground-truth label per
# word: 1 marks the last word of a row, 0 marks every other word.
word_sequence = []
gt_label_sequence = []
for row in chunk:
    cleaned = clean_text(row)
    words = cleaned.split()
    if not words:
        # A row that is empty after cleaning has no word to carry a boundary
        # label; emitting one would leave the labels longer than the words.
        continue
    word_sequence.extend(words)
    gt_label_sequence.extend([0] * (len(words) - 1) + [1])

# The segmenter receives the cleaned words joined by single spaces.
input_text = " ".join(word_sequence)
predicted_labels = segment_batchalign(input_text)

print("Word sequence:", input_text)
print("GT:", " ".join(map(str, gt_label_sequence)))
print("Pred:", " ".join(map(str, predicted_labels)))
# Compare lengths to spot a mismatch between prediction and ground truth.
print("Length match:", len(gt_label_sequence) == len(predicted_labels))