| from segmentation import segment_batchalign | |
# Sample transcript used to exercise the segmenter: each entry is one row of
# text, and the script treats the last word of every row as a segment boundary.
chunk = [
    "once a horse met elephant and then they saw a ball",
    "in a pool and then the horse tried to swim",
    "and get the ball they might be the same",
]
def clean_text(text):
    """Return *text* lowercased, with punctuation removed and ends trimmed.

    Every character that is neither a word character (``\\w``, which includes
    the underscore) nor whitespace is deleted; interior whitespace is kept
    as-is and only leading/trailing whitespace is stripped.
    """
    import re

    lowered = text.lower()
    without_punctuation = re.sub(r"[^\w\s]", "", lowered)
    return without_punctuation.strip()
# Flatten the sample rows into a single word stream and build a parallel
# ground-truth label per word: 1 marks the last word of a row (a segment
# boundary), 0 marks every other word.
word_sequence = []
gt_label_sequence = []
for row in chunk:
    words = clean_text(row).split()
    if not words:
        # A row that is empty after cleaning contributes no words. Skip it so
        # no stray boundary label is appended: ``[0] * -1`` is ``[]``, so the
        # unguarded expression below would still add a lone ``1`` and throw
        # the labels out of alignment with the words.
        continue
    word_sequence.extend(words)
    gt_label_sequence.extend([0] * (len(words) - 1) + [1])

# The segmenter receives the words as one space-joined string.
input_text = " ".join(word_sequence)
# NOTE(review): presumably returns one label per input word; the
# "Length match" line below reports whether that actually holds.
predicted_labels = segment_batchalign(input_text)

print("Word sequence:", input_text)
print("GT:", " ".join(map(str, gt_label_sequence)))
print("Pred:", " ".join(map(str, predicted_labels)))
print("Length match:", len(gt_label_sequence) == len(predicted_labels))