import argparse
import os
import pickle as pkl
from typing import Iterable, List, Sequence, Tuple

import yaml
from pyarabic.araby import tokenize, strip_tashkeel, strip_tatweel
from tqdm import tqdm


def export(path: str, text: Iterable[str]) -> None:
    """Write the strings in *text* to *path* as UTF-8, one per line.

    Items are joined with a newline; no trailing newline is written after
    the last item.
    """
    with open(path, 'w', encoding="utf-8") as fout:
        fout.write('\n'.join(text))


def segment(
    lines: Sequence[str],
    stride: int,
    window_sz: int,
    min_window_sz: int,
) -> Tuple[List[str], List[Tuple[int, int, int, int]]]:
    """Split each line into windows of tokens and map characters to segments.

    Each line has tatweel removed, is stripped of surrounding whitespace and
    tokenized. A window of ``window_sz`` tokens then slides over the tokens,
    and every window is emitted as one space-joined segment.

    Args:
        lines: Input sentences, one per item. Lines that yield no tokens are
            skipped (but still consume a sentence index).
        stride: Number of tokens to advance between windows. It is capped at
            ``window_sz`` so windows never skip tokens. ``-1`` emits only the
            first window of every line.
        window_sz: Number of tokens per window. ``-1`` uses the whole line as
            a single segment.
        min_window_sz: Windows after the first one that contain fewer tokens
            than this end the processing of the line. The first window of a
            line is always emitted.

    Returns:
        A ``(segments, mapping)`` pair. ``segments`` holds the segment
        strings in emission order. ``mapping`` holds one
        ``(sentence_index, segment_index, token_index, char_index)`` tuple
        per character of every de-diacritized segment token, where
        ``segment_index`` is the position in ``segments``, ``token_index``
        is the token position inside the segment and ``char_index`` is the
        character position in the de-diacritized, space-joined token text
        of the line.
    """
    segments: List[str] = []
    mapping: List[Tuple[int, int, int, int]] = []

    # Loop-invariant step: never advance by more than one window.
    step = window_sz if stride >= window_sz else stride
    # A non-positive step can never move the window forward (this covers
    # window_sz == -1, stride == -1 and stride == 0). Emitting only the first
    # segment in that case avoids a non-terminating loop.
    single_segment = stride == -1 or step <= 0

    real_seg_idx = 0  # running index into `segments` across all lines
    for sent_idx, line in tqdm(enumerate(lines), total=len(lines)):
        line = strip_tatweel(line).strip()
        tokens = tokenize(line)
        if not tokens:
            continue
        # Drop a trailing newline token if the tokenizer produced one.
        if tokens[-1] == '\n':
            tokens = tokens[:-1]

        seg_idx, idx = 0, 0
        while idx < len(tokens):
            window = tokens if window_sz == -1 else tokens[idx:idx + window_sz]
            # A too-short trailing window is dropped, but the first window
            # of a line is always kept.
            if seg_idx != 0 and len(window) < min_window_sz:
                break

            segment_text = ' '.join(window)
            segments.append(segment_text)

            # Offset of this window inside the de-diacritized line: length of
            # everything before it, plus the separating space when the
            # window is not the first one.
            char_offset = len(strip_tashkeel(' '.join(tokens[:idx])))
            if seg_idx > 0:
                char_offset += 1

            seg_tokens = tokenize(strip_tashkeel(segment_text))
            pos = char_offset
            for st_idx, st in enumerate(seg_tokens):
                mapping.extend(
                    (sent_idx, real_seg_idx, st_idx, pos + k)
                    for k in range(len(st))
                )
                # Skip the token itself plus the space that follows it.
                pos += len(st) + 1

            real_seg_idx += 1
            seg_idx += 1
            if single_segment:
                break
            idx += step

    return segments, mapping


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Sentence Breaker')
    parser.add_argument('-c', '--config', type=str,
                        default="config.yaml", help='Run Configs')
    parser.add_argument('-d', '--data_dir', type=str,
                        default=None, help='Override for data path')
    args = parser.parse_args()

    # NOTE(review): FullLoader can construct more than plain data types;
    # consider yaml.safe_load if the config may come from an untrusted source.
    with open(args.config, 'r', encoding="utf-8") as file:
        config = yaml.load(file, Loader=yaml.FullLoader)

    BASE_PATH = args.data_dir or config["paths"].get("base")
    if BASE_PATH is None:
        # Fail early with a clear message instead of a TypeError from
        # os.path.join further down.
        parser.error("no data directory: pass --data_dir or set "
                     "paths.base in the config")

    stride = config["segment"]["stride"]
    window = config["segment"]["window"]
    min_window = config["segment"]["min-window"]
    export_map = config["segment"]["export-map"]

    for fpath in tqdm(config["segment"]["files"]):
        # splitext handles extensions of any length (not only 3 characters).
        stem = os.path.splitext(fpath)[0]
        FILE_PATH = os.path.join(BASE_PATH, fpath)
        SAVE_PATH = os.path.join(BASE_PATH, f"{stem}-{stride}-{window}.txt")
        MAP_PATH = os.path.join(BASE_PATH, f"{stem}-{stride}-{window}.map")

        with open(FILE_PATH, 'r', encoding="utf-8") as fin:
            lines = fin.readlines()

        segments, mapping = segment(lines, stride, window, min_window)
        export(SAVE_PATH, segments)

        if not export_map:
            continue

        # One "sent, seg, word, char" row per mapped character.
        with open(MAP_PATH, 'w', encoding="utf-8") as fout:
            fout.writelines(
                f"{sent_idx}, {seg_idx}, {word_idx}, {char_idx}\n"
                for sent_idx, seg_idx, word_idx, char_idx in mapping
            )