# Spaces: Runtime error
# (Hosting-page status banner captured along with this file; it is not part of the program.)
import argparse | |
import yaml | |
import os | |
import pickle as pkl | |
from tqdm import tqdm | |
from pyarabic.araby import tokenize, strip_tashkeel, strip_tatweel | |
def export(path, text):
    """Write the items of ``text`` to ``path`` as UTF-8, one item per line.

    Items are joined with a newline; no trailing newline is written.
    The file is created or truncated.
    """
    with open(path, mode='w', encoding="utf-8") as sink:
        payload = '\n'.join(text)
        sink.write(payload)
def segment(lines, stride, window_sz, min_window_sz):
    """Cut each line into token windows and map their characters back.

    Every line has tatweel removed, is stripped of surrounding whitespace and
    tokenized. Lines that yield no tokens are skipped. The remaining tokens
    are cut into windows of at most ``window_sz`` tokens whose start positions
    are ``min(stride, window_sz)`` tokens apart.

    Args:
        lines: Sized sequence of text lines (its length feeds the progress
            bar).
        stride: Number of tokens to advance between windows. ``-1`` keeps
            only the first window of each line.
        window_sz: Maximum number of tokens per window. ``-1`` turns the
            whole line into a single window.
        min_window_sz: A window that is not the first of its line and has
            fewer tokens than this ends the line without being emitted. The
            first window of a line is always emitted.

    Returns:
        A ``(segments, mapping)`` tuple. ``segments`` is the list of
        space-joined windows in emission order. ``mapping`` holds one
        ``(sent_idx, seg_idx, word_idx, char_idx)`` tuple per character of the
        tashkeel-stripped tokens of every segment: the index of the line in
        ``lines``, the running index of the segment in ``segments``, the index
        of the token inside the segment, and the character position obtained
        by adding the tashkeel-stripped length of the tokens preceding the
        window to the position inside the window.

    Raises:
        ValueError: If windows have to be stepped (neither ``stride`` nor
            ``window_sz`` is ``-1``) and either value is not positive. Such
            values never advance the window start and used to loop forever.
    """
    whole_line = window_sz == -1
    # With either sentinel only one window per line is produced, so no step
    # is needed. In particular the whole-line mode must not step by
    # ``window_sz`` (-1), which would move the start index backwards forever.
    single_window = whole_line or stride == -1
    if not single_window and (window_sz <= 0 or stride <= 0):
        raise ValueError(
            "stride and window_sz must be positive or -1, "
            f"got stride={stride}, window_sz={window_sz}"
        )
    step = None if single_window else min(stride, window_sz)

    segments, mapping = [], []
    real_seg_idx = 0  # index of the next segment across all lines
    for sent_idx, line in tqdm(enumerate(lines), total=len(lines)):
        line = strip_tatweel(line).strip()
        tokens = tokenize(line)
        if not tokens:
            continue
        if tokens[-1] == '\n':
            tokens = tokens[:-1]

        seg_idx, idx = 0, 0  # per-line segment counter, window start token
        while idx < len(tokens):
            window = tokens if whole_line else tokens[idx:idx + window_sz]
            if len(window) < min_window_sz and seg_idx != 0:
                break
            seg_text = ' '.join(window)
            segments.append(seg_text)

            # Length of everything before the window, without diacritics;
            # later windows also skip the space that separates them from the
            # preceding tokens.
            char_offset = len(strip_tashkeel(' '.join(tokens[:idx])))
            if seg_idx > 0:
                char_offset += 1

            j = 0  # character position inside the stripped segment
            for st_idx, st in enumerate(tokenize(strip_tashkeel(seg_text))):
                for _ in range(len(st)):
                    mapping.append(
                        (sent_idx, real_seg_idx, st_idx, j + char_offset)
                    )
                    j += 1
                j += 1  # account for the space between tokens

            real_seg_idx += 1
            seg_idx += 1
            if single_window:
                break
            idx += step
    return segments, mapping
if __name__ == "__main__":
    # Command-line entry point: read the run configuration, segment every
    # configured file and write the segments (and optionally the character
    # mapping) next to the input file.
    parser = argparse.ArgumentParser(description='Sentence Breaker')
    parser.add_argument('-c', '--config', type=str,
                        default="config.yaml", help='Run Configs')
    parser.add_argument('-d', '--data_dir', type=str,
                        default=None, help='Override for data path')
    args = parser.parse_args()

    with open(args.config, 'r', encoding="utf-8") as file:
        # NOTE(review): FullLoader can build more than plain data types. If
        # the config can ever come from an untrusted source, switch to
        # yaml.safe_load.
        config = yaml.load(file, Loader=yaml.FullLoader)

    # The command-line override wins over the configured base directory.
    BASE_PATH = args.data_dir or config["paths"].get("base")
    if not BASE_PATH:
        # Fail with a usage message instead of a TypeError in os.path.join.
        parser.error(
            "no data directory: pass --data_dir or set paths.base in the config"
        )

    stride = config["segment"]["stride"]
    window = config["segment"]["window"]
    min_window = config["segment"]["min-window"]
    export_map = config["segment"]["export-map"]

    for fpath in tqdm(config["segment"]["files"]):
        # splitext drops whatever extension the file has (or none), where a
        # fixed four-character cut would mangle names not ending in ".xyz".
        stem, _ = os.path.splitext(fpath)
        FILE_PATH = os.path.join(BASE_PATH, fpath)
        SAVE_PATH = os.path.join(BASE_PATH, f"{stem}-{stride}-{window}.txt")
        MAP_PATH = os.path.join(BASE_PATH, f"{stem}-{stride}-{window}.map")

        with open(FILE_PATH, 'r', encoding="utf-8") as fin:
            lines = fin.readlines()

        segments, mapping = segment(lines, stride, window, min_window)
        export(SAVE_PATH, segments)

        if not export_map:
            continue

        with open(MAP_PATH, 'w', encoding="utf-8") as fout:
            # One "sentence, segment, word, char" record per mapped character.
            fout.writelines(
                f"{sent_idx}, {seg_idx}, {word_idx}, {char_idx}\n"
                for sent_idx, seg_idx, word_idx, char_idx in mapping
            )