import json
import os.path
from typing import Iterable

data_folder = "data/czech-squad-v3"
shorten_to_sentences = 4
out_json = "data/czech_squad_%s-sents.json" % shorten_to_sentences


def read_first_entries(fpath: str, sep: str = "\t"):
    """Read the first column of a vertical (one token per line) file."""
    line_collector = []
    with open(fpath) as f:
        for line in f.readlines():
            entry = line.split(sep)[0]
            line_collector.append(entry)
    return line_collector


def collect_tokens(s: Iterable[str]) -> str:
    """Join tokens into plain text, skipping SGML-like tags and gluing
    together tokens that follow an empty (glue) line."""
    out_str = ""
    last_g = False
    for i, token in enumerate(s):
        if token is None:
            raise ValueError("Token on position %s is None" % i)
        token = token.strip()
        if token == "":
            last_g = True
            continue
        elif token.startswith("<") and token.endswith(">"):
            continue
        else:
            if last_g:
                out_str += token
                last_g = False
            else:
                out_str += " %s" % token
    return out_str.strip()


out_dict = {}

for i, folder in enumerate(os.listdir(data_folder)):
    try:
        question_f = os.path.join(data_folder, folder, "01question.vert")
        question_list = read_first_entries(question_f)
        question_str = collect_tokens(question_list)

        # reformulated answer selection
        # answer_f = os.path.join(data_folder, folder, "02answer.vert")
        # answer_list = read_first_entries(answer_f)
        # # answer_df = pd.read_csv(answer_f, sep="\t", index_col=False)
        # answer_str = collect_tokens(answer_list)

        answer_f = os.path.join(data_folder, folder, "09answer_extraction.vert")
        answer_list = read_first_entries(answer_f)
        # answer_df = pd.read_csv(answer_f, sep="\t", index_col=False)
        answer_str = collect_tokens(answer_list)
        # keep only the first of possibly multiple extracted answers
        answer_str = answer_str.split(" # ")[0]

        answer_type_f = os.path.join(data_folder, folder, "05metadata.txt")
        answer_type = next(t for t in read_first_entries(answer_type_f) if "a_type" in t)
        answer_type_cleaned = answer_type.replace("", "").replace("", "").strip()

        text_f = os.path.join(data_folder, folder, "03text.vert")
        text_list = read_first_entries(text_f)
        # text_df = pd.read_csv(text_f, sep="\t", engine="python", error_bad_lines=False)
        text_str = collect_tokens(text_list)

        if answer_str.lower() not in text_str.lower():
            print("Skipping answer %s: not present in context." % answer_str)
            continue
        if answer_str.endswith("."):
            # to match in multi-sentence matching
            answer_str = answer_str[:-1]

        # maybe shorten to n-surrounding sentences
        if shorten_to_sentences is not None:
            sentences = text_str.split(". ")
            # find the first sentence where all segments of the (possibly multi-sentence) answer match
            answer_sentence_idx = next(i for i, _ in enumerate(sentences)
                                       if all(i + j < len(sentences)
                                              and a_segment.lower() in sentences[i + j].lower()
                                              for j, a_segment in enumerate(answer_str.split(". "))))
            shortened_context = sentences[max(0, answer_sentence_idx - shorten_to_sentences):
                                          min(len(sentences), answer_sentence_idx + shorten_to_sentences)]
            text_str = ". ".join(shortened_context) + ". "

        # TODO: squad-like format: https://huggingface.co/datasets/squad
        out_dict[i] = {"id": folder.split("/")[-1],
                       "answer_type": answer_type_cleaned,
                       "context": text_str,
                       "question": question_str,
                       "answers": {"text": [answer_str]}
                       }
    except NotADirectoryError as e:
        print("Skipping %s: %s: %s" % (i, folder, e))

with open(out_json, "w") as out_f:
    out_f.write(json.dumps(out_dict))

print("Done. Output json exported to %s" % out_json)
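
# Usage sketch (assumption, not part of the conversion itself): the exported file
# maps example indices to SQuAD-like records, so it can be inspected e.g. like this:
#
#   with open(out_json) as f:
#       examples = json.load(f)
#   sample = next(iter(examples.values()))
#   print(sample["question"], "->", sample["answers"]["text"][0])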