|
import csv |
|
import json |
|
import os.path |
|
from typing import Iterable |
|
|
|
import pandas as pd |
|
|
|
# Root folder containing one sub-directory per question/answer item.
data_folder = "data/czech-squad-v3"

# Context window size: keep this many sentences before/after the sentence
# containing the answer.  Set to None to keep the full context.
shorten_to_sentences = 3

# Output path; the window size is embedded in the file name.
out_json = "data/czech_squad_%s-sents.json" % shorten_to_sentences
|
|
|
|
|
def read_first_entries(fpath: str, sep: str = "\t"):
    """Return the first ``sep``-separated field of every line in *fpath*.

    Lines that do not contain the separator are returned whole (including
    their trailing newline), matching ``str.split`` semantics.

    :param fpath: path of the (vertical-format) file to read
    :param sep: column separator, tab by default
    :return: list of first-column entries, one per line
    """
    # Iterate the file object lazily instead of materializing every line
    # via readlines(); explicit UTF-8 so Czech text decodes identically on
    # every platform, regardless of the locale's default encoding.
    with open(fpath, encoding="utf-8") as f:
        return [line.split(sep)[0] for line in f]
|
|
|
|
|
def collect_tokens(s: Iterable[str]) -> str:
    """Detokenize a sequence of vertical-format tokens into plain text.

    Tokens are joined with single spaces.  The special ``<g/>`` ("glue")
    tag suppresses the space before the *next* regular token; any other
    ``<...>`` tag is dropped without affecting the glue state.

    :param s: iterable of token strings (one token or tag per element)
    :return: detokenized text with no leading/trailing whitespace
    :raises ValueError: if any element of *s* is None
    """
    pieces = []
    glue_next = False
    for i, token in enumerate(s):
        # Bug fix: this guard must run BEFORE .strip() — previously a None
        # token crashed with AttributeError and the ValueError was dead code.
        if token is None:
            raise ValueError("Token on position %s is None" % i)
        token = token.strip()
        if token == "<g/>":
            # Glue tag: concatenate the next regular token without a space.
            glue_next = True
        elif token.startswith("<") and token.endswith(">"):
            # Structural tag (<s>, </s>, ...): skip, keep glue state intact.
            continue
        else:
            pieces.append(token if glue_next else " %s" % token)
            glue_next = False
    # join + strip instead of repeated += (avoids quadratic concatenation).
    return "".join(pieces).strip()
|
|
|
|
|
# Maps a running integer index -> one SQuAD-like QA record.
out_dict = {}

# Every sub-directory of data_folder is one QA item stored as several
# vertical-format files; plain files in data_folder are skipped via the
# NotADirectoryError handler below.
for i, folder in enumerate(os.listdir(data_folder)):
    try:
        # Question text: first column of the question .vert file, detokenized.
        question_f = os.path.join(data_folder, folder, "01question.vert")
        question_list = read_first_entries(question_f)
        question_str = collect_tokens(question_list)

        answer_f = os.path.join(data_folder, folder, "09answer_extraction.vert")
        answer_list = read_first_entries(answer_f)

        answer_str = collect_tokens(answer_list)
        # Keep only the first alternative when several answers are
        # separated by " # ".
        answer_str = answer_str.split(" # ")[0]

        # Answer type is wrapped as "<a_type>...</a_type>" in the metadata file.
        # NOTE(review): next() raises an uncaught StopIteration if no line
        # contains "a_type" — confirm every item has this tag.
        answer_type_f = os.path.join(data_folder, folder, "05metadata.txt")
        answer_type = next(t for t in read_first_entries(answer_type_f) if "a_type" in t)
        answer_type_cleaned = answer_type.replace("<a_type>", "").replace("</a_type>", "").strip()

        text_f = os.path.join(data_folder, folder, "03text.vert")
        text_list = read_first_entries(text_f)

        text_str = collect_tokens(text_list)

        # Extractive-QA constraint: drop items whose answer does not occur
        # verbatim (case-insensitively) in the context.
        if answer_str.lower() not in text_str.lower():
            print("Skipping answer %s: not present in context." % answer_str)
            continue

        # Strip a trailing period so the answer also matches mid-sentence
        # occurrences after the sentence split below.
        if answer_str.endswith("."):
            answer_str = answer_str[:-1]

        # Optionally shrink the context to a window of sentences around the
        # sentence(s) containing the answer.
        if shorten_to_sentences is not None:
            # Naive sentence split; assumes ". " only terminates sentences.
            sentences = text_str.split(". ")
            # First index i such that every ". "-separated segment of the
            # answer appears in the corresponding sentence i+j.
            # NOTE(review): sentences[i+j] can IndexError near the end of the
            # text, and next() can raise StopIteration if no window matches —
            # neither exception is caught by the handler below.
            answer_sentence_idx = next(i for i, _ in enumerate(sentences)
                                       if all(a_segment.lower() in sentences[i+j].lower()
                                              for j, a_segment in enumerate(answer_str.split(". "))))
            # Slice is end-exclusive, so the answer sentence itself is kept
            # together with up to shorten_to_sentences neighbours per side.
            shortened_context = sentences[max(0, answer_sentence_idx - shorten_to_sentences):
                                          min(len(sentences), answer_sentence_idx + shorten_to_sentences)]

            # Re-join and restore the final sentence terminator.
            text_str = ". ".join(shortened_context) + ". "

        # SQuAD-like record; "answers" keeps a list of texts for schema
        # compatibility even though only one answer is stored.
        out_dict[i] = {"id": folder.split("/")[-1],
                       "answer_type": answer_type_cleaned,
                       "context": text_str,
                       "question": question_str,
                       "answers": {"text": [answer_str]}
                       }

    except NotADirectoryError as e:
        # Raised by open() when `folder` is a plain file rather than an item
        # directory — skip such entries.
        print("Skipping %s: %s: %s" % (i, folder, e))

with open(out_json, "w") as out_f:
    # NOTE(review): json.dumps defaults to ensure_ascii=True, so Czech
    # characters are written as \uXXXX escapes — valid JSON, but consider
    # ensure_ascii=False for human-readable output.
    out_f.write(json.dumps(out_dict))

print("Done. Output json exported to %s" % out_json)
|
|