File size: 3,915 Bytes
b4f8d67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import json
import os.path
from typing import Iterable

# Root folder containing one sub-folder per Czech SQuAD v3 record.
data_folder = "data/czech-squad-v3"

# Number of surrounding sentences to keep around the answer sentence when
# shortening contexts; set to None to keep the full text.
shorten_to_sentences = 4

# Output path encodes the window size for traceability.
out_json = "data/czech_squad_%s-sents.json" % shorten_to_sentences


def read_first_entries(fpath: str, sep: str = "\t") -> list:
    """Return the first `sep`-separated column of every line in `fpath`.

    Iterates the file lazily instead of materializing all lines with
    readlines(), which the original did needlessly. Entries on lines that
    do not contain `sep` keep their trailing newline (callers strip each
    token themselves), matching the original behavior exactly.

    :param fpath: path to a vertical-format (one token per line) text file
    :param sep: column separator, tab by default
    :return: list of first-column strings, one per line
    """
    with open(fpath) as f:
        return [line.split(sep)[0] for line in f]


def collect_tokens(s: Iterable[str]) -> str:
    """Join vertical-format tokens into a detokenized string.

    Markup tokens of the form "<...>" are dropped; the special glue token
    "<g/>" suppresses the space before the following token (e.g. before
    punctuation). All other tokens are joined with single spaces.

    :param s: iterable of token strings
    :return: detokenized text with no leading/trailing whitespace
    :raises ValueError: if any token is None
    """
    parts = []
    glue_next = False
    for i, token in enumerate(s):
        # Bug fix: check for None BEFORE calling .strip(). The original
        # checked afterwards, so the guard could never fire -- a None token
        # raised AttributeError on .strip() first.
        if token is None:
            raise ValueError("Token on position %s is None" % i)
        token = token.strip()
        if token == "<g/>":
            glue_next = True
        elif token.startswith("<") and token.endswith(">"):
            # Other structural/markup tags are ignored entirely.
            continue
        elif glue_next:
            # Glued token: no preceding space.
            parts.append(token)
            glue_next = False
        else:
            parts.append(" %s" % token)
    # Build via join instead of repeated += (avoids quadratic concatenation).
    return "".join(parts).strip()


# Collected SQuAD-style records, keyed by the enumeration index of the source
# folder (skipped entries leave gaps, so keys may be sparse).
out_dict = {}

for i, folder in enumerate(os.listdir(data_folder)):
    try:
        # Question text: first column of the vertical file, detokenized.
        question_f = os.path.join(data_folder, folder, "01question.vert")
        question_list = read_first_entries(question_f)
        question_str = collect_tokens(question_list)

        # reformulated answer selection
        # answer_f = os.path.join(data_folder, folder, "02answer.vert")
        # answer_list = read_first_entries(answer_f)
        # # answer_df = pd.read_csv(answer_f, sep="\t", index_col=False)
        # answer_str = collect_tokens(answer_list)

        # Extracted answer; " # " appears to separate alternatives -- only the
        # first is kept.
        answer_f = os.path.join(data_folder, folder, "09answer_extraction.vert")
        answer_list = read_first_entries(answer_f)
        # answer_df = pd.read_csv(answer_f, sep="\t", index_col=False)
        answer_str = collect_tokens(answer_list)
        answer_str = answer_str.split(" # ")[0]

        # Answer type: the metadata line containing "a_type", stripped of its
        # <a_type>...</a_type> tags.
        # NOTE(review): next() raises StopIteration if no such line exists,
        # which would abort the whole loop rather than skip this record --
        # confirm 05metadata.txt always contains an a_type line.
        answer_type_f = os.path.join(data_folder, folder, "05metadata.txt")
        answer_type = next(t for t in read_first_entries(answer_type_f) if "a_type" in t)
        answer_type_cleaned = answer_type.replace("<a_type>", "").replace("</a_type>", "").strip()

        # Context passage, detokenized the same way as the question.
        text_f = os.path.join(data_folder, folder, "03text.vert")
        text_list = read_first_entries(text_f)
        # text_df = pd.read_csv(text_f, sep="\t", engine="python", error_bad_lines=False)
        text_str = collect_tokens(text_list)

        # Extractive-QA requirement: the answer must occur verbatim in the
        # context (case-insensitive), otherwise the record is unusable.
        if answer_str.lower() not in text_str.lower():
            print("Skipping answer %s: not present in context." % answer_str)
            continue

        if answer_str.endswith("."):
            # to match in multi-sentence matching
            answer_str = answer_str[:-1]

        # maybe shorten to n-surrounding sentences
        if shorten_to_sentences is not None:
            # Naive sentence split on ". "; find the first index where every
            # ". "-separated segment of the answer matches consecutive
            # sentences starting there (handles multi-sentence answers).
            # NOTE(review): next() raises StopIteration (and IndexError is
            # possible via sentences[i+j]) if no window matches, even though
            # the substring check above passed -- e.g. when the answer spans a
            # ". " boundary differently; confirm this cannot occur in the data.
            sentences = text_str.split(". ")
            answer_sentence_idx = next(i for i, _ in enumerate(sentences)
                                       if all(a_segment.lower() in sentences[i+j].lower()
                                              for j, a_segment in enumerate(answer_str.split(". "))))
            # NOTE(review): the slice end is exclusive, so this keeps
            # shorten_to_sentences sentences before the answer sentence but
            # only shorten_to_sentences - 1 after it -- confirm the asymmetry
            # is intended.
            shortened_context = sentences[max(0, answer_sentence_idx - shorten_to_sentences):
                                          min(len(sentences), answer_sentence_idx + shorten_to_sentences)]

            text_str = ". ".join(shortened_context) + ". "

        # TODO: squad-like format: https://huggingface.co/datasets/squad
        out_dict[i] = {"id": folder.split("/")[-1],
                       "answer_type": answer_type_cleaned,
                       "context": text_str,
                       "question": question_str,
                       "answers": {"text": [answer_str]}
                       }

    except NotADirectoryError as e:
        # Raised when `folder` is a regular file: opening a path that treats
        # it as a directory component fails with NotADirectoryError.
        print("Skipping %s: %s: %s" % (i, folder, e))

# Serialize the collected records. Note json.dumps defaults to ensure_ascii,
# so Czech characters are written as \uXXXX escapes (still valid JSON).
with open(out_json, "w") as out_f:
    out_f.write(json.dumps(out_dict))

print("Done. Output json exported to %s" % out_json)