import json
import os
from typing import Iterable, List

# source folder with one sub-folder per parsed question, and the output path
data_folder = "data/czech-squad-v3"
shorten_to_sentences = 3
out_json = "data/czech_squad_%s-sents.json" % shorten_to_sentences

def read_first_entries(fpath: str, sep: str = "\t") -> List[str]:
    """Read the first (token) column from a separator-delimited vertical file."""
    line_collector = []
    with open(fpath) as f:
        for line in f:
            entry = line.split(sep)[0]
            line_collector.append(entry)
    return line_collector
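
# A minimal sketch of the expected input, assuming the usual corpus "vertical"
# format (one token per line, tab-separated annotation columns, structural
# tags such as <s> or the glue tag <g/> on their own lines); the concrete
# columns are an assumption here, only the first one is read:
#
#   Praha\tNOUN\tPraha
#   <g/>
#   ,\tPUNCT\t,
#   je\tVERB\tbýt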

def collect_tokens(s: Iterable[str]) -> str:
    """Detokenize a sequence of vertical-format tokens into a plain string.

    The glue tag <g/> marks that the next token joins the previous one
    without a space; all other <...> structural tags are skipped.
    """
    out_str = ""
    last_g = False
    for i, token in enumerate(s):
        if token is None:
            # check before calling .strip(), which would fail on None
            raise ValueError("Token on position %s is None" % i)
        token = token.strip()
        if token == "<g/>":
            last_g = True
        elif token.startswith("<") and token.endswith(">"):
            # structural tag such as <s> or </doc>
            continue
        elif last_g:
            out_str += token
            last_g = False
        else:
            out_str += " %s" % token
    return out_str.strip()
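
# For illustration, derived from the logic above (not part of the script):
#   collect_tokens(["Praha", "<g/>", ",", "je"]) -> "Praha, je"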

out_dict = {}
# each sub-folder of data_folder holds one question: the question text, the
# extracted answer, the answer-type metadata and the context, in separate files
for i, folder in enumerate(os.listdir(data_folder)):
try:
        # question text
        question_f = os.path.join(data_folder, folder, "01question.vert")
        question_list = read_first_entries(question_f)
        question_str = collect_tokens(question_list)
        # the answer is taken from the answer-extraction file rather than the
        # reformulated answer (02answer.vert), so it can be matched verbatim
        # against the context
        answer_f = os.path.join(data_folder, folder, "09answer_extraction.vert")
        answer_list = read_first_entries(answer_f)
        answer_str = collect_tokens(answer_list)
        # keep only the first of possibly several "#"-separated extractions
        answer_str = answer_str.split(" # ")[0]
        # answer type is stored as an <a_type>...</a_type> line in the metadata
        answer_type_f = os.path.join(data_folder, folder, "05metadata.txt")
        answer_type = next(t for t in read_first_entries(answer_type_f) if "a_type" in t)
        answer_type_cleaned = answer_type.replace("<a_type>", "").replace("</a_type>", "").strip()
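        # e.g. a line "<a_type>PERSON</a_type>" yields "PERSON" (the concrete
        # type inventory is an assumption here, not checked against the data)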
        # context passage
        text_f = os.path.join(data_folder, folder, "03text.vert")
        text_list = read_first_entries(text_f)
        text_str = collect_tokens(text_list)
if answer_str.lower() not in text_str.lower():
print("Skipping answer %s: not present in context." % answer_str)
continue
        if answer_str.endswith("."):
            # drop the trailing period so that multi-sentence answers still
            # match after the context is split on ". " below
            answer_str = answer_str[:-1]
        # optionally shorten the context to a window of sentences around the answer
        if shorten_to_sentences is not None:
            # naive sentence segmentation, consistent with the matching above
            sentences = text_str.split(". ")
            # index of the first sentence where the (possibly multi-sentence)
            # answer starts; the bounds check avoids an IndexError near the
            # end of the context
            answer_sentence_idx = next(
                    s_i for s_i, _ in enumerate(sentences)
                    if all(s_i + j < len(sentences)
                           and a_segment.lower() in sentences[s_i + j].lower()
                           for j, a_segment in enumerate(answer_str.split(". "))))
            shortened_context = sentences[max(0, answer_sentence_idx - shorten_to_sentences):
                                          min(len(sentences), answer_sentence_idx + shorten_to_sentences)]
            text_str = ". ".join(shortened_context) + ". "
        # TODO: full SQuAD-like format (https://huggingface.co/datasets/squad)
        #  would also need "answer_start" offsets in the "answers" dict
        out_dict[i] = {"id": folder.split("/")[-1],
                       "answer_type": answer_type_cleaned,
                       "context": text_str,
                       "question": question_str,
                       "answers": {"text": [answer_str]}}
    except NotADirectoryError as e:
        # plain files placed next to the question folders are skipped
        print("Skipping %s: %s: %s" % (i, folder, e))

with open(out_json, "w") as out_f:
    # ensure_ascii=False keeps the Czech characters human-readable in the output
    json.dump(out_dict, out_f, ensure_ascii=False)

print("Done. Output json exported to %s" % out_json)