|
import csv |
|
import json |
|
import os.path |
|
from typing import Iterable |
|
|
|
import pandas as pd |
|
|
|
# Root folder containing one sub-directory per question/answer item.
data_folder = "data/czech-squad-v3"

# Context window size: keep this many sentences before/after the sentence
# containing the answer.  Set to None to keep the full context.
shorten_to_sentences = 3

# Output path; the window size is embedded in the file name.
out_json = "data/czech_squad_%s-sents.json" % shorten_to_sentences
|
|
|
|
|
def read_first_entries(fpath: str, sep: str = "\t"):
    """Return the first ``sep``-separated field of every line in *fpath*.

    Lines that do not contain the separator are returned whole (including
    their trailing newline), matching ``str.split`` semantics.

    :param fpath: path of the (vertical-format) file to read
    :param sep: column separator, tab by default
    :return: list of first-column entries, one per line
    """
    # Iterate the file object lazily instead of materializing every line
    # via readlines(); explicit UTF-8 so Czech text decodes identically on
    # every platform, regardless of the locale's default encoding.
    with open(fpath, encoding="utf-8") as f:
        return [line.split(sep)[0] for line in f]
|
|
|
|
|
def collect_tokens(s: Iterable[str]) -> str:
    """Detokenize a sequence of vertical-format tokens into plain text.

    Tokens are joined with single spaces.  The special ``<g/>`` ("glue")
    tag suppresses the space before the *next* regular token; any other
    ``<...>`` tag is dropped without affecting the glue state.

    :param s: iterable of token strings (one token or tag per element)
    :return: detokenized text with no leading/trailing whitespace
    :raises ValueError: if any element of *s* is None
    """
    pieces = []
    glue_next = False
    for i, token in enumerate(s):
        # Bug fix: this guard must run BEFORE .strip() — previously a None
        # token crashed with AttributeError and the ValueError was dead code.
        if token is None:
            raise ValueError("Token on position %s is None" % i)
        token = token.strip()
        if token == "<g/>":
            # Glue tag: concatenate the next regular token without a space.
            glue_next = True
        elif token.startswith("<") and token.endswith(">"):
            # Structural tag (<s>, </s>, ...): skip, keep glue state intact.
            continue
        else:
            pieces.append(token if glue_next else " %s" % token)
            glue_next = False
    # join + strip instead of repeated += (avoids quadratic concatenation).
    return "".join(pieces).strip()
|
|
|
|
|
# Maps a running integer index -> one SQuAD-like QA record.
out_dict = {}

# Every sub-directory of data_folder is one QA item stored as several
# vertical-format files; plain files in data_folder are skipped via the
# NotADirectoryError handler below.
for i, folder in enumerate(os.listdir(data_folder)):
    try:
        # Question text: first column of the question .vert file, detokenized.
        question_f = os.path.join(data_folder, folder, "01question.vert")
        question_list = read_first_entries(question_f)
        question_str = collect_tokens(question_list)

        answer_f = os.path.join(data_folder, folder, "09answer_extraction.vert")
        answer_list = read_first_entries(answer_f)

        answer_str = collect_tokens(answer_list)
        # Keep only the first alternative when several answers are
        # separated by " # ".
        answer_str = answer_str.split(" # ")[0]

        # Answer type is wrapped as "<a_type>...</a_type>" in the metadata file.
        # NOTE(review): next() raises an uncaught StopIteration if no line
        # contains "a_type" — confirm every item has this tag.
        answer_type_f = os.path.join(data_folder, folder, "05metadata.txt")
        answer_type = next(t for t in read_first_entries(answer_type_f) if "a_type" in t)
        answer_type_cleaned = answer_type.replace("<a_type>", "").replace("</a_type>", "").strip()

        text_f = os.path.join(data_folder, folder, "03text.vert")
        text_list = read_first_entries(text_f)

        text_str = collect_tokens(text_list)

        # Extractive-QA constraint: drop items whose answer does not occur
        # verbatim (case-insensitively) in the context.
        if answer_str.lower() not in text_str.lower():
            print("Skipping answer %s: not present in context." % answer_str)
            continue

        # Strip a trailing period so the answer also matches mid-sentence
        # occurrences after the sentence split below.
        if answer_str.endswith("."):
            answer_str = answer_str[:-1]

        # Optionally shrink the context to a window of sentences around the
        # sentence(s) containing the answer.
        if shorten_to_sentences is not None:
            # Naive sentence split; assumes ". " only terminates sentences.
            sentences = text_str.split(". ")
            # First index i such that every ". "-separated segment of the
            # answer appears in the corresponding sentence i+j.
            # NOTE(review): sentences[i+j] can IndexError near the end of the
            # text, and next() can raise StopIteration if no window matches —
            # neither exception is caught by the handler below.
            answer_sentence_idx = next(i for i, _ in enumerate(sentences)
                                       if all(a_segment.lower() in sentences[i+j].lower()
                                              for j, a_segment in enumerate(answer_str.split(". "))))
            # Slice is end-exclusive, so the answer sentence itself is kept
            # together with up to shorten_to_sentences neighbours per side.
            shortened_context = sentences[max(0, answer_sentence_idx - shorten_to_sentences):
                                          min(len(sentences), answer_sentence_idx + shorten_to_sentences)]

            # Re-join and restore the final sentence terminator.
            text_str = ". ".join(shortened_context) + ". "

        # SQuAD-like record; "answers" keeps a list of texts for schema
        # compatibility even though only one answer is stored.
        out_dict[i] = {"id": folder.split("/")[-1],
                       "answer_type": answer_type_cleaned,
                       "context": text_str,
                       "question": question_str,
                       "answers": {"text": [answer_str]}
                       }

    except NotADirectoryError as e:
        # Raised by open() when `folder` is a plain file rather than an item
        # directory — skip such entries.
        print("Skipping %s: %s: %s" % (i, folder, e))

with open(out_json, "w") as out_f:
    # NOTE(review): json.dumps defaults to ensure_ascii=True, so Czech
    # characters are written as \uXXXX escapes — valid JSON, but consider
    # ensure_ascii=False for human-readable output.
    out_f.write(json.dumps(out_dict))

print("Done. Output json exported to %s" % out_json)
|
|