michal-stefanik committed on
Commit
e8fa15d
1 Parent(s): ac31af3

Upload parse_czech_squad.py

Browse files
Files changed (1) hide show
  1. parse_czech_squad.py +109 -0
parse_czech_squad.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import json
3
+ import os.path
4
+ from typing import Iterable
5
+
6
+ import pandas as pd
7
+
8
# Number of sentences kept around the answer when shortening each context;
# set to None to keep the full context text.
shorten_to_sentences = 3

# Root directory that holds one sub-folder per question/answer record.
data_folder = "data/czech-squad-v3"

# Destination of the exported JSON; the file name records the sentence window.
out_json = "data/czech_squad_%s-sents.json" % (shorten_to_sentences,)
13
+
14
+
15
def read_first_entries(fpath: str, sep: str = "\t", encoding: str = "utf-8"):
    """Return the first ``sep``-delimited field of every line in a file.

    Args:
        fpath: Path of the text file to read.
        sep: Column separator; only the text before its first occurrence
            on each line is kept.
        encoding: Text encoding used to decode the file. Given explicitly
            so that decoding does not depend on the platform's locale
            default.

    Returns:
        A list with one entry per line, in file order. A line that does not
        contain ``sep`` is returned whole, including its trailing newline.
    """
    with open(fpath, encoding=encoding) as f:
        # Iterate the file lazily instead of materialising it with readlines().
        return [line.split(sep)[0] for line in f]
24
+
25
+
26
def collect_tokens(s: Iterable[str]) -> str:
    """Join a sequence of tokens into a single plain-text string.

    Tokens are stripped of surrounding whitespace and joined with single
    spaces. The marker ``<g/>`` suppresses the space before the next text
    token. Any other token that starts with ``<`` and ends with ``>`` is
    dropped and does not clear a pending ``<g/>``.

    Args:
        s: Tokens in reading order.

    Returns:
        The joined text with leading and trailing whitespace removed.

    Raises:
        ValueError: If any token is ``None``.
    """
    pieces = []
    glue_next = False
    for i, token in enumerate(s):
        # Validate before calling .strip(); otherwise a None token would
        # surface as an AttributeError and this check could never fire.
        if token is None:
            raise ValueError("Token on position %s is None" % i)
        token = token.strip()
        if token == "<g/>":
            glue_next = True
        elif token.startswith("<") and token.endswith(">"):
            # Markup tag: skipped, a pending glue marker stays in effect.
            continue
        elif glue_next:
            pieces.append(token)
            glue_next = False
        else:
            pieces.append(" %s" % token)
    return "".join(pieces).strip()
45
+
46
+
47
def _find_answer_sentence(sentences, answer_str):
    """Return the index of the first sentence where the answer starts, or None.

    The answer is split on ". " in the same way as the context. A start index
    matches when every answer segment occurs (case-insensitively) in the
    sentence at the corresponding offset. Start positions that would run past
    the end of ``sentences`` are not considered.
    """
    segments = [segment.lower() for segment in answer_str.split(". ")]
    lowered = [sentence.lower() for sentence in sentences]
    for start in range(len(lowered) - len(segments) + 1):
        if all(segment in lowered[start + j] for j, segment in enumerate(segments)):
            return start
    return None


# Maps the running index of each processed folder to its exported sample.
out_dict = {}

# os.listdir() yields entries in arbitrary order; sorting keeps the integer
# keys of out_dict reproducible between runs.
for i, folder in enumerate(sorted(os.listdir(data_folder))):
    try:
        question_f = os.path.join(data_folder, folder, "01question.vert")
        question_list = read_first_entries(question_f)
        question_str = collect_tokens(question_list)

        # The answer is taken from the extraction file. The original script
        # also carried a disabled variant, labelled "reformulated answer
        # selection", that read "02answer.vert" instead.
        answer_f = os.path.join(data_folder, folder, "09answer_extraction.vert")
        answer_list = read_first_entries(answer_f)
        answer_str = collect_tokens(answer_list)
        # Keep only the text before the first " # " separator.
        answer_str = answer_str.split(" # ")[0]

        answer_type_f = os.path.join(data_folder, folder, "05metadata.txt")
        answer_type = next(t for t in read_first_entries(answer_type_f) if "a_type" in t)
        answer_type_cleaned = answer_type.replace("<a_type>", "").replace("</a_type>", "").strip()

        text_f = os.path.join(data_folder, folder, "03text.vert")
        text_list = read_first_entries(text_f)
        text_str = collect_tokens(text_list)

        if answer_str.lower() not in text_str.lower():
            print("Skipping answer %s: not present in context." % answer_str)
            continue

        if answer_str.endswith("."):
            # to match in multi-sentence matching
            answer_str = answer_str[:-1]

        # maybe shorten to n-surrounding sentences
        if shorten_to_sentences is not None:
            sentences = text_str.split(". ")
            answer_sentence_idx = _find_answer_sentence(sentences, answer_str)
            if answer_sentence_idx is None:
                # Previously this case raised an uncaught StopIteration or
                # IndexError and aborted the whole export.
                print("Skipping answer %s: could not be located in the sentence-split context."
                      % answer_str)
                continue

            # Half-open slice: up to n sentences before the sentence where the
            # answer starts, that sentence itself, and up to n - 1 after it.
            window_start = max(0, answer_sentence_idx - shorten_to_sentences)
            window_end = min(len(sentences), answer_sentence_idx + shorten_to_sentences)
            shortened_context = sentences[window_start:window_end]

            text_str = ". ".join(shortened_context) + ". "

        # TODO: squad-like format: https://huggingface.co/datasets/squad
        out_dict[i] = {"id": folder.split("/")[-1],
                       "answer_type": answer_type_cleaned,
                       "context": text_str,
                       "question": question_str,
                       "answers": {"text": [answer_str]}
                       }

    except NotADirectoryError as e:
        # Raised when an entry of data_folder is a plain file, not a folder.
        print("Skipping %s: %s: %s" % (i, folder, e))

with open(out_json, "w") as out_f:
    out_f.write(json.dumps(out_dict))

print("Done. Output json exported to %s" % out_json)