michal-stefanik committed on
Commit
b4f8d67
1 Parent(s): b25e454

Upload parse_czech_squad.py

Browse files
Files changed (1) hide show
  1. parse_czech_squad.py +106 -0
parse_czech_squad.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os.path
3
+ from typing import Iterable
4
+
5
# Directory whose entries are listed by the main loop; each entry is expected
# to be a folder holding one record's .vert / metadata files.
data_folder = "data/czech-squad-v3"

# Number of sentences used to build the window around the answer sentence
# when shortening a context. Set to None to keep the full context.
shorten_to_sentences = 4

# Destination of the exported JSON; the file name embeds the window setting.
out_json = "data/czech_squad_%s-sents.json" % (shorten_to_sentences,)
10
+
11
+
12
def read_first_entries(fpath: str, sep: str = "\t"):
    """Return the first ``sep``-separated field of every line in a file.

    Args:
        fpath: Path of the text file to read.
        sep: Field separator; only the part of each line before its first
            occurrence is kept.

    Returns:
        A list with one entry per line, in file order. A line that does not
        contain ``sep`` is returned whole, including its trailing newline.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding (the data is Czech text).
    # NOTE(review): assumes the data files are UTF-8 encoded - confirm.
    with open(fpath, encoding="utf-8") as f:
        return [line.split(sep)[0] for line in f]
21
+
22
+
23
def collect_tokens(s: Iterable[str]) -> str:
    """Join a sequence of tokens into one string, honouring markup tokens.

    Each token is stripped of surrounding whitespace first. Then:

    * ``<g/>`` is a glue marker: the next regular token is appended without
      a preceding space.
    * Any other token that starts with ``<`` and ends with ``>`` is skipped
      (a pending glue marker stays in effect across it).
    * Every remaining token is appended, preceded by a single space unless
      it is glued.

    Args:
        s: Tokens in reading order.

    Returns:
        The assembled text with leading/trailing whitespace removed.

    Raises:
        ValueError: If a token is ``None``.
    """
    pieces = []
    glue_next = False
    for i, token in enumerate(s):
        # Validate before calling str methods; otherwise a None token would
        # fail with AttributeError and this check could never trigger.
        if token is None:
            raise ValueError("Token on position %s is None" % i)
        token = token.strip()
        if token == "<g/>":
            glue_next = True
        elif token.startswith("<") and token.endswith(">"):
            continue
        elif glue_next:
            pieces.append(token)
            glue_next = False
        else:
            pieces.append(" " + token)
    # Single join instead of repeated string concatenation.
    return "".join(pieces).strip()
42
+
43
+
44
# Parsed records, keyed by the position of the record's entry in the
# directory listing of `data_folder`.
out_dict = {}

for i, folder in enumerate(os.listdir(data_folder)):
    try:
        question_f = os.path.join(data_folder, folder, "01question.vert")
        question_str = collect_tokens(read_first_entries(question_f))

        # The extracted answer is used here; the alternative source
        # ("02answer.vert", the reformulated answer) is intentionally not read.
        answer_f = os.path.join(data_folder, folder, "09answer_extraction.vert")
        answer_str = collect_tokens(read_first_entries(answer_f))
        # Keep only the part before the first " # " separator.
        answer_str = answer_str.split(" # ")[0]

        answer_type_f = os.path.join(data_folder, folder, "05metadata.txt")
        # Use a default so a record without an a_type line is skipped instead
        # of aborting the whole export with StopIteration.
        answer_type = next((t for t in read_first_entries(answer_type_f) if "a_type" in t), None)
        if answer_type is None:
            print("Skipping %s: %s: no a_type entry in metadata." % (i, folder))
            continue
        answer_type_cleaned = answer_type.replace("<a_type>", "").replace("</a_type>", "").strip()

        text_f = os.path.join(data_folder, folder, "03text.vert")
        text_str = collect_tokens(read_first_entries(text_f))

        if answer_str.lower() not in text_str.lower():
            print("Skipping answer %s: not present in context." % answer_str)
            continue

        if answer_str.endswith("."):
            # to match in multi-sentence matching
            answer_str = answer_str[:-1]

        # maybe shorten to n-surrounding sentences
        if shorten_to_sentences is not None:
            sentences = text_str.split(". ")
            answer_segments = [segment.lower() for segment in answer_str.split(". ")]

            # First sentence index from which all consecutive answer segments
            # match; the search range is bounded so indexing stays in range.
            last_start = len(sentences) - len(answer_segments)
            answer_sentence_idx = next(
                (start for start in range(last_start + 1)
                 if all(segment in sentences[start + j].lower()
                        for j, segment in enumerate(answer_segments))),
                None)
            if answer_sentence_idx is None:
                print("Skipping answer %s: could not locate its sentence in context." % answer_str)
                continue

            # NOTE: the window holds up to `shorten_to_sentences` sentences
            # before the answer sentence, and the answer sentence plus up to
            # `shorten_to_sentences - 1` sentences after it.
            window_start = max(0, answer_sentence_idx - shorten_to_sentences)
            window_end = min(len(sentences), answer_sentence_idx + shorten_to_sentences)
            shortened_context = sentences[window_start:window_end]

            text_str = ". ".join(shortened_context)
            # Splitting on ". " leaves the final period on the text's last
            # sentence; only add one when it is missing to avoid "..".
            if not text_str.endswith("."):
                text_str += "."
            text_str += " "

        # TODO: squad-like format: https://huggingface.co/datasets/squad
        out_dict[i] = {"id": folder.split("/")[-1],
                       "answer_type": answer_type_cleaned,
                       "context": text_str,
                       "question": question_str,
                       "answers": {"text": [answer_str]}
                       }

    except NotADirectoryError as e:
        # Entries of data_folder that are not directories are skipped.
        print("Skipping %s: %s: %s" % (i, folder, e))

with open(out_json, "w") as out_f:
    json.dump(out_dict, out_f)

print("Done. Output json exported to %s" % out_json)