File size: 965 Bytes
28dc58b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 |
def get_data(conversations: str, movie_lines: str, max_len: int=64) -> list:
    """Build consecutive (question, answer) utterance pairs from two text files.

    Both files are read as ISO-8859-1 text whose fields are separated by
    the literal string ``" +++$+++ "``.

    Args:
        conversations: Path to a file in which the last field of every line
            is a Python list literal of line ids, e.g. ``['L1', 'L2', 'L3']``.
        movie_lines: Path to a file in which the first field of every line
            is a line id and the last field is the utterance text.
        max_len: Maximum number of whitespace-separated tokens kept from
            each utterance.

    Returns:
        A list of two-element lists ``[question, answer]``. For every
        conversation, each utterance is paired with the one that follows
        it, so a conversation with ``n`` ids contributes ``n - 1`` pairs.
        Runs of whitespace inside an utterance are collapsed to one space.

    Raises:
        KeyError: If a conversation references an id that is not present
            in ``movie_lines``.
        ValueError, SyntaxError: If the id field of a conversation line is
            not a plain Python literal.
    """
    # Function-scope import so the block is self-contained.
    import ast

    separator = " +++$+++ "

    with open(conversations, 'r', encoding='iso-8859-1') as conv_file, \
            open(movie_lines, 'r', encoding='iso-8859-1') as lines_file:
        # Map line id (first field) -> utterance text (last field).
        lines_dic = {}
        for line in lines_file:
            objects = line.split(separator)
            lines_dic[objects[0]] = objects[-1]

        pairs = []
        for con in conv_file:
            if not con.strip():
                # A blank line carries no id list; skip it instead of crashing.
                continue
            # literal_eval only accepts literals, so file content can never
            # be executed as code (the id list used to be passed to eval()).
            ids = ast.literal_eval(con.split(separator)[-1].strip())
            # Pair every utterance with its immediate successor.
            for first_id, second_id in zip(ids, ids[1:]):
                first = lines_dic[first_id].split()[:max_len]
                second = lines_dic[second_id].split()[:max_len]
                pairs.append([' '.join(first), ' '.join(second)])
    return pairs
|