File size: 2,870 Bytes
9bf0a0f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from haystack import Label, MultiLabel, Answer
import json
import re

def read_labels(labels, tables):
    """Turn the question entries of each table into single-label ``MultiLabel`` objects.

    Args:
        labels: Mapping from a table id to a list of question entries. Each
            entry is read through ``entry["question"]`` and
            ``entry["answers"][0]["text"]`` (only the first answer is used).
        tables: Iterable of document objects exposing an ``id`` attribute.

    Returns:
        A flat list with one ``MultiLabel`` per question entry, ordered by
        table and then by entry. Tables whose id is not a key of ``labels``
        contribute nothing.
    """
    multi_labels = []
    for table in tables:
        if table.id in labels:
            # Every entry becomes a gold label that is flagged as both the
            # correct answer and the correct document for its table.
            multi_labels.extend(
                MultiLabel(
                    labels=[
                        Label(
                            query=entry["question"],
                            document=table,
                            is_correct_answer=True,
                            is_correct_document=True,
                            answer=Answer(answer=entry["answers"][0]["text"]),
                            origin="gold-label",
                        )
                    ]
                )
                for entry in labels[table.id]
            )
    return multi_labels

def create_labels(labels_file, data, seperate_eval):
    """Load labels from a JSON file and build one list of labels per label set.

    Args:
        labels_file: Path of the JSON file holding the labels. The parsed
            content is handed to ``read_labels`` as its ``labels`` argument
            (and to ``filter_labels`` when ``seperate_eval`` is truthy).
        data: Iterable of table collections; each collection is passed to
            ``read_labels`` together with the current label set.
        seperate_eval: When truthy, the loaded labels are split by
            ``filter_labels`` and every resulting set is processed on its
            own; otherwise all labels form a single set.

    Returns:
        A list containing one list of labels per label set, in the order the
        sets were processed.
    """
    # JSON is UTF-8 by specification, so do not depend on the platform's
    # default text encoding when reading it.
    with open(labels_file, encoding="utf-8") as fp:
        all_labels = json.load(fp)

    label_sets = filter_labels(all_labels) if seperate_eval else [all_labels]

    eval_labels = []
    for label_set in label_sets:
        set_labels = []
        for tables in data:
            set_labels.extend(read_labels(label_set, tables))
        print(f"Number of Labels: {len(set_labels)}")
        eval_labels.append(set_labels)
    return eval_labels

def get_processed_squad_labels(squad_labels):
    """Regroup the questions of a SQuAD-style file by document key and save them.

    Reads ``./data/validation_data/<squad_labels>`` and looks at the first
    paragraph of every entry under ``"data"``:

    * If its context starts with the course-schedule header row, the questions
      are filed under the run of ASCII letters that directly follows the
      header (called the faculty abbreviation in the code). Questions of all
      entries sharing that key are merged into one list.
    * Otherwise the questions are filed under ``str(document_id)``; a later
      entry with the same id replaces an earlier one.

    The resulting mapping is written to
    ``./data/validation_data/processed_qa.json``. Nothing is returned.

    Args:
        squad_labels: Name of the input JSON file inside
            ``./data/validation_data/``.
    """
    schedule_header = "Code\tName\tEcts\tInstructor\tDays\tHours\tRooms\n"

    # JSON is UTF-8 by specification, so do not depend on the platform's
    # default text encoding when reading it.
    with open(f'./data/validation_data/{squad_labels}', encoding="utf-8") as fp:
        squad = json.load(fp)

    processed_squad_labels = {}
    for entry in squad["data"]:
        first_paragraph = entry["paragraphs"][0]
        context = first_paragraph["context"]
        if context.startswith(schedule_header):
            # The pattern can match the empty string, so the search always
            # succeeds at the first character after the header.
            faculty_abb = re.search(
                r"[a-z]*", context[len(schedule_header):], re.IGNORECASE
            ).group()
            processed_squad_labels.setdefault(faculty_abb, []).extend(
                first_paragraph["qas"]
            )
        else:
            doc_key = str(first_paragraph["document_id"])
            processed_squad_labels[doc_key] = first_paragraph["qas"]

    with open("./data/validation_data/processed_qa.json", "w", encoding="utf-8") as outfile:
        json.dump(processed_squad_labels, outfile)
  
def filter_labels(labels):
    """Split labels into questions listed in the user-question file and all others.

    A question belongs to the first group when its whitespace-stripped text
    equals a whitespace-stripped line of
    ``./data/validation_data/questions_new.txt``.

    Args:
        labels: Mapping from a document key to a list of question dicts; each
            dict must provide a ``"question"`` string.

    Returns:
        A two-element list ``[user_squad_labels, synthetic_squad_labels]``.
        Both are dicts shaped like ``labels``; a document key only appears in
        a dict if at least one of its questions falls into that group.
    """
    # Read as UTF-8 so the lines compare equal to question texts that were
    # loaded from JSON. Assumes the file is UTF-8 encoded - TODO confirm.
    with open("./data/validation_data/questions_new.txt", "r", encoding="utf-8") as fp:
        # A set gives constant-time membership tests instead of scanning the
        # full list of lines once per question.
        user_questions = {line.strip() for line in fp.read().split("\n")}

    user_squad_labels = {}
    synthetic_squad_labels = {}
    for doc, questions in labels.items():
        for q in questions:
            if q["question"].strip() in user_questions:
                target = user_squad_labels
            else:
                target = synthetic_squad_labels
            target.setdefault(doc, []).append(q)

    return [user_squad_labels, synthetic_squad_labels]