import json
import re

from haystack import Label, MultiLabel, Answer

# Header row that identifies a context as a course-schedule table. The text
# immediately following it is scanned for the faculty abbreviation.
_SCHEDULE_HEADER = "Code\tName\tEcts\tInstructor\tDays\tHours\tRooms\n"


def read_labels(labels, tables):
    """Build one single-label ``MultiLabel`` per question of the given tables.

    Args:
        labels: Mapping from table id to a list of question dicts. Each dict
            is read through ``["question"]`` and ``["answers"][0]["text"]``,
            so only the first answer of every question is used.
        tables: Iterable of table documents exposing an ``id`` attribute.
            Tables whose id is not a key of ``labels`` are skipped.

    Returns:
        list: ``MultiLabel`` objects, each wrapping exactly one ``Label``,
        in table order and then question order.

    Raises:
        IndexError: If a question has an empty ``"answers"`` list.
        KeyError: If a question dict lacks one of the keys read above.
    """
    processed_labels = []
    for table in tables:
        if table.id not in labels:
            continue
        for qa in labels[table.id]:
            gold_label = Label(
                query=qa["question"],
                document=table,
                is_correct_answer=True,
                is_correct_document=True,
                answer=Answer(answer=qa["answers"][0]["text"]),
                origin="gold-label",
            )
            processed_labels.append(MultiLabel(labels=[gold_label]))
    return processed_labels


def create_labels(labels_file, data, seperate_eval):
    """Load a processed label file and turn it into evaluation label lists.

    Args:
        labels_file: Path of a JSON file holding a mapping from document id
            to a list of question dicts.
        data: Iterable of table collections; every element is passed to
            ``read_labels`` as its ``tables`` argument.
        seperate_eval: When truthy, the labels are split with
            ``filter_labels`` and each part is evaluated on its own;
            otherwise all labels form a single set.

    Returns:
        list: One inner list of ``MultiLabel`` objects per label set. With
        ``seperate_eval`` this is ``[user_labels, synthetic_labels]``,
        otherwise a list holding a single inner list.
    """
    # JSON text is UTF-8 by specification, so do not depend on the locale.
    with open(labels_file, encoding="utf-8") as fp:
        all_labels = json.load(fp)

    label_sets = filter_labels(all_labels) if seperate_eval else [all_labels]

    eval_labels = []
    for label_set in label_sets:
        set_labels = []
        for tables in data:
            set_labels.extend(read_labels(label_set, tables))
        print(f"Number of Labels: {len(set_labels)}")
        eval_labels.append(set_labels)
    return eval_labels


def get_processed_squad_labels(squad_labels):
    """Regroup a SQuAD-style annotation file by document and write it to disk.

    The file ``./data/validation_data/<squad_labels>`` is loaded and, for
    every entry of its ``"data"`` list, only the first paragraph is read.
    The paragraph's questions are stored under:

    * the leading run of letters that follows the course-schedule header
      row, when the context starts with that header (questions of several
      entries sharing the same key are concatenated), or
    * ``str(document_id)`` of the paragraph otherwise (a later entry with
      the same id replaces an earlier one).

    The resulting mapping is dumped to
    ``./data/validation_data/processed_qa.json``.

    Args:
        squad_labels: File name of the annotation file inside
            ``./data/validation_data/``.

    Returns:
        None. The mapping is written to disk rather than returned.
    """
    with open(f'./data/validation_data/{squad_labels}', encoding="utf-8") as fp:
        squad_data = json.load(fp)

    # Process Squad File by aligning the right document IDs for the course schedules
    processed_squad_labels = {}
    for entry in squad_data["data"]:
        first_paragraph = entry["paragraphs"][0]
        context = first_paragraph["context"]
        qas = first_paragraph["qas"]
        if context.startswith(_SCHEDULE_HEADER):
            # The pattern can match the empty string, so a match always
            # exists; an empty key results when no letter follows the header.
            faculty_abb = re.search(
                r"[a-z]*", context[len(_SCHEDULE_HEADER):], re.IGNORECASE
            ).group()
            # Extend a fresh list instead of the first entry's own "qas"
            # list, so the loaded input data is never mutated.
            processed_squad_labels.setdefault(faculty_abb, []).extend(qas)
        else:
            processed_squad_labels[str(first_paragraph["document_id"])] = qas

    with open("./data/validation_data/processed_qa.json", "w", encoding="utf-8") as outfile:
        json.dump(processed_squad_labels, outfile)


def filter_labels(labels):
    """Split labels into user-written and remaining (synthetic) questions.

    A question counts as user-written when its stripped text equals a
    stripped line of ``./data/validation_data/questions_new.txt``.

    Args:
        labels: Mapping from document id to a list of question dicts, each
            providing a ``"question"`` string.

    Returns:
        list: ``[user_squad_labels, synthetic_squad_labels]``, two mappings
        from document id to the question dicts of the respective group. A
        document only appears in a mapping if it has at least one question
        of that group.
    """
    # NOTE(review): read with the platform default encoding, as before;
    # confirm the file's encoding before forcing one here.
    with open("./data/validation_data/questions_new.txt", "r") as fp:
        # A set gives constant-time membership tests in the loop below.
        user_questions = {line.strip() for line in fp.read().split("\n")}

    user_squad_labels = {}
    synthetic_squad_labels = {}
    for doc, questions in labels.items():
        for q in questions:
            if q["question"].strip() in user_questions:
                target = user_squad_labels
            else:
                target = synthetic_squad_labels
            target.setdefault(doc, []).append(q)
    return [user_squad_labels, synthetic_squad_labels]