diff --git a/app.py b/app.py index a699bc5b3c2e987102ca93e0ee28d601e0a93d02..3c780015b43c9ca0404c54df3b78a0f0358ef0cb 100644 --- a/app.py +++ b/app.py @@ -1,7 +1,170 @@ import gradio as gr +import scripts.simulate_interaction as si +import sys +import traceback +import pandas as pd -def greet(name): - return "Hello " + name + "!!" +# from tqdm import tqdm +from scripts.UBAR_code.interaction import UBAR_interact +from scripts.user_model_code.interaction import multiwoz_interact +from scripts.UBAR_code.interaction.UBAR_interact import bcolors -iface = gr.Interface(fn=greet, inputs="text", outputs="text") -iface.launch() \ No newline at end of file + +def instantiate_agents(): + + UBAR_checkpoint_path = "models/UBAR/experiments/distilgpt-2_sd11_lr0.0001_bs16_ga2/epoch50_trloss0.59_gpt2" + user_model_checkpoint_path = "models/user_model/MultiWOZ-full_checkpoint_step340k" + + sys_model = UBAR_interact.UbarSystemModel( + "UBAR_sys_model", UBAR_checkpoint_path, "scripts/UBAR_code/interaction/config.yaml" + ) + + user_model = multiwoz_interact.NeuralAgent( + "user", user_model_checkpoint_path, "scripts/user_model_code/interaction/config.yaml" + ) + + return sys_model, user_model + + +def read_multiwoz_data(): + """ + Read the multiwoz 2.0 raw data from the .json file + """ + raw_mwoz_20_path = "data/raw/UBAR/multi-woz/data.json" + df_raw_mwoz = pd.read_json(raw_mwoz_20_path) + return df_raw_mwoz + + +def load_test_val_lists(): + val_list_file = "data/raw/UBAR/multi-woz/valListFile.json" + test_list_file = "data/raw/UBAR/multi-woz/testListFile.json" + + with open(val_list_file, "r") as f: + val_list = f.readlines() + val_list = [x.strip() for x in val_list] + + with open(test_list_file, "r") as f: + test_list = f.readlines() + test_list = [x.strip() for x in test_list] + + return val_list, test_list + + +def main( + write_to_file=False, ground_truth_system_responses=False, train_only=True, n_dialogues="all", log_successes=False +): + sys_model, user_model = instantiate_agents() + + # TODO: move hardcoded vars into config file + raw_mwoz_20_path = "data/raw/UBAR/multi-woz/data.json" + user_utterances_out_path = "data/preprocessed/UBAR/user_utterances_from_simulator.txt" + logging_successes_path = "data/preprocessed/UBAR/logging_successes" + sys_model.print_intermediary_info = False + user_model.print_intermediary_info = False + + df_raw_mwoz = pd.read_json(raw_mwoz_20_path) + if n_dialogues == "all": + n_dialogues = len(df_raw_mwoz.columns) + + curr_dialogue_user_utterances_formatted = [] + + print("Loading goals...") + goals = multiwoz_interact.read_multiWOZ_20_goals(raw_mwoz_20_path, n_dialogues) + + # Write column headers + if write_to_file: + with open(user_utterances_out_path, "w") as f: + f.write("Dialogue #\tDialogue ID\tTurn #\tSystem Response\n") + + print("Loading data...") + df_mwoz_data = read_multiwoz_data() + val_list, test_list = load_test_val_lists() + + successful_dialogues = 0 + total_dialogues_generated = 0 # train dialogues only + for dialogue_idx, (goal, dialogue_filename) in enumerate(zip(goals, df_mwoz_data.columns)): + if log_successes: + # log successful_dialogues to logging_successes_path every 100 dialogues + if dialogue_idx % 100 == 0: + with open(logging_successes_path, "w") as f: + f.write(str(successful_dialogues) + " / " + str(total_dialogues_generated)) + + curr_dialogue_user_utterances_formatted = [] + if train_only: + if dialogue_filename in val_list or dialogue_filename in test_list: + continue + + total_dialogues_generated += 1 + print("Dialogue: 
{}".format(dialogue_filename)) + + # There are occasionally exceptions thrown from one of the agents, usually the user + # In this case we simply continue to the next dialogue + try: + # Reset state after each dialogue + sys_model.init_session() + user_model.init_session(ini_goal=goal) + sys_response = "" + + for turn_idx in range(50): + # Turn idx in this case represents the turn as one user utterance AND one system response + usr_response_raw_data_idx = turn_idx * 2 + sys_response_raw_data_idx = turn_idx * 2 + 1 + + user_utterance = user_model.response(sys_response) + print(bcolors.OKBLUE + "User: " + bcolors.ENDC + user_utterance) + + if write_to_file: + user_utterance = user_utterance.replace("\n", " ") + curr_dialogue_user_utterances_formatted.append( + str(dialogue_idx) + + "\t" + + dialogue_filename + + "\t" + + str(usr_response_raw_data_idx) + + "\t" + + user_utterance + + "\n" + ) + + if user_model.is_terminated(): + successful_dialogues += 1 + print(bcolors.OKCYAN + "Dialogue terminated successfully!" + bcolors.ENDC) + print(bcolors.OKCYAN + "---" * 30 + bcolors.ENDC + "\n") + if write_to_file: + # Write whole dialogue to file + with open(user_utterances_out_path, "a") as f: + for line in curr_dialogue_user_utterances_formatted: + f.write(line) + break + + # Next turn materials + if ground_truth_system_responses: + # If we are at the end of the ground truth dialogues + if len(df_mwoz_data.iloc[:, dialogue_idx].log) <= sys_response_raw_data_idx: + print(bcolors.RED + "Dialogue terminated unsuccessfully!" + bcolors.ENDC) + print(bcolors.RED + "---" * 30 + bcolors.ENDC + "\n") + break + sys_response = df_mwoz_data.iloc[:, dialogue_idx].log[sys_response_raw_data_idx]["text"] + else: + sys_response = sys_model.response(user_utterance, turn_idx) + capitalised_sys_response = sys_response[0].upper() + sys_response[1:] + print(bcolors.GREEN + "System: " + bcolors.ENDC + capitalised_sys_response) + + except Exception: + print(bcolors.RED + "*" * 30 + bcolors.ENDC) + print(bcolors.RED + "Error in dialogue {}".format(dialogue_filename) + bcolors.ENDC) + print(bcolors.RED + "*" * 30 + bcolors.ENDC) + traceback.print_exc() + continue + + print("Successful dialogues: {}".format(successful_dialogues)) + print("Total dialogues: {}".format(n_dialogues)) + print("% Successful Dialopues: {}".format(successful_dialogues / n_dialogues)) + + +def test(): + return "SUCCESS" + + +iface = gr.Interface(fn=test, outputs="text") +iface.launch() diff --git a/scripts/UBAR_code/__init__.py b/scripts/UBAR_code/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/UBAR_code/data_analysis.py b/scripts/UBAR_code/data_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..0327d2eb581d3a9472f7bfc8fe1ff8cda4d671f2 --- /dev/null +++ b/scripts/UBAR_code/data_analysis.py @@ -0,0 +1,170 @@ +import copy +import json +import os +import re +import zipfile +from collections import OrderedDict + +from crazyneuraluser.UBAR_code.ontology import all_domains + +# 2.0 +data_path = "data/preprocessed/UBAR/gen_usr_utt_experiment_data.json" +save_path = "data/interim/gen_usr_utts/multi-woz-analysis/" +save_path_exp = "data/preprocessed_gen_usr_utts/UBAR/multi-woz-processed/" +# 2.1 +# data_path = 'data/raw/UBAR/MultiWOZ_2.1/' +# save_path = 'data/interim/multi-woz-2.1-analysis/' +# save_path_exp = 'data/preprocessed/multi-woz-2.1-processed/' +data_file = "data.json" +domains = all_domains +# all_domains = ['restaurant', 
'hotel', 'attraction', 'train', 'taxi', 'police', 'hospital'] + + +def analysis(): + compressed_raw_data = {} + goal_of_dials = {} + req_slots = {} + info_slots = {} + dom_count = {} + dom_fnlist = {} + all_domain_specific_slots = set() + for domain in domains: + req_slots[domain] = [] + info_slots[domain] = [] + + # archive = zipfile.ZipFile(data_path + data_file + ".zip", "r") + # data = archive.open(data_file, "r").read().decode("utf-8").lower() + data = open(data_path, "r").read().lower() + ref_nos = list(set(re.findall(r"\"reference\"\: \"(\w+)\"", data))) + data = json.loads(data) + + for fn, dial in data.items(): + goals = dial["goal"] + logs = dial["log"] + + # get compressed_raw_data and goal_of_dials + compressed_raw_data[fn] = {"goal": {}, "log": []} + goal_of_dials[fn] = {} + for dom, goal in goals.items(): # get goal of domains that are in demmand + if dom != "topic" and dom != "message" and goal: + compressed_raw_data[fn]["goal"][dom] = goal + goal_of_dials[fn][dom] = goal + + for turn in logs: + if not turn["metadata"]: # user's turn + compressed_raw_data[fn]["log"].append({"text": turn["text"]}) + else: # system's turn + meta = turn["metadata"] + turn_dict = {"text": turn["text"], "metadata": {}} + for ( + dom, + book_semi, + ) in meta.items(): # for every domain, sys updates "book" and "semi" + book, semi = book_semi["book"], book_semi["semi"] + record = False + for ( + slot, + value, + ) in book.items(): # record indicates non-empty-book domain + if value not in ["", []]: + record = True + if record: + turn_dict["metadata"][dom] = {} + turn_dict["metadata"][dom]["book"] = book # add that domain's book + record = False + for ( + slot, + value, + ) in semi.items(): # here record indicates non-empty-semi domain + if value not in ["", []]: + record = True + break + if record: + for s, v in copy.deepcopy(semi).items(): + if v == "not mentioned": + del semi[s] + if not turn_dict["metadata"].get(dom): + turn_dict["metadata"][dom] = {} + turn_dict["metadata"][dom]["semi"] = semi # add that domain's semi + compressed_raw_data[fn]["log"].append(turn_dict) # add to log the compressed turn_dict + + # get domain statistics + dial_type = ( + "multi" if "mul" in fn or "MUL" in fn else "single" + ) # determine the dialog's type: sinle or multi + if fn in ["pmul2756.json", "pmul4958.json", "pmul3599.json"]: + dial_type = "single" + dial_domains = [dom for dom in domains if goals[dom]] # domains that are in demmand + dom_str = "" + for dom in dial_domains: + if not dom_count.get(dom + "_" + dial_type): # count each domain type, with single or multi considered + dom_count[dom + "_" + dial_type] = 1 + else: + dom_count[dom + "_" + dial_type] += 1 + if not dom_fnlist.get(dom + "_" + dial_type): # keep track the file number of each domain type + dom_fnlist[dom + "_" + dial_type] = [fn] + else: + dom_fnlist[dom + "_" + dial_type].append(fn) + dom_str += "%s_" % dom + dom_str = dom_str[:-1] # substract the last char in dom_str + if dial_type == "multi": # count multi-domains + if not dom_count.get(dom_str): + dom_count[dom_str] = 1 + else: + dom_count[dom_str] += 1 + if not dom_fnlist.get(dom_str): + dom_fnlist[dom_str] = [fn] + else: + dom_fnlist[dom_str].append(fn) + ###### + + # get informable and requestable slots statistics + for domain in domains: + info_ss = goals[domain].get("info", {}) + book_ss = goals[domain].get("book", {}) + req_ss = goals[domain].get("reqt", {}) + for info_s in info_ss: + all_domain_specific_slots.add(domain + "-" + info_s) + if info_s not in info_slots[domain]: + 
info_slots[domain] += [info_s] + for book_s in book_ss: + if "book_" + book_s not in info_slots[domain] and book_s not in [ + "invalid", + "pre_invalid", + ]: + all_domain_specific_slots.add(domain + "-" + book_s) + info_slots[domain] += ["book_" + book_s] + for req_s in req_ss: + if req_s not in req_slots[domain]: + req_slots[domain] += [req_s] + + # result statistics + if not os.path.exists(save_path): + os.mkdir(save_path) + if not os.path.exists(save_path_exp): + os.mkdir(save_path_exp) + with open(save_path + "req_slots.json", "w") as sf: + json.dump(req_slots, sf, indent=2) + with open(save_path + "info_slots.json", "w") as sf: + json.dump(info_slots, sf, indent=2) + with open(save_path + "all_domain_specific_info_slots.json", "w") as sf: + json.dump(list(all_domain_specific_slots), sf, indent=2) + print("slot num:", len(list(all_domain_specific_slots))) + with open(save_path + "goal_of_each_dials.json", "w") as sf: + json.dump(goal_of_dials, sf, indent=2) + with open(save_path + "compressed_data.json", "w") as sf: + json.dump(compressed_raw_data, sf, indent=2) + with open(save_path + "domain_count.json", "w") as sf: + single_count = [d for d in dom_count.items() if "single" in d[0]] + multi_count = [d for d in dom_count.items() if "multi" in d[0]] + other_count = [d for d in dom_count.items() if "multi" not in d[0] and "single" not in d[0]] + dom_count_od = OrderedDict(single_count + multi_count + other_count) + json.dump(dom_count_od, sf, indent=2) + with open(save_path_exp + "reference_no.json", "w") as sf: + json.dump(ref_nos, sf, indent=2) + with open(save_path_exp + "domain_files.json", "w") as sf: + json.dump(dom_fnlist, sf, indent=2) + + +if __name__ == "__main__": + analysis() diff --git a/scripts/UBAR_code/interaction/UBAR_interact.py b/scripts/UBAR_code/interaction/UBAR_interact.py new file mode 100644 index 0000000000000000000000000000000000000000..b04edc3a8a71a184eaa671ae07b0e715d5e4d2dd --- /dev/null +++ b/scripts/UBAR_code/interaction/UBAR_interact.py @@ -0,0 +1,457 @@ +import sys +import torch +import random +import string + +# import bcolors +from omegaconf import OmegaConf +from transformers import GPT2LMHeadModel, GPT2Tokenizer + +from crazyneuraluser.UBAR_code.config import global_config as cfg +from crazyneuraluser.UBAR_code.reader import MultiWozReader +from crazyneuraluser.UBAR_code.db_ops import MultiWozDB + +from typing import List + + +class bcolors: + HEADER = "\033[95m" + OKBLUE = "\033[94m" + OKCYAN = "\033[96m" + GREEN = "\033[92m" + YELLOW = "\033[93m" + RED = "\033[91m" + ENDC = "\033[0m" + BOLD = "\033[1m" + UNDERLINE = "\033[4m" + + +class UbarSystemModel: # may inherit convlab or not, just like andy's + def __init__(self, name: str, checkpoint_path: str, model_config_path: str): + + self.tokenizer = GPT2Tokenizer.from_pretrained(checkpoint_path) + self.model = GPT2LMHeadModel.from_pretrained(checkpoint_path) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.name = name + self.turn_domain = ["general"] # returns a list of one string that is the domain e.g. 'taxi' + # (this is because of the way the db_ops.py deals with the domain. It should really be a string.) + + self.ubar_status = {"dialogue_terminate": False} + + self.print_intermediary_info = False + + self.config = OmegaConf.load(model_config_path) + self.previous_turn = {"user": [], "bspn": [], "aspn": [], "db": []} + + # NB: best to use corpus goals to guide interactions - baselines/simulate_agent.py allows that. 
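+        # Per-turn flow implemented by response()/generate_ids_subseq_map() below (roughly):
+        #   encode dialogue context -> generate belief span (bspn) -> build a DB pointer from it
+        #   -> generate system act (aspn) + delexicalised response -> lexicalise using DB hits.
+        # Minimal usage sketch (illustrative; ckpt_dir is a placeholder for a downloaded checkpoint):
+        #   sys_model = UbarSystemModel("UBAR_sys_model", ckpt_dir, "scripts/UBAR_code/interaction/config.yaml")
+        #   sys_model.init_session()
+        #   reply = sys_model.response("I need a cheap restaurant in the centre.", turn_id=0)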
+ + # initialize multiwoz reader and db_ops + self.reader = MultiWozReader(self.tokenizer) + self.db = MultiWozDB(self.config.dbs_path) + + def lexicalize_sys_response(self, sys_response, domain_hits, decoded_belief_state_subseq) -> str: + lexicalized_sys_response = "" + + # Track entities already filled e.g. if there are 3 restaurants track which have already been added to a slot + max_idx_of_added_entities = -1 + + # Fill slots with values from the DB (lexicalization) + for token in sys_response.split(): + token = token.strip(" .,;:") + if token.startswith("["): # It is a slot to be filled + + # Note in hotel there is specific price data too but to simplify things + # we just use the price range (e.g. moderate) + # TODO: there are different uses of price in different databases ('price' vs 'pricerange': + # need to deal with this appropriately below) + slots_to_db_keys_map = { + "[value_price]": "price", + "[value_pricerange]": "pricerange", + "[value_food]": "food", + "[value_area]": "area", + "[value_type]": "type", + "[value_phone]": "phone", + "[value_address]": "address", + "[value_leave]": "leave", + "[value_postcode]": "postcode", + "[value_id]": "id", + "[value_arrive]": "arrive", + "[value_stars]": "stars", + "[value_day]": "day", + "[value_destination]": "destination", + "[value_car]": "taxi_types", + "[value_departure]": "departure", + "[value_people]": "people", + "[value_stay]": "stay", + "[value_department]": "department", + "[value_time]": "time", + "[value_name]": "name", + "[value_reference]": "reference", + } + # Hospital domain is a strange outlier data structure + if self.turn_domain == ["hospital"] and token == "[value_address]": + token = "1 Addenbrooks Street" + elif self.turn_domain == ["hospital"] and token == "[value_postcode]": + token = "CB11QD" + + # So does taxi + elif self.turn_domain == ["taxi"] and token == "[value_phone]" and domain_hits != []: + token = domain_hits[0]["taxi_phone"] + + # Deal with value_name differently because there can be multiple + elif token == "[value_name]" and domain_hits != []: + token = domain_hits[max_idx_of_added_entities + 1]["name"] + max_idx_of_added_entities += 1 + + # This slot tells the user how many db hits there were matching their constraints + elif token == "[value_choice]" and domain_hits != []: + token = len(domain_hits) + + # Randomly generate the reference + elif token == "[value_reference]" and domain_hits != []: + token = "".join(random.choices(string.ascii_uppercase, k=10)) + + else: + # First check can we fill the token from the db results + db_success = False + if domain_hits != []: + for slot, db_key in slots_to_db_keys_map.items(): + if token == slot and db_key in domain_hits[0]: + token = domain_hits[0][db_key] + db_success = True + + # If we cannot, then try to fill it from the belief state by looking for a match + # in the belief state and then if there is a match adding the next token. + # This is not perfect as some are more than one word but its probably good enough. 
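+                    # Illustrative example: given a decoded belief state such as
+                    # "[restaurant] food italian area centre", the slot token [value_area]
+                    # maps to "area", and the word following "area" ("centre") is the value
+                    # we would want to substitute into the response.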
+ if not db_success: + decoded_belief_states = decoded_belief_state_subseq.split() + for idx, belief_state_slot in enumerate(decoded_belief_states): + if token in slots_to_db_keys_map.keys(): + if slots_to_db_keys_map[token] == belief_state_slot: + token == decoded_belief_states[idx + 1] + + # Otherwise just leave the slot as it is as we have failed to fill it + + lexicalized_sys_response += str(token) + lexicalized_sys_response += " " + + return lexicalized_sys_response + + def set_turn_domain(self, belief_span_ids_subseq, sys_act_span_ids_subseq=None) -> None: + """ + IMPORTANT: use_system_act is not None when actually querying the DB to + lexicalise the system response. When it is None the Belief state NOT the system act is used to determine + the domain. In self.response() the DB is queried twice. The first time is using the Belief state as the system + act has not yet been generated, and it is only used to find out if there are matches in the DB for the current + domain + constraints. Then, after the system act is generated, we call the DB to actually get the results to + lexicalise the system response. It is much more important that the domain is correct for the second call, and + the system act is much more accurate at determining the domain. + """ + + if sys_act_span_ids_subseq is None: + decoded_belief_state_subseq = self.tokenizer.decode(belief_span_ids_subseq[1:-1]) + decoded_prev_belief_state_subseq = self.tokenizer.decode(self.previous_turn["bspn"][1:-1]) + + # If it is the first turn and the belief state is empty then set the domain to general + if self.previous_turn["bspn"] == [] and len(belief_span_ids_subseq) == 2: + self.turn_domain = ["general"] + return + + # If the belief state doesn't change then keep the same domain + if belief_span_ids_subseq == self.previous_turn["bspn"]: + return + + # The domain has changed, get the new one (from the right) + else: + # remove substring from string + if decoded_prev_belief_state_subseq in decoded_belief_state_subseq: + decoded_new_tokens = decoded_belief_state_subseq.replace("decoded_prev_belief_state_subseq", "") + most_recent_domain_in_belief_state = [ + [token.strip("[]") for token in decoded_new_tokens.split() if token.startswith("[")][-1] + ] + self.turn_domain = most_recent_domain_in_belief_state + else: + # Sometimes the previous belief state is not in the current belief state as + # the output changes very slightly (say by one word) - in this case just keep the same domain + # TODO: Could probably handle this better. + if self.print_intermediary_info: + print( + bcolors.YELLOW + + "!Previous belief state not in current belief state! 
Details below:" + + bcolors.ENDC + ) + print("Previous Belief State: " + decoded_prev_belief_state_subseq) + print("Current Belief State: " + decoded_belief_state_subseq) + + else: + decoded_sys_act_subseq = self.tokenizer.decode(sys_act_span_ids_subseq[1:-1]) + + most_recent_domain_in_sys_act = [ + [token.strip("[]") for token in decoded_sys_act_subseq.split() if token.startswith("[")][0] + ] + self.turn_domain = most_recent_domain_in_sys_act + + def get_domain_hits(self, decoded_belief_state_subseq) -> dict: + # Get hits from db based on belief state, unless its a general turn (no hits then) + constraint_dict = self.reader.bspan_to_constraint_dict(decoded_belief_state_subseq) + query_turn_domain = self.turn_domain[0] # db.queryJsons needs a string not a list (single domain) + # If the constraint dict doesn't contain any constraints for the current domain then pass an empty dict + if query_turn_domain in constraint_dict: + domain_hits = self.db.queryJsons(query_turn_domain, constraint_dict[query_turn_domain]) + else: + domain_hits = self.db.queryJsons(query_turn_domain, {}) + + return domain_hits + + def print_turn_intermediate_info(self, generated_subseq_ids_map) -> None: + print(bcolors.OKCYAN + "Turn domain: " + bcolors.ENDC + "[" + str(self.turn_domain[0]) + "]") + + belief_state = self.tokenizer.decode(generated_subseq_ids_map["bspn"]) + print(bcolors.OKCYAN + "Belief state: " + bcolors.ENDC + belief_state) + + db_output = self.tokenizer.decode(generated_subseq_ids_map["db"]) + print(bcolors.OKCYAN + "DB Output: " + bcolors.ENDC + db_output) + + sys_act = self.tokenizer.decode(generated_subseq_ids_map["aspn"]) + print(bcolors.OKCYAN + "System Act: " + bcolors.ENDC + sys_act) + + def _init_ubar_status(self) -> dict: + return {"dialogue_terminate": False} + + def init_session(self): + self.ubar_status = self._init_ubar_status() + self.previous_turn = {"user": [], "bspn": [], "aspn": [], "db": []} + self.turn_domain = ["general"] + + def is_terminated(self) -> bool: + """This should tell an external client whether the user model considers they have completed the task.""" + # return False + return self.ubar_status["dialogue_terminate"] + + def _activate_dialogue_terminate(self) -> None: + """Turn on the ubar status about dialogue termination""" + self.ubar_status["dialogue_terminate"] = True + + def add_torch_input_eval(self, inputs): + # inputs: context + inputs["context_tensor"] = torch.tensor([inputs["context"]]).to(self.device) + return inputs + + def prepare_input_for_model(self, user_utterance: str, turn_id: int) -> torch.Tensor: + # TODO: CONVERT DIALOGUE HISTORY TO TOKEN IDS + + tokenised_user_utterance = self.tokenizer.encode(" " + user_utterance + " ") + # In this application turn always only contains ["user"], not ["bspn", "aspn", "db"] etc. 
+ turn = {"user": tokenised_user_utterance} + + first_turn = turn_id == 0 + inputs = self.reader.convert_turn_eval(turn, self.previous_turn, first_turn) + inputs = self.add_torch_input_eval(inputs) + + return inputs + + def decode_generated_bspn(self, generated) -> List[int]: + eos_b_id = self.tokenizer.encode([""])[0] + if eos_b_id in generated: + eos_b_idx = generated.index(eos_b_id) + else: + eos_b_idx = len(generated) - 1 + return generated[: eos_b_idx + 1] + + def decode_grenerated_act_resp(self, generated) -> dict: + """ + decode generated + return decoded['resp'] ('bspn', 'aspn') + """ + decoded = {} + eos_a_id = self.tokenizer.encode([""])[0] + eos_r_id = self.tokenizer.encode([""])[0] + # eos_b_id = self.tokenizer.encode([""])[0] + + # eos_r may not exists if gpt2 generated repetitive words. + if eos_r_id in generated: + eos_r_idx = generated.index(eos_r_id) + else: + eos_r_idx = len(generated) - 1 + + if cfg.use_true_curr_aspn: # only predict resp + decoded["resp"] = generated[: eos_r_idx + 1] + else: # predicted aspn, resp + eos_a_idx = generated.index(eos_a_id) + decoded["aspn"] = generated[: eos_a_idx + 1] + decoded["resp"] = generated[eos_a_idx + 1 : eos_r_idx + 1] + return decoded + + def generate_ids_subseq_map(self, inputs): + + context_input_subseq = inputs["context"] + # decoded_context_input_subseq = self.tokenizer.decode(context_input_subseq) + # Check if model has put duplicate tags in the context and if so remove one of the duplicates + # Yes this is kind of hacky, but UBAR seems to learn to duplicate certain tags - I don't know why + # Also instead of decoding and encoding here tags could be checked with their ids - but time is short... + # cleaned_decoded_list = [] + # prev_token = "" + # for token in decoded_context_input_subseq.split(): + # if token.startswith("<") and token.endswith(">"): # It is a tag + # if token == prev_token: # It is a duplicate tag + # continue + # cleaned_decoded_list.append(token) + # prev_token = token + # decoded_context_input_subseq = " ".join(cleaned_decoded_list) + # context_input_subseq = self.tokenizer.encode(decoded_context_input_subseq) + + context_input_subeq_tensor = inputs["context_tensor"] + + # TODO: FIND OUT BY COMPARING WITH MODEL.VALIDATE() how to calculate context_length + context_length = len(context_input_subseq) + + belief_state_ids = self.model.generate( + input_ids=context_input_subeq_tensor, + max_length=context_length + 60, + temperature=0.7, + top_p=1, + num_beams=1, + pad_token_id=self.tokenizer.eos_token_id, + eos_token_id=self.tokenizer.encode([""])[0], + ) + gen_belief_state_token_ids = belief_state_ids[0].cpu().numpy().tolist() # type: list[int] + belief_span_ids_subseq = self.decode_generated_bspn( + gen_belief_state_token_ids[context_length - 1 :] + ) # type: list[int] + + self.set_turn_domain(belief_span_ids_subseq) + + db_result = self.reader.bspan_to_DBpointer( + self.tokenizer.decode(belief_span_ids_subseq), self.turn_domain + ) # type: str + db_ids_subseq = self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize(" " + db_result + " ") + ) + self.tokenizer.encode([""]) + + # TODO: context_input_subseq is already a tensor but the other two subseqs aren't - why? 
+ act_response_gen_input_subseq = context_input_subseq + belief_span_ids_subseq + db_ids_subseq + act_response_gen_input_subseq_tensor = torch.tensor([act_response_gen_input_subseq]).to(self.device) + context_length = len(act_response_gen_input_subseq) + + outputs_db = self.model.generate( + input_ids=act_response_gen_input_subseq_tensor, + max_length=context_length + 80, + temperature=0.7, + top_p=1, + num_beams=1, + pad_token_id=self.tokenizer.eos_token_id, + eos_token_id=self.tokenizer.encode([""])[0], + ) + generated_act_resp_token_ids = outputs_db[0].cpu().numpy().tolist() # type: list[int] + generated_act_resp_token_ids = generated_act_resp_token_ids[context_length - 1 :] + + try: + generated_subseq_ids_map = self.decode_grenerated_act_resp(generated_act_resp_token_ids) + # TODO: IF YOU WANT Option b) then you just read the ['resp'] key and convert to string using huggingface; + # that would be sys_response; Obviously, this applies to Option a as well + generated_subseq_ids_map["bspn"] = belief_span_ids_subseq + # TODO: Option a) STORE THESE MAPPINGS IN SELF.CONTEXT IF YOU WANT TO HAVE + # {U_1, BS_1, DB_1, A_1, R_1, U_2, BS_2... history} + + generated_subseq_ids_map["db"] = db_ids_subseq + generated_subseq_ids_map["labels"] = context_input_subseq + + except ValueError: + generated_subseq_ids_map = {"resp": [], "bspn": [], "aspn": [], "db": [], "labels": []} + + # IMPORTANT: this is how all of the previous state is updated (appended) after each turn + # Update self.previous_turn to track state to be fed into GPT2 + for k, v in generated_subseq_ids_map.items(): + self.previous_turn[k] = v + + if self.print_intermediary_info: + self.print_turn_intermediate_info(generated_subseq_ids_map) + + return generated_subseq_ids_map + + def response(self, usr_utterance: str, turn_id: int) -> str: + + if usr_utterance == "Goodbye": + self._activate_dialogue_terminate() + return "Session Terminated by User" + + inputs = self.prepare_input_for_model(usr_utterance, turn_id) + + generated_subseq_ids_map = self.generate_ids_subseq_map(inputs) + belief_span_ids_subseq = generated_subseq_ids_map["bspn"] + + sys_response = self.tokenizer.decode(generated_subseq_ids_map["resp"][1:-1]) + + prev_turn_domain = self.turn_domain + sys_act_span_ids_subseq = generated_subseq_ids_map["aspn"] + self.set_turn_domain(belief_span_ids_subseq, sys_act_span_ids_subseq) + + if self.turn_domain != ["general"]: + # If the domain changes when reading the system response, then we need to re-do the generation process + # for both the belief state and the system action and response. We do this because self.get_domain_hits() + # will break if the domain is different when querying the DB for the second time here than when it was + # originally queried above, due to the constraint dict it uses that is generated from the belief state + # How can the belief state domain and the system act domain be different? Bunch of things, for example: + # When asking for the police the belief state may be empty (so 'general' domain) + # but then the system action will have [police]. 
+ if prev_turn_domain != self.turn_domain: + if self.print_intermediary_info: + print( + bcolors.RED + + "Domain changed from {} to {}".format(prev_turn_domain, self.turn_domain) + + bcolors.RED + ) + generated_subseq_ids_map = self.generate_ids_subseq_map(inputs) + sys_response = self.tokenizer.decode(generated_subseq_ids_map["resp"][1:-1]) + + decoded_belief_state_subseq = self.tokenizer.decode(belief_span_ids_subseq) + domain_hits = self.get_domain_hits(decoded_belief_state_subseq) + # print(bcolors.UNDERLINE + "Domain hits: \n" + bcolors.ENDC, domain_hits) # for debugging + + sys_response = self.lexicalize_sys_response(sys_response, domain_hits, decoded_belief_state_subseq) + + return sys_response + + +def interact(checkpoint_path): + sys_model = UbarSystemModel("UBAR_sys_model", checkpoint_path, "scripts/UBAR_code/interaction/config.yaml") + # TODO: Fix this hardcoded variable (should be in config) + sys_model.print_intermediary_info = True + + for dial_id in range(1, 11): + print(f"In dialogue {dial_id}") + + # Reset state after each dialog + sys_model.init_session() + + user_utt = input(bcolors.GREEN + "Enter user response here: " + bcolors.ENDC) + + for turn_id in range(100): + try: + sys_response = sys_model.response(user_utt, turn_id) + # There are a lot of edge case bugs that are possible that could break the current turn. If so, continue + # to ensure a large run across the dataset isn't ruined by a single bad turn. + except Exception() as e: + print(bcolors.RED + "Exception: {}".format(e) + bcolors.ENDC) + continue + + if sys_model.is_terminated(): + print(bcolors.RED + sys_response + bcolors.ENDC) + print(bcolors.RED + "---" * 30 + bcolors.ENDC) + break + + print(bcolors.YELLOW + "System: " + bcolors.ENDC + sys_response) + print("---" * 30) + + # next turn materials + user_utt = input(bcolors.GREEN + "Enter user response here: " + bcolors.ENDC) + + +if __name__ == "__main__": + if len(sys.argv) == 1: + print("Wrong argument!") + print("Usage: python UBAR_interact.py checkpoint_path") + sys.exit(1) + + checkpoint_path = sys.argv[1] + interact(checkpoint_path) diff --git a/scripts/UBAR_code/interaction/__init__.py b/scripts/UBAR_code/interaction/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/UBAR_code/interaction/config.yaml b/scripts/UBAR_code/interaction/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..260afc9f5f844c10fecd96d8b48e910942e830b0 --- /dev/null +++ b/scripts/UBAR_code/interaction/config.yaml @@ -0,0 +1,23 @@ +model: + path: "./models/UBAR/experiments/distilgpt-2_sd11_lr0.0001_bs16_ga2/epoch50_trloss0.59_gpt2" + goal_update: + finish_inform: "loose" # loose or strict + +schema_path: "scripts/user_model_code/interaction/schema.json" + +decode: + dec_max_len: 1024 + num_beams: 1 + temperature: 1.0 + do_sample: False + +use_all_previous_context: False + +dbs_path: + "attraction": "data/preprocessed/UBAR/db_processed/attraction_db_processed.json" + "hospital": "data/preprocessed/UBAR/db_processed/hospital_db_processed.json" + "hotel": "data/preprocessed/UBAR/db_processed/hotel_db_processed.json" + "police": "data/preprocessed/UBAR/db_processed/police_db_processed.json" + "restaurant": "data/preprocessed/UBAR/db_processed/restaurant_db_processed.json" + "taxi": "data/preprocessed/UBAR/db_processed/taxi_db_processed.json" + "train": "data/preprocessed/UBAR/db_processed/train_db_processed.json" \ No newline at end of file diff --git 
a/scripts/UBAR_code/preprocess.py b/scripts/UBAR_code/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..a2438a34c69300e4248a334d29efce9539b934f5 --- /dev/null +++ b/scripts/UBAR_code/preprocess.py @@ -0,0 +1,576 @@ +import copy +import json +import os +import re +import zipfile +from collections import OrderedDict + +import spacy +from tqdm import tqdm + +from crazyneuraluser.UBAR_code import ontology, utils +from crazyneuraluser.UBAR_code.clean_dataset import clean_slot_values, clean_text +from crazyneuraluser.UBAR_code.config import global_config as cfg +from crazyneuraluser.UBAR_code.db_ops import MultiWozDB + + +# value_set.json, all the domain[slot] values in datasets +def get_db_values(value_set_path): + processed = {} + bspn_word = [] + nlp = spacy.load("en_core_web_sm") + + with open(value_set_path, "r") as f: # read value set file in lower + value_set = json.loads(f.read().lower()) + + with open("data/raw/UBAR/db/ontology.json", "r") as f: # read ontology in lower, all the domain-slot values + otlg = json.loads(f.read().lower()) + + for ( + domain, + slots, + ) in value_set.items(): # add all informable slots to bspn_word, create lists holder for values + processed[domain] = {} + bspn_word.append("[" + domain + "]") + for slot, values in slots.items(): + s_p = ontology.normlize_slot_names.get(slot, slot) + if s_p in ontology.informable_slots[domain]: + bspn_word.append(s_p) + processed[domain][s_p] = [] + + for ( + domain, + slots, + ) in value_set.items(): # add all words of values of informable slots to bspn_word + for slot, values in slots.items(): + s_p = ontology.normlize_slot_names.get(slot, slot) + if s_p in ontology.informable_slots[domain]: + for v in values: + _, v_p = clean_slot_values(domain, slot, v) + v_p = " ".join([token.text for token in nlp(v_p)]).strip() + processed[domain][s_p].append(v_p) + for x in v_p.split(): + if x not in bspn_word: + bspn_word.append(x) + + for domain_slot, values in otlg.items(): # split domain-slots to domains and slots + domain, slot = domain_slot.split("-") + if domain == "bus": + domain = "taxi" + if slot == "price range": + slot = "pricerange" + if slot == "book stay": + slot = "stay" + if slot == "book day": + slot = "day" + if slot == "book people": + slot = "people" + if slot == "book time": + slot = "time" + if slot == "arrive by": + slot = "arrive" + if slot == "leave at": + slot = "leave" + if slot == "leaveat": + slot = "leave" + # add all slots and words of values if not already in processed and bspn_word + if slot not in processed[domain]: + processed[domain][slot] = [] + bspn_word.append(slot) + for v in values: + _, v_p = clean_slot_values(domain, slot, v) + v_p = " ".join([token.text for token in nlp(v_p)]).strip() + if v_p not in processed[domain][slot]: + processed[domain][slot].append(v_p) + for x in v_p.split(): + if x not in bspn_word: + bspn_word.append(x) + + with open(value_set_path.replace(".json", "_processed.json"), "w") as f: + json.dump(processed, f, indent=2) # save processed.json + with open("data/preprocessed_gen_usr_utts/UBAR/multi-woz-processed/bspn_word_collection.json", "w") as f: + json.dump(bspn_word, f, indent=2) # save bspn_word + + print("DB value set processed! 
") + + +def preprocess_db(db_paths): # apply clean_slot_values to all dbs + dbs = {} + nlp = spacy.load("en_core_web_sm") + for domain in ontology.all_domains: + with open(db_paths[domain], "r") as f: # for every db_domain, read json file + dbs[domain] = json.loads(f.read().lower()) + # entry has information about slots of said domain + for idx, entry in enumerate(dbs[domain]): + new_entry = copy.deepcopy(entry) + for key, value in entry.items(): # key = slot + if type(value) is not str: + continue + del new_entry[key] + key, value = clean_slot_values(domain, key, value) + tokenize_and_back = " ".join([token.text for token in nlp(value)]).strip() + new_entry[key] = tokenize_and_back + dbs[domain][idx] = new_entry + with open(db_paths[domain].replace(".json", "_processed.json"), "w") as f: + json.dump(dbs[domain], f, indent=2) + print("[%s] DB processed! " % domain) + + +class DataPreprocessor(object): + def __init__(self): + self.nlp = spacy.load("en_core_web_sm") + self.db = MultiWozDB(cfg.dbs) # load all processed dbs + data_path = "data/preprocessed/UBAR/gen_usr_utt_experiment_data_with_span_full.json" + # archive = zipfile.ZipFile(data_path + ".zip", "r") + # self.convlab_data = json.loads(archive.open(data_path.split("/")[-1], "r").read().lower()) + self.convlab_data = json.loads(open(data_path, "r").read().lower()) + self.delex_sg_valdict_path = "data/preprocessed_gen_usr_utts/UBAR/multi-woz-processed/delex_single_valdict.json" + self.delex_mt_valdict_path = "data/preprocessed_gen_usr_utts/UBAR/multi-woz-processed/delex_multi_valdict.json" + self.ambiguous_val_path = "data/preprocessed_gen_usr_utts/UBAR/multi-woz-processed/ambiguous_values.json" + self.delex_refs_path = "data/preprocessed_gen_usr_utts/UBAR/multi-woz-processed/reference_no.json" + self.delex_refs = json.loads(open(self.delex_refs_path, "r").read()) + if not os.path.exists(self.delex_sg_valdict_path): + ( + self.delex_sg_valdict, + self.delex_mt_valdict, + self.ambiguous_vals, + ) = self.get_delex_valdict() + else: + self.delex_sg_valdict = json.loads(open(self.delex_sg_valdict_path, "r").read()) + self.delex_mt_valdict = json.loads(open(self.delex_mt_valdict_path, "r").read()) + self.ambiguous_vals = json.loads(open(self.ambiguous_val_path, "r").read()) + + self.vocab = utils.Vocab(cfg.vocab_size) + + def delex_by_annotation(self, dial_turn): + u = dial_turn["text"].split() + span = dial_turn["span_info"] + for s in span: + slot = s[1] + if slot == "open": + continue + if ontology.da_abbr_to_slot_name.get(slot): + slot = ontology.da_abbr_to_slot_name[slot] + for idx in range(s[3], s[4] + 1): + u[idx] = "" + try: + u[s[3]] = "[value_" + slot + "]" + except Exception: + u[5] = "[value_" + slot + "]" + u_delex = " ".join([t for t in u if t != ""]) + u_delex = u_delex.replace("[value_address] , [value_address] , [value_address]", "[value_address]") + u_delex = u_delex.replace("[value_address] , [value_address]", "[value_address]") + u_delex = u_delex.replace("[value_name] [value_name]", "[value_name]") + u_delex = u_delex.replace("[value_name]([value_phone] )", "[value_name] ( [value_phone] )") + return u_delex + + def delex_by_valdict(self, text): + text = clean_text(text) + + text = re.sub(r"\d{5}\s?\d{5,7}", "[value_phone]", text) + text = re.sub(r"\d[\s-]stars?", "[value_stars]", text) + text = re.sub(r"\$\d+|\$?\d+.?(\d+)?\s(pounds?|gbps?)", "[value_price]", text) + text = re.sub(r"tr[\d]{4}", "[value_id]", text) + text = re.sub( + r"([a-z]{1}[\. ]?[a-z]{1}[\. ]?\d{1,2}[, ]+\d{1}[\. ]?[a-z]{1}[\. 
]?[a-z]{1}|[a-z]{2}\d{2}[a-z]{2})", + "[value_postcode]", + text, + ) + + for value, slot in self.delex_mt_valdict.items(): + text = text.replace(value, "[value_%s]" % slot) + + for value, slot in self.delex_sg_valdict.items(): + tokens = text.split() + for idx, tk in enumerate(tokens): + if tk == value: + tokens[idx] = "[value_%s]" % slot + text = " ".join(tokens) + + for ambg_ent in self.ambiguous_vals: + # ely is a place, but appears in words like moderately + start_idx = text.find(" " + ambg_ent) + if start_idx == -1: + continue + front_words = text[:start_idx].split() + ent_type = "time" if ":" in ambg_ent else "place" + + for fw in front_words[::-1]: + if fw in [ + "arrive", + "arrives", + "arrived", + "arriving", + "arrival", + "destination", + "there", + "reach", + "to", + "by", + "before", + ]: + slot = "[value_arrive]" if ent_type == "time" else "[value_destination]" + text = re.sub(" " + ambg_ent, " " + slot, text) + elif fw in [ + "leave", + "leaves", + "leaving", + "depart", + "departs", + "departing", + "departure", + "from", + "after", + "pulls", + ]: + slot = "[value_leave]" if ent_type == "time" else "[value_departure]" + text = re.sub(" " + ambg_ent, " " + slot, text) + + text = text.replace("[value_car] [value_car]", "[value_car]") + return text + + def get_delex_valdict( + self, + ): + skip_entry_type = { + "taxi": ["taxi_phone"], + "police": ["id"], + "hospital": ["id"], + "hotel": [ + "id", + "location", + "internet", + "parking", + "takesbookings", + "stars", + "price", + "n", + "postcode", + "phone", + ], + "attraction": [ + "id", + "location", + "pricerange", + "price", + "openhours", + "postcode", + "phone", + ], + "train": ["price", "id"], + "restaurant": [ + "id", + "location", + "introduction", + "signature", + "type", + "postcode", + "phone", + ], + } + entity_value_to_slot = {} + ambiguous_entities = [] + for domain, db_data in self.db.dbs.items(): + print("Processing entity values in [%s]" % domain) + if domain != "taxi": + for db_entry in db_data: + for slot, value in db_entry.items(): + if slot not in skip_entry_type[domain]: + if type(value) is not str: + raise TypeError("value '%s' in domain '%s' should be rechecked" % (slot, domain)) + else: + slot, value = clean_slot_values(domain, slot, value) + value = " ".join([token.text for token in self.nlp(value)]).strip() + if value in entity_value_to_slot and entity_value_to_slot[value] != slot: + # print(value, ": ",entity_value_to_slot[value], slot) + ambiguous_entities.append(value) + entity_value_to_slot[value] = slot + else: # taxi db specific + db_entry = db_data[0] + for slot, ent_list in db_entry.items(): + if slot not in skip_entry_type[domain]: + for ent in ent_list: + entity_value_to_slot[ent] = "car" + ambiguous_entities = set(ambiguous_entities) + ambiguous_entities.remove("cambridge") + ambiguous_entities = list(ambiguous_entities) + for amb_ent in ambiguous_entities: # departure or destination? arrive time or leave time? 
+ entity_value_to_slot.pop(amb_ent) + entity_value_to_slot["parkside"] = "address" + entity_value_to_slot["parkside, cambridge"] = "address" + entity_value_to_slot["cambridge belfry"] = "name" + entity_value_to_slot["hills road"] = "address" + entity_value_to_slot["hills rd"] = "address" + entity_value_to_slot["Parkside Police Station"] = "name" + + single_token_values = {} + multi_token_values = {} + for val, slt in entity_value_to_slot.items(): + if val in ["cambridge"]: + continue + if len(val.split()) > 1: + multi_token_values[val] = slt + else: + single_token_values[val] = slt + + with open(self.delex_sg_valdict_path, "w") as f: + single_token_values = OrderedDict( + sorted(single_token_values.items(), key=lambda kv: len(kv[0]), reverse=True) + ) + json.dump(single_token_values, f, indent=2) + print("single delex value dict saved!") + with open(self.delex_mt_valdict_path, "w") as f: + multi_token_values = OrderedDict( + sorted(multi_token_values.items(), key=lambda kv: len(kv[0]), reverse=True) + ) + json.dump(multi_token_values, f, indent=2) + print("multi delex value dict saved!") + with open(self.ambiguous_val_path, "w") as f: + json.dump(ambiguous_entities, f, indent=2) + print("ambiguous value dict saved!") + + return single_token_values, multi_token_values, ambiguous_entities + + def preprocess_main(self, save_path=None, is_test=False): + """ """ + data = {} + count = 0 + self.unique_da = {} + ordered_sysact_dict = {} + for fn, raw_dial in tqdm(list(self.convlab_data.items())): + count += 1 + # if count == 100: + # break + + compressed_goal = {} # for every dialog, keep track the goal, domains, requests + dial_domains, dial_reqs = [], [] + for dom, g in raw_dial["goal"].items(): + if dom != "topic" and dom != "message" and g: + if g.get("reqt"): # request info. eg. 
postcode/address/phone + # normalize request slots + for i, req_slot in enumerate(g["reqt"]): + if ontology.normlize_slot_names.get(req_slot): + g["reqt"][i] = ontology.normlize_slot_names[req_slot] + dial_reqs.append(g["reqt"][i]) + compressed_goal[dom] = g + if dom in ontology.all_domains: + dial_domains.append(dom) + + dial_reqs = list(set(dial_reqs)) + + dial = {"goal": compressed_goal, "log": []} + single_turn = {} + constraint_dict = OrderedDict() + prev_constraint_dict = {} + prev_turn_domain = ["general"] + ordered_sysact_dict[fn] = {} + + for turn_num, dial_turn in enumerate(raw_dial["log"]): + # for user turn, have text + # sys turn: text, belief states(metadata), dialog_act, span_info + dial_state = dial_turn["metadata"] + if not dial_state: # user + # delexicalize user utterance, either by annotation or by val_dict + u = " ".join(clean_text(dial_turn["text"]).split()) + + # NOTE: Commenting out delexicalisation because it is not used and + # breaks when I use generated user dialogues for some reason + + # if dial_turn["span_info"]: + # u_delex = clean_text(self.delex_by_annotation(dial_turn)) + # else: + # u_delex = self.delex_by_valdict(dial_turn["text"]) + + single_turn["user"] = u + # single_turn["user_delex"] = u_delex + + else: # system + # delexicalize system response, either by annotation or by val_dict + if dial_turn["span_info"]: + s_delex = clean_text(self.delex_by_annotation(dial_turn)) + else: + if not dial_turn["text"]: + print(fn) + s_delex = self.delex_by_valdict(dial_turn["text"]) + single_turn["resp"] = s_delex + + # get belief state, semi=informable/book=requestable, put into constraint_dict + for domain in dial_domains: + if not constraint_dict.get(domain): + constraint_dict[domain] = OrderedDict() + info_sv = dial_state[domain]["semi"] + for s, v in info_sv.items(): + s, v = clean_slot_values(domain, s, v) + if len(v.split()) > 1: + v = " ".join([token.text for token in self.nlp(v)]).strip() + if v != "": + constraint_dict[domain][s] = v + book_sv = dial_state[domain]["book"] + for s, v in book_sv.items(): + if s == "booked": + continue + s, v = clean_slot_values(domain, s, v) + if len(v.split()) > 1: + v = " ".join([token.text for token in self.nlp(v)]).strip() + if v != "": + constraint_dict[domain][s] = v + + constraints = [] # list in format of [domain] slot value + cons_delex = [] + turn_dom_bs = [] + for domain, info_slots in constraint_dict.items(): + if info_slots: + constraints.append("[" + domain + "]") + cons_delex.append("[" + domain + "]") + for slot, value in info_slots.items(): + constraints.append(slot) + constraints.extend(value.split()) + cons_delex.append(slot) + if domain not in prev_constraint_dict: + turn_dom_bs.append(domain) + elif prev_constraint_dict[domain] != constraint_dict[domain]: + turn_dom_bs.append(domain) + + sys_act_dict = {} + turn_dom_da = set() + for act in dial_turn["dialog_act"]: + d, a = act.split("-") # split domain-act + turn_dom_da.add(d) + turn_dom_da = list(turn_dom_da) + if len(turn_dom_da) != 1 and "general" in turn_dom_da: + turn_dom_da.remove("general") + if len(turn_dom_da) != 1 and "booking" in turn_dom_da: + turn_dom_da.remove("booking") + + # get turn domain + turn_domain = turn_dom_bs + for dom in turn_dom_da: + if dom != "booking" and dom not in turn_domain: + turn_domain.append(dom) + if not turn_domain: + turn_domain = prev_turn_domain + if len(turn_domain) == 2 and "general" in turn_domain: + turn_domain.remove("general") + if len(turn_domain) == 2: + if len(prev_turn_domain) == 1 and 
prev_turn_domain[0] == turn_domain[1]: + turn_domain = turn_domain[::-1] + + # get system action + for dom in turn_domain: + sys_act_dict[dom] = {} + add_to_last_collect = [] + booking_act_map = {"inform": "offerbook", "book": "offerbooked"} + for act, params in dial_turn["dialog_act"].items(): + if act == "general-greet": + continue + d, a = act.split("-") + if d == "general" and d not in sys_act_dict: + sys_act_dict[d] = {} + if d == "booking": + d = turn_domain[0] + a = booking_act_map.get(a, a) + add_p = [] + for param in params: + p = param[0] + if p == "none": + continue + elif ontology.da_abbr_to_slot_name.get(p): + p = ontology.da_abbr_to_slot_name[p] + if p not in add_p: + add_p.append(p) + add_to_last = True if a in ["request", "reqmore", "bye", "offerbook"] else False + if add_to_last: + add_to_last_collect.append((d, a, add_p)) + else: + sys_act_dict[d][a] = add_p + for d, a, add_p in add_to_last_collect: + sys_act_dict[d][a] = add_p + + for d in copy.copy(sys_act_dict): + acts = sys_act_dict[d] + if not acts: + del sys_act_dict[d] + if "inform" in acts and "offerbooked" in acts: + for s in sys_act_dict[d]["inform"]: + sys_act_dict[d]["offerbooked"].append(s) + del sys_act_dict[d]["inform"] + + ordered_sysact_dict[fn][len(dial["log"])] = sys_act_dict + + sys_act = [] + if "general-greet" in dial_turn["dialog_act"]: + sys_act.extend(["[general]", "[greet]"]) + for d, acts in sys_act_dict.items(): + sys_act += ["[" + d + "]"] + for a, slots in acts.items(): + self.unique_da[d + "-" + a] = 1 + sys_act += ["[" + a + "]"] + sys_act += slots + + # get db pointers + matnums = self.db.get_match_num(constraint_dict) + match_dom = turn_domain[0] if len(turn_domain) == 1 else turn_domain[1] + match = matnums[match_dom] + dbvec = self.db.addDBPointer(match_dom, match) + bkvec = self.db.addBookingPointer(dial_turn["dialog_act"]) + + # 4 database pointer for domains, 2 for booking + single_turn["pointer"] = ",".join([str(d) for d in dbvec + bkvec]) + single_turn["match"] = str(match) + single_turn["constraint"] = " ".join(constraints) + single_turn["cons_delex"] = " ".join(cons_delex) + single_turn["sys_act"] = " ".join(sys_act) + single_turn["turn_num"] = len(dial["log"]) + single_turn["turn_domain"] = " ".join(["[" + d + "]" for d in turn_domain]) + + prev_turn_domain = copy.deepcopy(turn_domain) + prev_constraint_dict = copy.deepcopy(constraint_dict) + + if "user" in single_turn: + dial["log"].append(single_turn) + for t in single_turn["user"].split() + single_turn["resp"].split() + constraints + sys_act: + self.vocab.add_word(t) + + # NOTE: Commenting out delexicalisation because it is not used and + # breaks when I use generated user dialogues for some reason + + # for t in single_turn["user_delex"].split(): + # if "[" in t and "]" in t and not t.startswith("[") and not t.endswith("]"): + # single_turn["user_delex"].replace(t, t[t.index("[") : t.index("]") + 1]) + # elif not self.vocab.has_word(t): + # self.vocab.add_word(t) + + single_turn = {} + + data[fn] = dial + # pprint(dial) + # if count == 20: + # break + self.vocab.construct() + self.vocab.save_vocab("data/preprocessed_gen_usr_utts/UBAR/multi-woz-processed/vocab") + with open("data/interim/gen_usr_utts/multi-woz-analysis/dialog_acts.json", "w") as f: + json.dump(ordered_sysact_dict, f, indent=2) + with open("data/interim/gen_usr_utts/multi-woz-analysis/dialog_act_type.json", "w") as f: + json.dump(self.unique_da, f, indent=2) + return data + + +if __name__ == "__main__": + db_paths = { + "attraction": 
"data/raw/UBAR/db/attraction_db.json", + "hospital": "data/raw/UBAR/db/hospital_db.json", + "hotel": "data/raw/UBAR/db/hotel_db.json", + "police": "data/raw/UBAR/db/police_db.json", + "restaurant": "data/raw/UBAR/db/restaurant_db.json", + "taxi": "data/raw/UBAR/db/taxi_db.json", + "train": "data/raw/UBAR/db/train_db.json", + } + get_db_values("data/raw/UBAR/db/value_set.json") + preprocess_db(db_paths) + dh = DataPreprocessor() + data = dh.preprocess_main() + if not os.path.exists("data/preprocessed_gen_usr_utts/UBAR/multi-woz-processed"): + os.mkdir("data/preprocessed_gen_usr_utts/UBAR/multi-woz-processed") + + with open("data/preprocessed_gen_usr_utts/UBAR/multi-woz-processed/data_for_ubar.json", "w") as f: + json.dump(data, f, indent=2) diff --git a/scripts/UBAR_code/preprocess2.1.py b/scripts/UBAR_code/preprocess2.1.py new file mode 100644 index 0000000000000000000000000000000000000000..8fa75f9c338269130f3d97265dba0ca0f6d6cf13 --- /dev/null +++ b/scripts/UBAR_code/preprocess2.1.py @@ -0,0 +1,585 @@ +import copy +import json +import os +import re +import zipfile +from collections import OrderedDict + +import spacy +from tqdm import tqdm + +from crazyneuraluser.UBAR_code import ontology, utils +from crazyneuraluser.UBAR_code.clean_dataset import clean_slot_values, clean_text +from crazyneuraluser.UBAR_code.config import global_config as cfg +from crazyneuraluser.UBAR_code.db_ops import MultiWozDB + + +def get_db_values( + value_set_path, +): # value_set.json, all the domain[slot] values in datasets + processed = {} + bspn_word = [] + nlp = spacy.load("en_core_web_sm") + + with open(value_set_path, "r") as f: # read value set file in lower + value_set = json.loads(f.read().lower()) + + with open("db/ontology.json", "r") as f: # read ontology in lower, all the domain-slot values + otlg = json.loads(f.read().lower()) + + for ( + domain, + slots, + ) in value_set.items(): # add all informable slots to bspn_word, create lists holder for values + processed[domain] = {} + bspn_word.append("[" + domain + "]") + for slot, values in slots.items(): + s_p = ontology.normlize_slot_names.get(slot, slot) + if s_p in ontology.informable_slots[domain]: + bspn_word.append(s_p) + processed[domain][s_p] = [] + + for ( + domain, + slots, + ) in value_set.items(): # add all words of values of informable slots to bspn_word + for slot, values in slots.items(): + s_p = ontology.normlize_slot_names.get(slot, slot) + if s_p in ontology.informable_slots[domain]: + for v in values: + _, v_p = clean_slot_values(domain, slot, v) + v_p = " ".join([token.text for token in nlp(v_p)]).strip() + processed[domain][s_p].append(v_p) + for x in v_p.split(): + if x not in bspn_word: + bspn_word.append(x) + + for domain_slot, values in otlg.items(): # split domain-slots to domains and slots + domain, slot = domain_slot.split("-") + if domain == "bus": + domain = "taxi" + if slot == "price range": + slot = "pricerange" + if slot == "book stay": + slot = "stay" + if slot == "book day": + slot = "day" + if slot == "book people": + slot = "people" + if slot == "book time": + slot = "time" + if slot == "arrive by": + slot = "arrive" + if slot == "leave at": + slot = "leave" + if slot == "leaveat": + slot = "leave" + if slot not in processed[domain]: # add all slots and words of values if not already in processed and bspn_word + processed[domain][slot] = [] + bspn_word.append(slot) + for v in values: + _, v_p = clean_slot_values(domain, slot, v) + v_p = " ".join([token.text for token in nlp(v_p)]).strip() + if v_p not in 
processed[domain][slot]: + processed[domain][slot].append(v_p) + for x in v_p.split(): + if x not in bspn_word: + bspn_word.append(x) + + with open(value_set_path.replace(".json", "_processed.json"), "w") as f: + json.dump(processed, f, indent=2) # save processed.json + with open("data/preprocessed/UBAR/multi-woz-processed/bspn_word_collection.json", "w") as f: + json.dump(bspn_word, f, indent=2) # save bspn_word + + print("DB value set processed! ") + + +def preprocess_db(db_paths): # apply clean_slot_values to all dbs + dbs = {} + nlp = spacy.load("en_core_web_sm") + for domain in ontology.all_domains: + with open(db_paths[domain], "r") as f: # for every db_domain, read json file + dbs[domain] = json.loads(f.read().lower()) + for idx, entry in enumerate(dbs[domain]): # entry has information about slots of said domain + new_entry = copy.deepcopy(entry) + for key, value in entry.items(): # key = slot + if type(value) is not str: + continue + del new_entry[key] + key, value = clean_slot_values(domain, key, value) + tokenize_and_back = " ".join([token.text for token in nlp(value)]).strip() + new_entry[key] = tokenize_and_back + dbs[domain][idx] = new_entry + with open(db_paths[domain].replace(".json", "_processed.json"), "w") as f: + json.dump(dbs[domain], f, indent=2) + print("[%s] DB processed! " % domain) + + +# 2.1 +class DataPreprocessor(object): + def __init__(self): + self.nlp = spacy.load("en_core_web_sm") + self.db = MultiWozDB(cfg.dbs) # load all processed dbs + # data_path = 'data/multi-woz/annotated_user_da_with_span_full.json' + data_path = "data/raw/UBAR/MultiWOZ_2.1/data.json" + archive = zipfile.ZipFile(data_path + ".zip", "r") + self.convlab_data = json.loads(archive.open(data_path.split("/")[-1], "r").read().lower()) + # self.delex_sg_valdict_path = 'data/multi-woz-processed/delex_single_valdict.json' + # self.delex_mt_valdict_path = 'data/multi-woz-processed/delex_multi_valdict.json' + # self.ambiguous_val_path = 'data/multi-woz-processed/ambiguous_values.json' + # self.delex_refs_path = 'data/multi-woz-processed/reference_no.json' + self.delex_sg_valdict_path = "data/preprocessed/UBAR/multi-woz-2.1-processed/delex_single_valdict.json" + self.delex_mt_valdict_path = "data/preprocessed/UBAR/multi-woz-2.1-processed/delex_multi_valdict.json" + self.ambiguous_val_path = "data/preprocessed/UBAR/multi-woz-2.1-processed/ambiguous_values.json" + self.delex_refs_path = "data/preprocessed/UBAR/multi-woz-2.1-processed/reference_no.json" + self.delex_refs = json.loads(open(self.delex_refs_path, "r").read()) + if not os.path.exists(self.delex_sg_valdict_path): + ( + self.delex_sg_valdict, + self.delex_mt_valdict, + self.ambiguous_vals, + ) = self.get_delex_valdict() + else: + self.delex_sg_valdict = json.loads(open(self.delex_sg_valdict_path, "r").read()) + self.delex_mt_valdict = json.loads(open(self.delex_mt_valdict_path, "r").read()) + self.ambiguous_vals = json.loads(open(self.ambiguous_val_path, "r").read()) + + self.vocab = utils.Vocab(cfg.vocab_size) + + def delex_by_annotation(self, dial_turn): + # add by yyy in 13:48 0803 + u = dial_turn["text"].split() + # u = my_clean_text(dial_turn['text']).split() + ## + span = dial_turn["span_info"] + for s in span: + slot = s[1] + if slot == "open": + continue + if ontology.da_abbr_to_slot_name.get(slot): + slot = ontology.da_abbr_to_slot_name[slot] + for idx in range(s[3], s[4] + 1): + u[idx] = "" + try: + u[s[3]] = "[value_" + slot + "]" + except Exception: + u[5] = "[value_" + slot + "]" + u_delex = " ".join([t for t in u if t != ""]) 
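+        # The replace() calls below merge placeholders duplicated by adjacent span annotations,
+        # e.g. "[value_address] , [value_address]" collapses to a single "[value_address]".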
+ u_delex = u_delex.replace("[value_address] , [value_address] , [value_address]", "[value_address]") + u_delex = u_delex.replace("[value_address] , [value_address]", "[value_address]") + u_delex = u_delex.replace("[value_name] [value_name]", "[value_name]") + u_delex = u_delex.replace("[value_name]([value_phone] )", "[value_name] ( [value_phone] )") + return u_delex + + def delex_by_valdict(self, text): + text = clean_text(text) + + text = re.sub(r"\d{5}\s?\d{5,7}", "[value_phone]", text) + text = re.sub(r"\d[\s-]stars?", "[value_stars]", text) + text = re.sub(r"\$\d+|\$?\d+.?(\d+)?\s(pounds?|gbps?)", "[value_price]", text) + text = re.sub(r"tr[\d]{4}", "[value_id]", text) + text = re.sub( + r"([a-z]{1}[\. ]?[a-z]{1}[\. ]?\d{1,2}[, ]+\d{1}[\. ]?[a-z]{1}[\. ]?[a-z]{1}|[a-z]{2}\d{2}[a-z]{2})", + "[value_postcode]", + text, + ) + + for value, slot in self.delex_mt_valdict.items(): + text = text.replace(value, "[value_%s]" % slot) + + for value, slot in self.delex_sg_valdict.items(): + tokens = text.split() + for idx, tk in enumerate(tokens): + if tk == value: + tokens[idx] = "[value_%s]" % slot + text = " ".join(tokens) + + for ambg_ent in self.ambiguous_vals: + start_idx = text.find(" " + ambg_ent) # ely is a place, but appears in words like moderately + if start_idx == -1: + continue + front_words = text[:start_idx].split() + ent_type = "time" if ":" in ambg_ent else "place" + + for fw in front_words[::-1]: + if fw in [ + "arrive", + "arrives", + "arrived", + "arriving", + "arrival", + "destination", + "there", + "reach", + "to", + "by", + "before", + ]: + slot = "[value_arrive]" if ent_type == "time" else "[value_destination]" + text = re.sub(" " + ambg_ent, " " + slot, text) + elif fw in [ + "leave", + "leaves", + "leaving", + "depart", + "departs", + "departing", + "departure", + "from", + "after", + "pulls", + ]: + slot = "[value_leave]" if ent_type == "time" else "[value_departure]" + text = re.sub(" " + ambg_ent, " " + slot, text) + + text = text.replace("[value_car] [value_car]", "[value_car]") + return text + + def get_delex_valdict( + self, + ): + skip_entry_type = { + "taxi": ["taxi_phone"], + "police": ["id"], + "hospital": ["id"], + "hotel": [ + "id", + "location", + "internet", + "parking", + "takesbookings", + "stars", + "price", + "n", + "postcode", + "phone", + ], + "attraction": [ + "id", + "location", + "pricerange", + "price", + "openhours", + "postcode", + "phone", + ], + "train": ["price", "id"], + "restaurant": [ + "id", + "location", + "introduction", + "signature", + "type", + "postcode", + "phone", + ], + } + entity_value_to_slot = {} + ambiguous_entities = [] + for domain, db_data in self.db.dbs.items(): + print("Processing entity values in [%s]" % domain) + if domain != "taxi": + for db_entry in db_data: + for slot, value in db_entry.items(): + if slot not in skip_entry_type[domain]: + if type(value) is not str: + raise TypeError("value '%s' in domain '%s' should be rechecked" % (slot, domain)) + else: + slot, value = clean_slot_values(domain, slot, value) + value = " ".join([token.text for token in self.nlp(value)]).strip() + if value in entity_value_to_slot and entity_value_to_slot[value] != slot: + # print(value, ": ",entity_value_to_slot[value], slot) + ambiguous_entities.append(value) + entity_value_to_slot[value] = slot + else: # taxi db specific + db_entry = db_data[0] + for slot, ent_list in db_entry.items(): + if slot not in skip_entry_type[domain]: + for ent in ent_list: + entity_value_to_slot[ent] = "car" + ambiguous_entities = set(ambiguous_entities) 
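+        # values observed under more than one slot are ambiguous; they are dropped from the value->slot map here
+        # and resolved later in delex_by_valdict, which disambiguates them using the preceding context words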
+ ambiguous_entities.remove("cambridge") + ambiguous_entities = list(ambiguous_entities) + for amb_ent in ambiguous_entities: # departure or destination? arrive time or leave time? + entity_value_to_slot.pop(amb_ent) + entity_value_to_slot["parkside"] = "address" + entity_value_to_slot["parkside, cambridge"] = "address" + entity_value_to_slot["cambridge belfry"] = "name" + entity_value_to_slot["hills road"] = "address" + entity_value_to_slot["hills rd"] = "address" + entity_value_to_slot["Parkside Police Station"] = "name" + + single_token_values = {} + multi_token_values = {} + for val, slt in entity_value_to_slot.items(): + if val in ["cambridge"]: + continue + if len(val.split()) > 1: + multi_token_values[val] = slt + else: + single_token_values[val] = slt + + with open(self.delex_sg_valdict_path, "w") as f: + single_token_values = OrderedDict( + sorted(single_token_values.items(), key=lambda kv: len(kv[0]), reverse=True) + ) + json.dump(single_token_values, f, indent=2) + print("single delex value dict saved!") + with open(self.delex_mt_valdict_path, "w") as f: + multi_token_values = OrderedDict( + sorted(multi_token_values.items(), key=lambda kv: len(kv[0]), reverse=True) + ) + json.dump(multi_token_values, f, indent=2) + print("multi delex value dict saved!") + with open(self.ambiguous_val_path, "w") as f: + json.dump(ambiguous_entities, f, indent=2) + print("ambiguous value dict saved!") + + return single_token_values, multi_token_values, ambiguous_entities + + def preprocess_main(self, save_path=None, is_test=False): + """ """ + data = {} + count = 0 + self.unique_da = {} + ordered_sysact_dict = {} + # yyy + for fn, raw_dial in tqdm(list(self.convlab_data.items())): + if fn in [ + "pmul4707.json", + "pmul2245.json", + "pmul4776.json", + "pmul3872.json", + "pmul4859.json", + ]: + continue + count += 1 + # if count == 100: + # break + + compressed_goal = {} # for every dialog, keep track the goal, domains, requests + dial_domains, dial_reqs = [], [] + for dom, g in raw_dial["goal"].items(): + if dom != "topic" and dom != "message" and g: + if g.get("reqt"): # request info. eg. 
postcode/address/phone + for i, req_slot in enumerate(g["reqt"]): # normalize request slots + if ontology.normlize_slot_names.get(req_slot): + g["reqt"][i] = ontology.normlize_slot_names[req_slot] + dial_reqs.append(g["reqt"][i]) + compressed_goal[dom] = g + if dom in ontology.all_domains: + dial_domains.append(dom) + + dial_reqs = list(set(dial_reqs)) + + dial = {"goal": compressed_goal, "log": []} + single_turn = {} + constraint_dict = OrderedDict() + prev_constraint_dict = {} + prev_turn_domain = ["general"] + ordered_sysact_dict[fn] = {} + + for turn_num, dial_turn in enumerate(raw_dial["log"]): + # for user turn, have text + # sys turn: text, belief states(metadata), dialog_act, span_info + dial_state = dial_turn["metadata"] + dial_turn["text"] = " ".join([t.text for t in self.nlp(dial_turn["text"])]) + if not dial_state: # user + # delexicalize user utterance, either by annotation or by val_dict + u = " ".join(clean_text(dial_turn["text"]).split()) + if "span_info" in dial_turn and dial_turn["span_info"]: + u_delex = clean_text(self.delex_by_annotation(dial_turn)) + else: + u_delex = self.delex_by_valdict(dial_turn["text"]) + + single_turn["user"] = u + single_turn["user_delex"] = u_delex + + else: # system + # delexicalize system response, either by annotation or by val_dict + if "span_info" in dial_turn and dial_turn["span_info"]: + s_delex = clean_text(self.delex_by_annotation(dial_turn)) + else: + if not dial_turn["text"]: + print(fn) + s_delex = self.delex_by_valdict(dial_turn["text"]) + single_turn["resp"] = s_delex + single_turn["nodelx_resp"] = " ".join(clean_text(dial_turn["text"]).split()) + + # get belief state, semi=informable/book=requestable, put into constraint_dict + for domain in dial_domains: + if not constraint_dict.get(domain): + constraint_dict[domain] = OrderedDict() + info_sv = dial_state[domain]["semi"] + for s, v in info_sv.items(): + s, v = clean_slot_values(domain, s, v) + if len(v.split()) > 1: + v = " ".join([token.text for token in self.nlp(v)]).strip() + if v != "": + constraint_dict[domain][s] = v + book_sv = dial_state[domain]["book"] + for s, v in book_sv.items(): + if s == "booked": + continue + s, v = clean_slot_values(domain, s, v) + if len(v.split()) > 1: + v = " ".join([token.text for token in self.nlp(v)]).strip() + if v != "": + constraint_dict[domain][s] = v + + constraints = [] # list in format of [domain] slot value + cons_delex = [] + turn_dom_bs = [] + for domain, info_slots in constraint_dict.items(): + if info_slots: + constraints.append("[" + domain + "]") + cons_delex.append("[" + domain + "]") + for slot, value in info_slots.items(): + constraints.append(slot) + constraints.extend(value.split()) + cons_delex.append(slot) + if domain not in prev_constraint_dict: + turn_dom_bs.append(domain) + elif prev_constraint_dict[domain] != constraint_dict[domain]: + turn_dom_bs.append(domain) + + sys_act_dict = {} + turn_dom_da = set() + for act in dial_turn["dialog_act"]: + d, a = act.split("-") # split domain-act + turn_dom_da.add(d) + turn_dom_da = list(turn_dom_da) + if len(turn_dom_da) != 1 and "general" in turn_dom_da: + turn_dom_da.remove("general") + if len(turn_dom_da) != 1 and "booking" in turn_dom_da: + turn_dom_da.remove("booking") + + # get turn domain + turn_domain = turn_dom_bs + for dom in turn_dom_da: + if dom != "booking" and dom not in turn_domain: + turn_domain.append(dom) + if not turn_domain: + turn_domain = prev_turn_domain + if len(turn_domain) == 2 and "general" in turn_domain: + turn_domain.remove("general") + if 
len(turn_domain) == 2: + if len(prev_turn_domain) == 1 and prev_turn_domain[0] == turn_domain[1]: + turn_domain = turn_domain[::-1] + + # get system action + for dom in turn_domain: + sys_act_dict[dom] = {} + add_to_last_collect = [] + booking_act_map = {"inform": "offerbook", "book": "offerbooked"} + for act, params in dial_turn["dialog_act"].items(): + if act == "general-greet": + continue + d, a = act.split("-") + if d == "general" and d not in sys_act_dict: + sys_act_dict[d] = {} + if d == "booking": + d = turn_domain[0] + a = booking_act_map.get(a, a) + add_p = [] + for param in params: + p = param[0] + if p == "none": + continue + elif ontology.da_abbr_to_slot_name.get(p): + p = ontology.da_abbr_to_slot_name[p] + if p not in add_p: + add_p.append(p) + add_to_last = True if a in ["request", "reqmore", "bye", "offerbook"] else False + if add_to_last: + add_to_last_collect.append((d, a, add_p)) + else: + sys_act_dict[d][a] = add_p + for d, a, add_p in add_to_last_collect: + sys_act_dict[d][a] = add_p + + for d in copy.copy(sys_act_dict): + acts = sys_act_dict[d] + if not acts: + del sys_act_dict[d] + if "inform" in acts and "offerbooked" in acts: + for s in sys_act_dict[d]["inform"]: + sys_act_dict[d]["offerbooked"].append(s) + del sys_act_dict[d]["inform"] + + ordered_sysact_dict[fn][len(dial["log"])] = sys_act_dict + + sys_act = [] + if "general-greet" in dial_turn["dialog_act"]: + sys_act.extend(["[general]", "[greet]"]) + for d, acts in sys_act_dict.items(): + sys_act += ["[" + d + "]"] + for a, slots in acts.items(): + self.unique_da[d + "-" + a] = 1 + sys_act += ["[" + a + "]"] + sys_act += slots + + # get db pointers + matnums = self.db.get_match_num(constraint_dict) + match_dom = turn_domain[0] if len(turn_domain) == 1 else turn_domain[1] + match = matnums[match_dom] + dbvec = self.db.addDBPointer(match_dom, match) + bkvec = self.db.addBookingPointer(dial_turn["dialog_act"]) + + single_turn["pointer"] = ",".join( + [str(d) for d in dbvec + bkvec] + ) # 4 database pointer for domains, 2 for booking + single_turn["match"] = str(match) + single_turn["constraint"] = " ".join(constraints) + single_turn["cons_delex"] = " ".join(cons_delex) + single_turn["sys_act"] = " ".join(sys_act) + single_turn["turn_num"] = len(dial["log"]) + single_turn["turn_domain"] = " ".join(["[" + d + "]" for d in turn_domain]) + + prev_turn_domain = copy.deepcopy(turn_domain) + prev_constraint_dict = copy.deepcopy(constraint_dict) + + if "user" in single_turn: + dial["log"].append(single_turn) + for t in single_turn["user"].split() + single_turn["resp"].split() + constraints + sys_act: + self.vocab.add_word(t) + for t in single_turn["user_delex"].split(): + if "[" in t and "]" in t and not t.startswith("[") and not t.endswith("]"): + single_turn["user_delex"].replace(t, t[t.index("[") : t.index("]") + 1]) + elif not self.vocab.has_word(t): + self.vocab.add_word(t) + + single_turn = {} + + data[fn] = dial + # pprint(dial) + # if count == 20: + # break + self.vocab.construct() + self.vocab.save_vocab("data/preprocessed/UBAR/multi-woz-2.1-processed/vocab") + with open("data/interim/multi-woz-2.1-analysis/dialog_acts.json", "w") as f: + json.dump(ordered_sysact_dict, f, indent=2) + with open("data/interim/multi-woz-2.1-analysis/dialog_act_type.json", "w") as f: + json.dump(self.unique_da, f, indent=2) + return data + + +if __name__ == "__main__": + db_paths = { + "attraction": "db/raw/attraction_db.json", + "hospital": "db/raw/hospital_db.json", + "hotel": "db/raw/hotel_db.json", + "police": 
"db/raw/police_db.json", + "restaurant": "db/raw/restaurant_db.json", + "taxi": "db/raw/taxi_db.json", + "train": "db/raw/train_db.json", + } + # get_db_values('db/value_set.json') # + # preprocess_db(db_paths) + if not os.path.exists("data/preprocessed/UBAR/multi-woz-2.1-processed"): + os.mkdir("data/preprocessed/UBAR/multi-woz-2.1-processed") + dh = DataPreprocessor() + data = dh.preprocess_main() + + with open("data/preprocessed/UBAR/multi-woz-2.1-processed/data_for_ubar.json", "w") as f: + json.dump(data, f, indent=2) diff --git a/scripts/UBAR_code/train_ubar.py b/scripts/UBAR_code/train_ubar.py new file mode 100644 index 0000000000000000000000000000000000000000..f9f0c5ad064559d71253cf98b306a0246ab75fb6 --- /dev/null +++ b/scripts/UBAR_code/train_ubar.py @@ -0,0 +1,697 @@ +import argparse +import json +import logging +import os +import random +import time +import warnings + +import numpy as np +import torch +import torch.nn as nn +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm +from transformers import GPT2LMHeadModel, GPT2Tokenizer +from transformers.optimization import AdamW, get_linear_schedule_with_warmup + +import wandb +from crazyneuraluser.UBAR_code.config import global_config as cfg +from crazyneuraluser.UBAR_code.eval import MultiWozEvaluator +from crazyneuraluser.UBAR_code.reader import MultiWozReader + +# from config21 import global_config as cfg # global, already initialized + + +warnings.filterwarnings("ignore") + + +class Model(object): + def __init__(self, device): + self.device = device + # initialize tokenizer + self.tokenizer = GPT2Tokenizer.from_pretrained(cfg.gpt_path) + # cfg.tokenizer = tokenizer + + # initialize multiwoz reader + self.reader = MultiWozReader(self.tokenizer) + + # create model: gpt2 + self.model = GPT2LMHeadModel.from_pretrained(cfg.gpt_path) + if cfg.mode == "train": + self.model.resize_token_embeddings(len(self.tokenizer)) + self.model.to(self.device) # single gpu + + # + self.evaluator = MultiWozEvaluator(self.reader) + if cfg.save_log and cfg.mode == "train": + self.tb_writer = SummaryWriter(log_dir="./log") + else: + self.tb_writer = None + + def get_optimizers(self): + """ + Setup the optimizer and the learning rate scheduler. 
+ + from transformers.Trainer + + parameters from cfg: lr (1e-3); warmup_steps + """ + # Prepare optimizer and schedule (linear warmup and decay) + no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": cfg.weight_decay, + }, + { + "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], + "weight_decay": 0.0, + }, + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=cfg.lr) + num_training_steps = ( + self.reader.set_stats["train"]["num_dials"] + * cfg.epoch_num + // (cfg.gradient_accumulation_steps * cfg.batch_size) + ) + num_warmup_steps = cfg.warmup_steps if cfg.warmup_steps >= 0 else int(num_training_steps * 0.2) + scheduler = get_linear_schedule_with_warmup( + optimizer, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + ) + return optimizer, scheduler + + def log_first_inputs(self, inputs): + tokenizer = self.tokenizer + logging.info("**** Input Examples: ****") + for context in inputs["contexts"][:4]: + # ubar = tokenizer.convert_ids_to_tokens(context) + # ubar = tokenizer.convert_tokens_to_string(context) + # ubar = " ".join(ubar) + ubar = tokenizer.decode(context) + logging.info(ubar) + + def add_torch_input(self, inputs): + # to tensor and to device + contexts_tensor = torch.from_numpy(inputs["contexts_np"]).long() + contexts_tensor = contexts_tensor.to(self.device) + inputs["contexts_tensor"] = contexts_tensor + return inputs + + def add_torch_input_eval(self, inputs): + # inputs: context + inputs["context_tensor"] = torch.tensor([inputs["context"]]).to(self.device) + return inputs + + def calculate_loss_and_accuracy(self, outputs, labels): + # GPT2-chicahat/train.py + lm_logits = outputs[0] + + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + + pad_id = cfg.pad_id + loss_fct = nn.CrossEntropyLoss(ignore_index=pad_id, reduction="sum") + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + # avg loss + not_ignore = shift_labels.ne(pad_id) + num_targets = not_ignore.long().sum().item() + + loss /= num_targets + return loss + + def train(self): + """ + UBARU + """ + + wandb.init( + # Set the project where this run will be logged + project="E2E User Simulator (Alistair)", + entity="byrne-lab", + # We pass a run name (otherwise it’ll be randomly assigned, like sunshine-lollypop-10) + name=cfg.wandb_train_run_name, + # Track hyperparameters and run metadata + config={ + "dataset": cfg.data_path, + "gpt_path": cfg.gpt_path, + "learning_rate": cfg.lr, + "warmup_steps": cfg.warmup_steps, + "gradient_accumulation_steps": cfg.gradient_accumulation_steps, + "batch_size": cfg.batch_size, + "epochs": cfg.epoch_num, + }, + ) + + all_batches = self.reader.get_batches("train") + # compute num_training_steps in get_batches() + optimizer, scheduler = self.get_optimizers() + + # log info + set_stats = self.reader.set_stats["train"] + logging.info("***** Running training *****") + logging.info( + " Num Training steps(one turn in a batch of dialogs) per epoch = %d", + set_stats["num_training_steps_per_epoch"], + ) + logging.info(" Num Turns = %d", set_stats["num_turns"]) + logging.info(" Num Dialogs = %d", set_stats["num_dials"]) + logging.info(" Num Epochs = %d", cfg.epoch_num) + logging.info(" Batch size = %d", cfg.batch_size) + logging.info(" Gradient Accumulation steps = %d", cfg.gradient_accumulation_steps) 
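+        # total optimization steps = num_dials * epoch_num // (gradient_accumulation_steps * batch_size),
+        # the same formula used to size the linear warmup schedule in get_optimizers()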
+ logging.info( + " Total optimization steps = %d", + set_stats["num_dials"] * cfg.epoch_num // (cfg.gradient_accumulation_steps * cfg.batch_size), + ) + + # tb writer + if self.tb_writer is not None: + self.tb_writer.add_text("cfg", json.dumps(cfg.__dict__, indent=2)) + # self.tb_writer.add_hparams(self.args.to_sanitized_dict(), metric_dict={}) + + log_inputs = 2 + global_step = 0 + # sw = time.time() + + for epoch in range(cfg.epoch_num): + epoch_step = 0 + tr_loss = 0.0 + logging_loss = 0.0 + btm = time.time() + oom_time = 0 + self.model.zero_grad() + + data_iterator = self.reader.get_nontranspose_data_iterator(all_batches) + + for batch_idx, dial_batch in enumerate(data_iterator): + inputs = self.reader.convert_batch_session(dial_batch) + try: # avoid OOM + self.model.train() + if log_inputs > 0: # log inputs for the very first two turns + self.log_first_inputs(inputs) + log_inputs -= 1 + + # to tensor + inputs = self.add_torch_input(inputs) + # loss + outputs = self.model(inputs["contexts_tensor"]) + # outputs = self.model(inputs['contexts_tensor']) # debugging with GPT2Model + loss = self.calculate_loss_and_accuracy(outputs, labels=inputs["contexts_tensor"]) + loss.backward() + tr_loss += loss.item() + torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) + epoch_step += 1 + + # step, wrt gradient_accumulation_steps, clip grad norm + if (epoch_step + 1) % cfg.gradient_accumulation_steps == 0 or ( + # end of an epoch + (epoch_step + 1) + == set_stats["num_training_steps_per_epoch"] + ): + optimizer.step() + scheduler.step() + optimizer.zero_grad() + # global_step: actual step the optimizer took + global_step += 1 + + logs = {} # for tb writer + # logging: loss, lr... after certain amount of steps + if cfg.report_interval > 0 and global_step % cfg.report_interval == 0: + loss_scalar = (tr_loss - logging_loss) / cfg.report_interval + logging_loss = tr_loss + logs["loss"] = loss_scalar + logging.info( + "Global step: {}, epoch step: {}, interval loss: {:.4f}".format( + global_step, epoch_step, loss_scalar + ) + ) + + # validate + # add to tensorboard... + if cfg.evaluate_during_training and loss_scalar < 10: + results = self.validate(epoch) + for k, v in results.items(): + eval_key = "eval_{}".format(k) + logs[eval_key] = v + + if self.tb_writer: + for k, v in logs.items(): + self.tb_writer.add_scalar(k, v, global_step) + # save model... 
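+                # a CUDA out-of-memory error raised anywhere in the try block is handled below: the batch size and
+                # max sequence length are logged, the CUDA cache is emptied, and training continues with the next batch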
+ + except RuntimeError as exception: + if "out of memory" in str(exception): + max_length = max(inputs["lengths"]) + oom_time += 1 + logging.info( + "WARNING: ran out of memory,times: {}, batch size: {}, max_len: {}".format( + oom_time, cfg.batch_size, max_length + ) + ) + if hasattr(torch.cuda, "empty_cache"): + torch.cuda.empty_cache() + else: + logging.info(str(exception)) + raise exception + logging.info("Train epoch time: {:.2f} min, epoch loss: {:.4f}".format((time.time() - btm) / 60, tr_loss)) + # save model after every epoch + # if epoch > 10 or tr_loss/epoch_step < 1: + self.save_model(epoch, tr_loss / epoch_step) + + wandb.log({"epoch loss": tr_loss}) + + # Mark the run as finished on wandb + wandb.finish() + + def save_model(self, epoch, loss): + save_path = os.path.join(cfg.exp_path, "epoch{}_trloss{:.2f}_gpt2".format(epoch + 1, loss)) + if not os.path.exists(save_path): + os.mkdir(save_path) + logging.info("Saving model checkpoint to %s", save_path) + # save gpt2 + self.model.save_pretrained(save_path) + # save tokenizer + self.tokenizer.save_pretrained(save_path) + # save cfg + + def validate(self, data="dev", do_test=False, epoch=0): + + if cfg.mode != "train": + wandb.init( + # Set the project where this run will be logged + project="E2E User Simulator (Alistair)", + entity="byrne-lab", + # We pass a run name (otherwise it’ll be randomly assigned, like sunshine-lollypop-10) + name=cfg.wandb_eval_run_name, + # Track hyperparameters and run metadata + config={ + "eval_load_path": cfg.eval_load_path, + "dataset": cfg.data_path, + "gpt_path": cfg.gpt_path, + "learning_rate": cfg.lr, + "warmup_steps": cfg.warmup_steps, + "gradient_accumulation_steps": cfg.gradient_accumulation_steps, + "batch_size": cfg.batch_size, + "epochs": cfg.epoch_num, + "data": data, + }, + ) + + test_data_at = wandb.Artifact(str(wandb.run.id + str(epoch)), type="predictions") + + # Create your W&B Table + column_names = [ + "dialog", + "turn_num", + "turn_domain", + "pointer", + "user", + "usdx", + "resp", + "bspn", + "bsdx", + "aspn", + "dspn", + "db", + "resp_gen", + "bspn_gen", + "aspn_gen", + "dspn_gen", + ] + val_table = wandb.Table(columns=column_names) + + # predict one dialog/ one turn at a time + self.model.eval() + + # all_batches = self.reader.get_batches('dev') + # data_iterator = self.reader.get_data_iterator(all_batches) + eval_data = self.reader.get_eval_data(data) + + set_stats = self.reader.set_stats[data] + logging.info("***** Running Evaluation *****") + logging.info(" Num Turns = %d", set_stats["num_turns"]) + # logging.info(" Num Dialogs = %d", set_stats['num_dials']) + + # valid_losses = [] + btm = time.time() + result_collection = {} + with torch.no_grad(): + # Adding this index to allow for quick testing of evaluation + dialogues_to_run = 1 + for dial_idx, dialog in tqdm(enumerate(eval_data)): + if dialogues_to_run == 0: + break + dialogues_to_run -= 1 + + pv_turn = {} + for turn_idx, turn in enumerate(dialog): + first_turn = turn_idx == 0 + inputs = self.reader.convert_turn_eval(turn, pv_turn, first_turn) + inputs = self.add_torch_input_eval(inputs) + + # fail to generate new tokens, if max_length not set + context_length = len(inputs["context"]) + if cfg.use_true_curr_bspn: # generate act, response + max_len = 60 + if not cfg.use_true_curr_aspn: + max_len = 80 + + outputs = self.model.generate( + input_ids=inputs["context_tensor"], + max_length=context_length + max_len, + temperature=0.7, # top_p=0.9, num_beams=4, + pad_token_id=self.tokenizer.eos_token_id, + 
eos_token_id=self.tokenizer.encode([""])[0], + ) + # no_repeat_ngram_size=4 + # turn['generated'] = self.tokenizer.decode(outputs[0]) + + # resp_gen, need to trim previous context + generated = outputs[0].cpu().numpy().tolist() + generated = generated[context_length - 1 :] + + try: + decoded = self.decode_generated_act_resp(generated) + except ValueError as exception: + logging.info(str(exception)) + logging.info(self.tokenizer.decode(generated)) + decoded = {"resp": [], "bspn": [], "aspn": []} + + else: # predict bspn, access db, then generate act and resp + outputs = self.model.generate( + input_ids=inputs["context_tensor"], + max_length=context_length + 60, + temperature=0.7, # top_p=0.9, num_beams=4, + pad_token_id=self.tokenizer.eos_token_id, + eos_token_id=self.tokenizer.encode([""])[0], + ) + generated_bs = outputs[0].cpu().numpy().tolist() + # generated_bs = generated_bs[context_length-1:] + bspn_gen = self.decode_generated_bspn(generated_bs[context_length - 1 :]) + # check DB result + if cfg.use_true_db_pointer: + # db_result = self.reader.bspan_to_DBpointer( + # self.tokenizer.decode(turn['bspn']), turn['turn_domain']) + db = turn["db"] + else: + db_result = self.reader.bspan_to_DBpointer( + self.tokenizer.decode(bspn_gen), turn["turn_domain"] + ) + db = self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize(" " + db_result + " ") + ) + self.tokenizer.encode([""]) + inputs["context_tensor_db"] = torch.tensor([inputs["context"][:-1] + bspn_gen + db]).to( + self.device + ) + context_length = len(inputs["context_tensor_db"][0]) + outputs_db = self.model.generate( + input_ids=inputs["context_tensor_db"], + max_length=context_length + 80, + temperature=0.7, # top_p=0.9, num_beams=4, + pad_token_id=self.tokenizer.eos_token_id, + eos_token_id=self.tokenizer.encode([""])[0], + ) + generated_ar = outputs_db[0].cpu().numpy().tolist() + generated_ar = generated_ar[context_length - 1 :] + try: + decoded = self.decode_generated_act_resp(generated_ar) + decoded["bspn"] = bspn_gen + except ValueError: + # NOTE: the below logging is commented out because when running evaluation + # on early checkpoints of gpt2, the generated response is almost always + # missing and it kills the GPU due to constant decoding (plus it swamps the logs) + + # logging.info(str(exception)) + # logging.info(self.tokenizer.decode(generated_ar)) + decoded = {"resp": [], "bspn": [], "aspn": []} + + turn["resp_gen"] = decoded["resp"] + turn["bspn_gen"] = turn["bspn"] if cfg.use_true_curr_bspn else decoded["bspn"] + turn["aspn_gen"] = turn["aspn"] if cfg.use_true_curr_aspn else decoded["aspn"] + turn["dspn_gen"] = turn["dspn"] + + # check DB results + # db_result = self.reader.bspan_to_DBpointer(self.tokenizer.decode(turn['bspn']), + # turn['turn_domain']) + # if db_result[0] == 1: # no match + # print('gt:', self.tokenizer.decode(turn['aspn']), ' + # |gen:', self.tokenizer.decode(decoded['aspn'])) + # print('gen_resp: ', self.tokenizer.decode(decoded['resp'])) + # print('gt_resp: ', self.tokenizer.decode(turn['resp']), '\n') + + # all true previous context + pv_turn["labels"] = inputs["labels"] + pv_turn["resp"] = turn["resp"] if cfg.use_true_prev_resp else decoded["resp"] + pv_turn["bspn"] = turn["bspn"] if cfg.use_true_prev_bspn else decoded["bspn"] + pv_turn["db"] = turn["db"] if cfg.use_true_curr_bspn else db + pv_turn["aspn"] = turn["aspn"] if cfg.use_true_prev_aspn else decoded["aspn"] + + turn_result = self.reader.inverse_transpose_turn(dialog) + result_collection.update(turn_result) + + for dialog, turns 
in turn_result.items(): + for turn in turns: + curr_turn_plain = [ + dialog, + turn["turn_num"], + turn["turn_domain"], + turn["pointer"], + ] + curr_turn_tokenised = [ + self.tokenizer.decode(turn[key]) + for key in turn.keys() + if key != "pointer" and key != "turn_domain" and key != "turn_num" + ] + curr_turn_data = curr_turn_plain + curr_turn_tokenised + val_table.add_data(*curr_turn_data) + + logging.info("inference time: {:.2f} min".format((time.time() - btm) / 60)) + # score + btm = time.time() + results, _ = self.reader.wrap_result_lm(result_collection) + bleu, success, match = self.evaluator.validation_metric(results) + logging.info("Scoring time: {:.2f} min".format((time.time() - btm) / 60)) + score = 0.5 * (success + match) + bleu + # valid_loss = 130 - score + logging.info( + "validation [CTR] match: %2.2f success: %2.2f bleu: %2.2f score: %.2f" % (match, success, bleu, score) + ) + eval_results = {} + eval_results["bleu"] = bleu + eval_results["success"] = success + eval_results["match"] = match + eval_results["score"] = score + eval_results["result"] = "validation [CTR] match: %2.2f success: %2.2f bleu: %2.2f score: %.2f" % ( + match, + success, + bleu, + score, + ) + + wandb.log( + { + "bleu": eval_results["bleu"], + "success": eval_results["success"], + "match": eval_results["match"], + "score": eval_results["score"], + } + ) + + model_setting, epoch_setting = ( + cfg.eval_load_path.split("/")[1], + cfg.eval_load_path.split("/")[2], + ) + eval_on = "-".join(cfg.exp_domains) + if data == "test": + eval_on += "_test" + if not os.path.exists(cfg.log_path): + os.mkdir(cfg.log_path) + log_file_name = os.path.join(cfg.log_path, model_setting + "-" + eval_on + ".json") + if os.path.exists(log_file_name): + eval_to_json = json.load(open(log_file_name, "r")) + eval_to_json[epoch_setting] = eval_results + json.dump(eval_to_json, open(log_file_name, "w"), indent=2) + else: + eval_to_json = {} + eval_to_json[epoch_setting] = eval_results + json.dump(eval_to_json, open(log_file_name, "w"), indent=2) + logging.info("update eval results to {}".format(log_file_name)) + + # log predictions table to wandb, giving it a name + test_data_at.add(val_table, "predictions") + wandb.run.log_artifact(test_data_at) + + if cfg.mode != "train": + # Mark the run as finished on wandb + wandb.finish() + + return eval_results + + def decode_generated_act_resp(self, generated): + """ + decode generated + return decoded['resp'] ('bspn', 'aspn') + """ + decoded = {} + eos_a_id = self.tokenizer.encode([""])[0] + eos_r_id = self.tokenizer.encode([""])[0] + # eos_b_id = self.tokenizer.encode([""])[0] + + # eos_r may not exists if gpt2 generated repetitive words. 
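+        # if eos_r_id never appears (e.g. the model degenerated into repetition), decode up to the last generated token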
+ if eos_r_id in generated: + eos_r_idx = generated.index(eos_r_id) + else: + eos_r_idx = len(generated) - 1 + # NOTE: the below logging is commented out because when running evaluation + # on early checkpoints of gpt2, the generated response is almost always missing + # and it kills the GPU due to constant decoding (plus it swamps the logs) + + # logging.info('eos_r not in generated: ' + + # self.tokenizer.decode(generated)) + + if cfg.use_true_curr_aspn: # only predict resp + decoded["resp"] = generated[: eos_r_idx + 1] + else: # predicted aspn, resp + eos_a_idx = generated.index(eos_a_id) + decoded["aspn"] = generated[: eos_a_idx + 1] + decoded["resp"] = generated[eos_a_idx + 1 : eos_r_idx + 1] + # if cfg.use_true_curr_bspn: + + # else: # predict bspn aspn resp + # eos_b_idx = generated.index(eos_b_id) + # eos_a_idx = generated.index(eos_a_id) + # decoded['bspn'] = generated[: eos_b_idx+1] + # decoded['aspn'] = generated[eos_b_idx+1: eos_a_idx+1] + # decoded['resp'] = generated[eos_a_idx+1: eos_r_idx+1] + return decoded + + def decode_generated_bspn(self, generated): + eos_b_id = self.tokenizer.encode([""])[0] + if eos_b_id in generated: + eos_b_idx = generated.index(eos_b_id) + else: + eos_b_idx = len(generated) - 1 + return generated[: eos_b_idx + 1] + + +def parse_arg_cfg(args): + # add args to cfg + if args.cfg: + for pair in args.cfg: + k, v = tuple(pair.split("=")) + dtype = type(getattr(cfg, k)) + if dtype == type(None): + raise ValueError() + if dtype is bool: + v = False if v == "False" else True + elif dtype is list: + v = v.split(",") + if k == "cuda_device": + v = [int(no) for no in v] + else: + v = dtype(v) + setattr(cfg, k, v) + return + + +def main(): + if not os.path.exists("./models/UBAR/experiments"): + os.mkdir("./models/UBAR/experiments") + + if not os.path.exists("./models/UBAR/experiments_21"): + os.mkdir("./models/UBAR/experiments_21") + + parser = argparse.ArgumentParser() + parser.add_argument("-mode") + parser.add_argument("-cfg", nargs="*") + args = parser.parse_args() + + cfg.mode = args.mode + if args.mode == "test" or args.mode == "adjust": + parse_arg_cfg(args) + # cfg.model_path = cfg.eval_load_path + cfg.gpt_path = cfg.eval_load_path + else: # train + + parse_arg_cfg(args) + if cfg.exp_path in ["", "to be generated"]: + # log file path, control the factors: seed, learning_rate, batch_size, + # early_stop_count, weight decay... 
cfg.exp_path = 'experiments/ + # {}_{}_sd{}_lr{}_bs{}_sp{}_dc{}/'.format('-'.join(cfg.exp_domains), + # cfg.exp_no, cfg.seed, cfg.lr, cfg.batch_size, + # cfg.early_stop_count, cfg.weight_decay_count) + + experiments_path = ( + "./models/UBAR/experiments" if "all" in cfg.exp_domains else "./models/experiments_Xdomain" + ) + cfg.exp_path = os.path.join( + experiments_path, + "{}_{}_sd{}_lr{}_bs{}_ga{}".format( + "-".join(cfg.exp_domains), + cfg.exp_no, + cfg.seed, + cfg.lr, + cfg.batch_size, + cfg.gradient_accumulation_steps, + ), + ) + logging.info("save path:", cfg.exp_path) + if cfg.save_log: + if not os.path.exists(cfg.exp_path): + os.mkdir(cfg.exp_path) + + # to gpt later + cfg.model_path = os.path.join(cfg.exp_path, "model.pkl") + cfg.result_path = os.path.join(cfg.exp_path, "result.csv") + cfg.vocab_path_eval = os.path.join(cfg.exp_path, "vocab") + cfg.eval_load_path = cfg.exp_path + + cfg._init_logging_handler(args.mode) + if cfg.cuda: + if len(cfg.cuda_device) == 1: + cfg.multi_gpu = False + # torch.cuda.set_device(cfg.cuda_device[0]) + device = torch.device("cuda:{}".format(cfg.cuda_device[0])) + else: + pass # multi-gpu + else: + device = torch.device("cpu") + # logging.info('Device: {}'.format(torch.cuda.current_device())) + + # fix random seed + torch.manual_seed(cfg.seed) + torch.cuda.manual_seed(cfg.seed) + random.seed(cfg.seed) + np.random.seed(cfg.seed) + + # initialize model + m = Model(device) + + if args.mode == "train": # train + if cfg.save_log: # save cfg details. + pass + m.train() + else: # test + logging.info( + "Generate setting: \n\t use true_prev_bspn={} \n\t use true_prev_aspn={} \n\t use true_db_pointer={} \ + \n\t use true_prev_resp={} \n\t use true_curr_bspn={} \n\t use true_curr_aspn={} \ + \n\t use_all_previous_context={}".format( + cfg.use_true_prev_bspn, + cfg.use_true_prev_aspn, + cfg.use_true_db_pointer, + cfg.use_true_prev_resp, + cfg.use_true_curr_bspn, + cfg.use_true_curr_aspn, + cfg.use_all_previous_context, + ) + ) + + logging.info("Running eval on test") + m.validate(cfg.eval_set) + logging.info("Evaluation finished") + + +if __name__ == "__main__": + main() diff --git a/scripts/agent_agent.yaml b/scripts/agent_agent.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/crazyneuraluser.egg-info/PKG-INFO b/scripts/crazyneuraluser.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..57801f654b703986ac333e7048f31b3f885685f2 --- /dev/null +++ b/scripts/crazyneuraluser.egg-info/PKG-INFO @@ -0,0 +1,171 @@ +Metadata-Version: 2.1 +Name: crazyneuraluser +Version: 0.0.post1.dev55+g3c295fb.d20220606 +Summary: Add a short description here! +Home-page: https://github.com/pyscaffold/pyscaffold/ +Author: Extended by Alistair McLeay, original code by Alexandru Coca +Author-email: am@alistairmcleay.com and alexcoca23@yahoo.co.uk +License: MIT +Project-URL: Documentation, https://pyscaffold.org/ +Platform: any +Classifier: Development Status :: 4 - Beta +Classifier: Programming Language :: Python +Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM +Provides-Extra: testing +License-File: LICENSE.txt +License-File: AUTHORS.md + +# Cambridge Masters Project +Joint Learning of Practical Dialogue Systems and User Simulators + +## Environment setup + +1. Create an environment `crazyneuraluser` with the help of [conda] + ``` + conda env create -f environment.yml + ``` +2. 
Activate the new environment with:
+   ```
+   conda activate crazyneuraluser
+   ```
+3. Install a version of `pytorch` compatible with your hardware (see the [pytorch website](https://pytorch.org/get-started/previous-versions/)), e.g.:
+   ```
+   pip install torch --extra-index-url https://download.pytorch.org/whl/cu113
+   ```
+
+4. Install `spacy` and download its English tokenization model:
+   ```
+   pip install spacy
+   python -m spacy download en_core_web_sm
+   ```
+
+### Generating dialogues through agent-agent interaction
+
+To generate dialogues, first change the working directory to the `baselines` directory. Run the command
+   ```
+   python baselines_setup.py
+   ```
+to prepare `convlab2` for running the baselines.
+
+#### Generating dialogues conditioned on randomly sampled goals
+
+Select one of the available configurations in the `configs` directory and run the command
+   ```
+   python simulate_agent_interaction.py --config /rel/path/to/chosen/config
+   ```
+to generate dialogues conditioned on goals sampled randomly from the `convlab2` goal model. The dialogues are saved automatically in the `models` directory, under a subdirectory whose name depends on the configuration run. The `models` directory is located in the parent directory of the `baselines` directory. The `metadata.json` file saved alongside the dialogues contains information about the data generation process.
+
+#### Generating dialogues conditioned on `MultiWOZ2.1` goals
+
+To generate the entire corpus, pass the `--goals-path /path/to/multiwoz2.1/data.json/file` flag to `simulate_agent_interaction.py`. To generate the `test/val` split, additionally pass the `--filter-path /path/to/multiwoz2.1/test-or-valListFile` argument. You can use the `generate_multiwoz21_train_id_file` function in `baselines/utils.py` to generate `trainListFile`, which can then be passed via the `--filter-path` argument to the dialogue generation script in order to generate dialogues conditioned on the `MultiWOZ2.1` training goals.
+
+### Converting the generated dialogues to SGD-like format
+
+The `create_data_from_multiwoz.py` script converts the generated dialogues to the SGD format required for evaluation. It is based on the script provided by Google for DSTC8, with additional functionality such as:
+
+  - conversion of the slot names annotated in the MultiWOZ 2.1 dialogue acts to different slot names, specified through the `--slots_convention` argument. The `multiwoz22` option converts the slots to those defined in the MultiWOZ 2.2 dataset, whereas `multiwoz_goals` converts the slot names to those used in the dialogue goal and state tracking annotations.
+
+  - addition of system and user `nlu` fields for every turn
+
+  - an option to perform cleaning operations on the goals so that the evaluator receives a standard format.
+
+The conversion is done according to the `schema.json` file in the `baselines` directory, which is the same as the one used for the `DSTC8` conversion except for the addition of the `police` domain. Type ``python create_data_from_multiwoz.py --helpfull`` to see a full list of flags and usage.
+
+## Installation
+
+The recommended way to use this repository is to develop the core code under `src/crazyneuraluser`. Experiments and exploratory analysis making use of the core package should live outside the library and import it. See more guidance under the [Project Organisation](#project-organization) section below.
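+
+A minimal sketch of this pattern (the script name is hypothetical; `global_config` is a symbol that already ships in the package):
+   ```
+   # scripts/my_experiment.py (illustrative): analysis code lives outside the library and imports the installed package
+   from crazyneuraluser.UBAR_code.config import global_config as cfg
+
+   print(cfg.gpt_path)
+   ```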
+ +To create an environment for the package, make sure you have deactivated all `conda` environments. Then: + +1. Create an environment `crazyneuraluser` with the help of [conda]: + ``` + conda env create -f environment.yml + ``` +2. Add the developer dependencies to this environment with the help of [conda]: + ``` + conda env update -f dev_environment.yml + ``` + +Optional and needed only once after `git clone`: + +3. install several [pre-commit] git hooks with: + ```bash + pre-commit install + # You _are encouraged_ to run `pre-commit autoupdate` + ``` + and checkout the configuration under `.pre-commit-config.yaml`. + The `-n, --no-verify` flag of `git commit` can be used to deactivate pre-commit hooks temporarily. + +4. install [nbstripout] git hooks to remove the output cells of committed notebooks with: + ```bash + nbstripout --install --attributes notebooks/.gitattributes + ``` + This is useful to avoid large diffs due to plots in your notebooks. + A simple `nbstripout --uninstall` will revert these changes. + +Then take a look into the `scripts` and `notebooks` folders. + +## Dependency Management & Reproducibility + +1. Always keep your abstract (unpinned) dependencies updated in `environment.yml` and eventually + in `setup.cfg` if you want to ship and install your package via `pip` later on. +2. Create concrete dependencies as `environment.lock.yml` for the exact reproduction of your + environment with: + ```bash + conda env export -n crazyneuraluser -f environment.lock.yml + ``` + For multi-OS development, consider using `--no-builds` during the export. +3. Update your current environment with respect to a new `environment.lock.yml` using: + ```bash + conda env update -f environment.lock.yml --prune + ``` +## Project Organization + +``` +├── AUTHORS.md <- List of developers and maintainers. +├── CHANGELOG.md <- Changelog to keep track of new features and fixes. +├── LICENSE.txt <- License as chosen on the command-line. +├── README.md <- The top-level README for developers. +├── configs <- Directory for configurations of model & application. +├── data +│ ├── external <- Data from third party sources. +│ ├── interim <- Intermediate data that has been transformed. +│ ├── processed <- The final, canonical data sets for modeling. +│ └── raw <- The original, immutable data dump. +├── docs <- Directory for Sphinx documentation in rst or md. +├── environment.yml <- The conda environment file for reproducibility. +├── models <- Trained and serialized models, model predictions, +│ or model summaries. +├── notebooks <- Jupyter notebooks. Naming convention is a number (for +│ ordering), the creator's initials and a description, +│ e.g. `1.0-fw-initial-data-exploration`. +├── pyproject.toml <- Build system configuration. Do not change! +├── references <- Data dictionaries, manuals, and all other materials. +├── reports <- Generated analysis as HTML, PDF, LaTeX, etc. +│ └── figures <- Generated plots and figures for reports. +├── scripts <- Analysis and production scripts which import the +│ actual Python package, e.g. train_model.py. +├── setup.cfg <- Declarative configuration of your project. +├── setup.py <- Use `pip install -e .` to install for development or +| or create a distribution with `tox -e build`. +├── src +│ └── crazyneuraluser <- Actual Python package where the main functionality goes. +├── tests <- Unit tests which can be run with `py.test`. +├── .coveragerc <- Configuration for coverage reports of unit tests. +├── .isort.cfg <- Configuration for git hook that sorts imports. 
+└── .pre-commit-config.yaml <- Configuration of pre-commit git hooks. +``` + + + +## Note + +This project has been set up using [PyScaffold] 4.0.1 and the [dsproject extension] 0.6.1. + +[conda]: https://docs.conda.io/ +[pre-commit]: https://pre-commit.com/ +[Jupyter]: https://jupyter.org/ +[nbstripout]: https://github.com/kynan/nbstripout +[Google style]: http://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings +[PyScaffold]: https://pyscaffold.org/ +[dsproject extension]: https://github.com/pyscaffold/pyscaffoldext-dsproject diff --git a/scripts/crazyneuraluser.egg-info/SOURCES.txt b/scripts/crazyneuraluser.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/crazyneuraluser.egg-info/dependency_links.txt b/scripts/crazyneuraluser.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/scripts/crazyneuraluser.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/scripts/crazyneuraluser.egg-info/not-zip-safe b/scripts/crazyneuraluser.egg-info/not-zip-safe new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/scripts/crazyneuraluser.egg-info/not-zip-safe @@ -0,0 +1 @@ + diff --git a/scripts/crazyneuraluser.egg-info/requires.txt b/scripts/crazyneuraluser.egg-info/requires.txt new file mode 100644 index 0000000000000000000000000000000000000000..76d0937d5858b90bcdf751972eda6e6fd4ee3243 --- /dev/null +++ b/scripts/crazyneuraluser.egg-info/requires.txt @@ -0,0 +1,15 @@ +transformers==4.18.0 +tqdm==4.64.0 +wandb==0.12.16 +nltk==3.7 +sklearn==0.0 +tensorboard==2.9.0 +spacy==3.3.0 + +[:python_version < "3.8"] +importlib-metadata + +[testing] +setuptools +pytest +pytest-cov diff --git a/scripts/crazyneuraluser.egg-info/top_level.txt b/scripts/crazyneuraluser.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..23688a9c911232254dc98bb735cd279b9fb842ba --- /dev/null +++ b/scripts/crazyneuraluser.egg-info/top_level.txt @@ -0,0 +1 @@ +crazyneuraluser diff --git a/scripts/simulate_interaction.py b/scripts/simulate_interaction.py new file mode 100644 index 0000000000000000000000000000000000000000..67790e2c714a73e85c56839787477f1b46ca178b --- /dev/null +++ b/scripts/simulate_interaction.py @@ -0,0 +1,171 @@ +import sys +import traceback +import pandas as pd + +# from tqdm import tqdm +from UBAR_code.interaction import UBAR_interact +from user_model_code.interaction import multiwoz_interact +from UBAR_code.interaction.UBAR_interact import bcolors + + +def instantiate_agents(): + + UBAR_checkpoint_path = "models/UBAR/experiments/distilgpt-2_sd11_lr0.0001_bs16_ga2/epoch50_trloss0.59_gpt2" + user_model_checkpoint_path = "models/user_model/MultiWOZ-full_checkpoint_step340k" + + sys_model = UBAR_interact.UbarSystemModel( + "UBAR_sys_model", UBAR_checkpoint_path, "scripts/UBAR_code/interaction/config.yaml" + ) + + user_model = multiwoz_interact.NeuralAgent( + "user", user_model_checkpoint_path, "scripts/user_model_code/interaction/config.yaml" + ) + + return sys_model, user_model + + +def read_multiwoz_data(): + """ + Read the multiwoz 2.0 raw data from the .json file + """ + raw_mwoz_20_path = "data/raw/UBAR/multi-woz/data.json" + df_raw_mwoz = pd.read_json(raw_mwoz_20_path) + return df_raw_mwoz + + +def load_test_val_lists(): + val_list_file = 
"data/raw/UBAR/multi-woz/valListFile.json" + test_list_file = "data/raw/UBAR/multi-woz/testListFile.json" + + with open(val_list_file, "r") as f: + val_list = f.readlines() + val_list = [x.strip() for x in val_list] + + with open(test_list_file, "r") as f: + test_list = f.readlines() + test_list = [x.strip() for x in test_list] + + return val_list, test_list + + +def main( + write_to_file=False, ground_truth_system_responses=False, train_only=True, n_dialogues="all", log_successes=False +): + sys_model, user_model = instantiate_agents() + + # TODO: move hardcoded vars into config file + raw_mwoz_20_path = "data/raw/UBAR/multi-woz/data.json" + user_utterances_out_path = "data/preprocessed/UBAR/user_utterances_from_simulator.txt" + logging_successes_path = "data/preprocessed/UBAR/logging_successes" + sys_model.print_intermediary_info = False + user_model.print_intermediary_info = False + + df_raw_mwoz = pd.read_json(raw_mwoz_20_path) + if n_dialogues == "all": + n_dialogues = len(df_raw_mwoz.columns) + + curr_dialogue_user_utterances_formatted = [] + + print("Loading goals...") + goals = multiwoz_interact.read_multiWOZ_20_goals(raw_mwoz_20_path, n_dialogues) + + # Write column headers + if write_to_file: + with open(user_utterances_out_path, "w") as f: + f.write("Dialogue #\tDialogue ID\tTurn #\tSystem Response\n") + + print("Loading data...") + df_mwoz_data = read_multiwoz_data() + val_list, test_list = load_test_val_lists() + + successful_dialogues = 0 + total_dialogues_generated = 0 # train dialogues only + for dialogue_idx, (goal, dialogue_filename) in enumerate(zip(goals, df_mwoz_data.columns)): + if log_successes: + # log successful_dialogues to logging_successes_path every 100 dialogues + if dialogue_idx % 100 == 0: + with open(logging_successes_path, "w") as f: + f.write(str(successful_dialogues) + " / " + str(total_dialogues_generated)) + + curr_dialogue_user_utterances_formatted = [] + if train_only: + if dialogue_filename in val_list or dialogue_filename in test_list: + continue + + total_dialogues_generated += 1 + print("Dialogue: {}".format(dialogue_filename)) + + # There are occasionally exceptions thrown from one of the agents, usually the user + # In this case we simply continue to the next dialogue + try: + # Reset state after each dialogue + sys_model.init_session() + user_model.init_session(ini_goal=goal) + sys_response = "" + + for turn_idx in range(50): + # Turn idx in this case represents the turn as one user utterance AND one system response + usr_response_raw_data_idx = turn_idx * 2 + sys_response_raw_data_idx = turn_idx * 2 + 1 + + user_utterance = user_model.response(sys_response) + print(bcolors.OKBLUE + "User: " + bcolors.ENDC + user_utterance) + + if write_to_file: + user_utterance = user_utterance.replace("\n", " ") + curr_dialogue_user_utterances_formatted.append( + str(dialogue_idx) + + "\t" + + dialogue_filename + + "\t" + + str(usr_response_raw_data_idx) + + "\t" + + user_utterance + + "\n" + ) + + if user_model.is_terminated(): + successful_dialogues += 1 + print(bcolors.OKCYAN + "Dialogue terminated successfully!" 
+ bcolors.ENDC) + print(bcolors.OKCYAN + "---" * 30 + bcolors.ENDC + "\n") + if write_to_file: + # Write whole dialogue to file + with open(user_utterances_out_path, "a") as f: + for line in curr_dialogue_user_utterances_formatted: + f.write(line) + break + + # Next turn materials + if ground_truth_system_responses: + # If we are at the end of the ground truth dialogues + if len(df_mwoz_data.iloc[:, dialogue_idx].log) <= sys_response_raw_data_idx: + print(bcolors.RED + "Dialogue terminated unsuccessfully!" + bcolors.ENDC) + print(bcolors.RED + "---" * 30 + bcolors.ENDC + "\n") + break + sys_response = df_mwoz_data.iloc[:, dialogue_idx].log[sys_response_raw_data_idx]["text"] + else: + sys_response = sys_model.response(user_utterance, turn_idx) + capitalised_sys_response = sys_response[0].upper() + sys_response[1:] + print(bcolors.GREEN + "System: " + bcolors.ENDC + capitalised_sys_response) + + except Exception: + print(bcolors.RED + "*" * 30 + bcolors.ENDC) + print(bcolors.RED + "Error in dialogue {}".format(dialogue_filename) + bcolors.ENDC) + print(bcolors.RED + "*" * 30 + bcolors.ENDC) + traceback.print_exc() + continue + + print("Successful dialogues: {}".format(successful_dialogues)) + print("Total dialogues: {}".format(n_dialogues)) + print("% Successful Dialopues: {}".format(successful_dialogues / n_dialogues)) + + +if __name__ == "__main__": + # TODO: move parameters to config file + # Fix the hacky mess below + ground_truth_system_responses = sys.argv[1] + if ground_truth_system_responses == "False": + ground_truth_system_responses = False + else: + ground_truth_system_responses = True + main(write_to_file=False, ground_truth_system_responses=ground_truth_system_responses) diff --git a/scripts/template_train_model.py b/scripts/template_train_model.py new file mode 100644 index 0000000000000000000000000000000000000000..9b21fa039c60e7dc7375ce6541f7005b3082a36b --- /dev/null +++ b/scripts/template_train_model.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python +import logging +import sys +from pathlib import Path + +import click +from IPython.core import ultratb + +import crazyneuraluser + +# fallback to debugger on error +sys.excepthook = ultratb.FormattedTB(mode="Verbose", color_scheme="Linux", call_pdb=1) +# turn UserWarning messages to errors to find the actual cause +# import warnings +# warnings.simplefilter("error") + +_logger = logging.getLogger(__name__) + + +@click.command() +@click.option( + "-c", + "--config", + "cfg_path", + required=True, + type=click.Path(exists=True), + help="path to config file", +) +@click.option("--quiet", "log_level", flag_value=logging.WARNING, default=True) +@click.option("-v", "--verbose", "log_level", flag_value=logging.INFO) +@click.option("-vv", "--very-verbose", "log_level", flag_value=logging.DEBUG) +@click.version_option(crazyneuraluser.__version__) +def main(cfg_path: Path, log_level: int): + logging.basicConfig( + stream=sys.stdout, + level=log_level, + datefmt="%Y-%m-%d %H:%M", + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + # YOUR CODE GOES HERE! 
Keep the main functionality in src/crazyneuraluser + # est = crazyneuraluser.models.Estimator() + + +if __name__ == "__main__": + main() diff --git a/scripts/user_model_code/__init__.py b/scripts/user_model_code/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/user_model_code/decode.sh b/scripts/user_model_code/decode.sh new file mode 100644 index 0000000000000000000000000000000000000000..8285489f8f4c9b3ab700c9418d68097541626d3a --- /dev/null +++ b/scripts/user_model_code/decode.sh @@ -0,0 +1,37 @@ +experiment=$1 +checkpoint=$2 + +if [[ "$experiment" == "SGD" ]]; then + echo "Conduct experiment with SGD dataset" + job_name='SGD-full' + data_list="sgd" # 165k training examples + eval_interval=50000 # evaluation interval + +elif [[ "$experiment" == "MultiWOZ" ]]; then + echo "Conduct experiment with MulwiWOZ dataset" + job_name='MultiWOZ-full' + data_list="multiwoz" # 56k training examples + eval_interval=20000 + +elif [[ "$experiment" == "Joint" ]]; then + echo "Conduct experiment with SGD + MulwiWOZ dataset" + job_name='Joint-full' + data_list="sgd multiwoz" # 221k training examples + eval_interval=70000 + +else + echo "Unrecognised argument" + exit +fi + +mkdir -p log decode +decode_file='decode/'$job_name'.json' +eye_browse_output=true # set to false for storing generation results in file + +python main.py --mode='testing' \ + --model_name=$job_name \ + --checkpoint=$checkpoint \ + --decode_file=$decode_file \ + --data_dir="processed_data" \ + --data_list=$data_list \ + --eye_browse_output=$eye_browse_output diff --git a/scripts/user_model_code/interaction/__init__.py b/scripts/user_model_code/interaction/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/user_model_code/interaction/config.yaml b/scripts/user_model_code/interaction/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d768ef7647980ec6eee24c4ef927e2503a5b8a21 --- /dev/null +++ b/scripts/user_model_code/interaction/config.yaml @@ -0,0 +1,12 @@ +model: + path: "./models/user_model/MultiWOZ-full_checkpoint_step340k" + goal_update: + finish_inform: "loose" # loose or strict + +schema_path: "scripts/user_model_code/interaction/schema.json" + +decode: + dec_max_len: 1024 + num_beams: 1 + temperature: 0.7 + do_sample: False diff --git a/scripts/user_model_code/interaction/multiwoz_interact.py b/scripts/user_model_code/interaction/multiwoz_interact.py new file mode 100644 index 0000000000000000000000000000000000000000..50895f836806ef19a8bf6630d23c6f21148aea8c --- /dev/null +++ b/scripts/user_model_code/interaction/multiwoz_interact.py @@ -0,0 +1,1034 @@ +import copy +import random +import re +import sys +import traceback +from typing import List + +import numpy as np +import pandas as pd +import torch +import transformers +from omegaconf import OmegaConf +from transformers import GPT2LMHeadModel, GPT2Tokenizer +from .utils import add_str, bcolors, find_segment, load_schema, segment_gen, wrap_element + + +class DummyPolicy: + def init_session(self, ini_goal): # noqa + self.goal = ini_goal # noqa + + def get_goal(self) -> dict: + """Returns current user goal. + + Notes + ----- + ``hasattr`` user works around the fact that ``convlab2`` initialises the dialogue session + before we can explicitly pass the goal to the user model. 
+ """ + if hasattr(self.goal, "domain_goals"): + return self.goal.domain_goals + # return {} + return self.goal # for consistency + + +def generation_func(model, input_ids, eos_id, dec_max_len): + """Generation method using greedy search for Transformer v2.x""" + + def _extend_mask(mask): + mask = torch.cat([mask, mask.new_ones((mask.shape[0], 1))], dim=-1) + return mask + + # input_ids, attention_mask, token_type_ids = batch['input_ids'], batch['attention_mask'], batch['token_type_ids'] + batch_size = input_ids.size(0) + attention_mask = torch.ones_like(input_ids) + past = None + finish_sent = [False for _ in range(batch_size)] + for i in range(dec_max_len): + logits, past = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=None).values() + + # logits: (B, T, V), T=1 when past is passed + next_token_logits = logits[:, -1, :] + next_token = torch.argmax(next_token_logits, dim=-1) + input_ids = torch.cat([input_ids, next_token.unsqueeze(-1)], dim=-1) + attention_mask = _extend_mask(attention_mask) + + for bs_idx, token_id in enumerate(next_token): + if finish_sent[bs_idx] is False and token_id.item() == eos_id: # first produce + finish_sent[bs_idx] = True + if sum(finish_sent) == batch_size: + break + return input_ids + + +class NeuralAgent: # crazyusermodel + def __init__(self, name: str, model_path: str, model_config_path: str): + """User Simulator + Description + --------- + A user model that is able to chat with the task-oriented dialogue system in an end-to-end manner + + Parameters + ---------- + name + Should indicate the role played by the agent. It should be always user + """ + + if name != "user": + raise ValueError(f"Expected name 'user' but got {name} instead.") + + # load necessities + self.set_device() + self.config = OmegaConf.load(model_config_path) + + self.print_intermediary_info = False + + # get schema, which is dependent to dataset, only for providing task description here + self.service2meta, self.schema_intents, self.schema_slots = load_schema(self.config["schema_path"]) + # self.load_checkpoint_and_tokenizer(self.config["model"]["path"]) + self.load_checkpoint_and_tokenizer(model_path) + self.load_materials() + + self.context = [] + self.current_goal = {} + self.behaviour_params = {} + self.input_action = [] # type: list[list[str]] + self.output_action = [] # type: list[list[str]] + + # for compatibility with convlab2 evaluator + self.policy = DummyPolicy() + + """ for reproduction """ + seed = 1130 + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.enabled = False + torch.backends.cudnn.benchmark = False + + def load_checkpoint_and_tokenizer(self, checkpoint_path: str) -> None: + """Load model checkpoint with the model tokenizer, only for GPT2 for now""" + print("Load model, tokenizer from {}".format(checkpoint_path)) + self.tokenizer = GPT2Tokenizer.from_pretrained(checkpoint_path) + self.model = GPT2LMHeadModel.from_pretrained(checkpoint_path) + self.model.to(self.device) + + def set_device(self) -> None: + """Set device to GPU/CPU""" + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + def load_materials(self): + """Load useful materials used in generation""" + # model attributes + """ + finish_inform + how strict to finish an informable slot in goal: "strict" or "loose" + if "strict": the attribute are finished (removed from goal) only if both slot and value are 
produced in act + if "loose": the attribute are finished if the slot is produced + """ + self.finish_inform = ( + self.config.model.goal_update.finish_inform + ) # controls how strict to eliminate informed slots in goal + assert self.finish_inform in ["strict", "loose"] + + # constants + self.bos_id, _, self.pad_id, self.sep_id = self.tokenizer.convert_tokens_to_ids( + ["", "", "", ""] + ) + self.bin_flags = {"true": "_True_", "false": "_False_"} + + self.supported_services = ["train", "attraction", "hotel", "restaurant", "taxi", "police", "hospital"] + + self.slot_types = {"search": "search", "book": "book"} + # important to change the corresponding act str name when using different tokenization methods, + # as they are used to control the user behaviours + self.const_act_str = { + "inform": "inform", + "recommend": "recommend", + "request": "request", + "fail_search": "no offer", + "fail_book": "no book", + } + + def prepare_input_ids(self, data: dict, start_token: str) -> str: + assert start_token in ["", ""] + input_seq = "" + for key in [ + "CTX", + "SYS_UTT", + "SYS_ACT", + "SNT", + "RA", + "GC", + "GOAL", + ]: # fixed order, consistent between training and inference + if key not in data: + continue + wrap = wrap_element(key, data[key]) + input_seq = add_str(input_seq, wrap) + + input_seq = add_str(input_seq, start_token) + if transformers.__version__.startswith("2."): # compatible with transformers v2.x used in convlab2 + input_ids = self.tokenizer.encode(input_seq) + else: + input_ids = self.tokenizer(input_seq)["input_ids"] # convert to ids + input_ids = torch.tensor([input_ids]).long().to(self.device) + return input_ids + + def update_internal_data(self, data: dict) -> None: + """Maintain context and user act in the format of generation string for the next turn generation""" + # update context + sys_utt_wrap = wrap_element("SYS", data["SYS_UTT"]) # e.g., Which area would you prefer? + usr_utt_wrap = wrap_element("USR", data["USR_UTT"]) # e.g., I want to be in the centre. 
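+        # The running context string simply accumulates the wrapped SYS/USR utterances turn by turn,
+        # newest last; it is fed back verbatim as the "CTX" field of the next turn's model input.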
+ self._context_str = add_str(self._context_str, sys_utt_wrap) + self._context_str = add_str(self._context_str, usr_utt_wrap) + + # update prev usr act + self._prev_usr_act = data["USR_ACT"] # e.g., inform area centre + + def run_inference_once(self, input_ids: torch.tensor, eos_id: int) -> List: + if transformers.__version__.startswith("2."): # compatible with transformers v2.x used in convlab2 + output = generation_func(self.model, input_ids, eos_id, self.config.decode.dec_max_len) + else: + output = self.model.generate( + input_ids, + max_length=self.config.decode.dec_max_len, + do_sample=self.config.decode.do_sample, + early_stopping=True, + temperature=self.config.decode.temperature, + use_cache=True, + num_beams=self.config.decode.num_beams, + bos_token_id=self.bos_id, + eos_token_id=eos_id, + pad_token_id=self.pad_id, + ) + return output + + def generate_whole_sequence(self, sys_utt: str) -> tuple: + # first forward pass: generate NLU output and three special flags ##### + data = {"CTX": self._context_str, "SYS_UTT": sys_utt} + start_token, end_token = "", "" + input_ids = self.prepare_input_ids(data, start_token) + eos_id = self.tokenizer.convert_tokens_to_ids(end_token) + output = self.run_inference_once(input_ids, eos_id) + generation = self.tokenizer.decode(output[0]) # decode back to str, including the fed context + + # parse first pass prediction + for key in ["SYS_ACT", "SNT", "GC", "RA"]: + value = find_segment(generation, key) + data[key] = value + + # update dynamic goal + if self.print_intermediary_info: + print("SYS ACT ->", data["SYS_ACT"]) + goal = self.prepare_turn_goal(self._prev_usr_act, data["SYS_ACT"], data["SNT"], data["GC"], data["RA"]) + data["GOAL"] = goal + + # second forward pass: generate dialogue act and NLG output ##### + start_token, end_token = "", "" + input_ids = self.prepare_input_ids(data, start_token) + eos_id = self.tokenizer.convert_tokens_to_ids(end_token) + output = self.run_inference_once(input_ids, eos_id) + generation = self.tokenizer.decode(output[0]) # decode back to str, including the fed context + + # parse second pass prediction + for key in ["USR_ACT", "USR_UTT"]: + value = find_segment(generation, key) + data[key] = value + return data, generation + + def _format_complete_goal(self, input_goal: dict) -> dict: + """Format the internal goal representation given a goal + + :param input_goal: a goal that the user has in mind + either from the corpus or sampled randomly in a valid way (e.g., correct slot names) + :returns: complete_goal: an internal representation of the given goal, a dict with the keys "intents", + "constraints" + intents: list[str], list of intents in the dialogue, aka scenario + constraints: dict, intent as key, in the following format + dict(intent: intent_constraints) + intent_constraints: {"informable": dict(slot: value_list), "requestable": slot_set} + each slot has a value list in case of failure of searching + """ + # TODO: make the order of services more flexible (how does convlab2 decide the service order?) 
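+        # Illustrative (made-up) example of the structure assembled below, for a goal with a single
+        # restaurant service whose first set of search constraints may fail:
+        #   complete_goal = {
+        #       "intents": ["find restaurant"],
+        #       "constraints": {
+        #           "find restaurant": {
+        #               "informable": {"food": ["eastern european", "chinese"], "area": ["south"]},
+        #               "requestable": {"address"},
+        #           }
+        #       },
+        #   }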
+ constraints = dict() + intents = [] + self.n_max_value = { + self.slot_types["book"]: 0, + self.slot_types["search"]: 0, + } # record the max length of value list of a slot + + for service in input_goal["ordered_services"]: + if service not in self.supported_services: + continue + + # record intent list (scenario), order matters + intent = self._map_service_to_intent(service) + assert intent not in intents and intent not in constraints + intents.append(intent) + constraints[intent] = {"informable": dict(), "requestable": set()} + + # collect informable slots + assert "info" in input_goal[service] # info has to exist + for key in ["fail_info", "info", "fail_book", "book"]: # order matters + # assert key in input_goal[service] + if key not in input_goal[service]: + continue + for slot, value in input_goal[service][key].items(): + self._add_info(constraints[intent]["informable"], slot, value) + + # collect requestable slots + key = "reqt" + # assert key in input_goal[service] + # for slot in input_goal[service][key]: + if key in input_goal[service]: + for slot in input_goal[service][key].keys(): + self._add_reqt(constraints[intent]["requestable"], slot) + + # order intents by the order they are dealt with in the data so + # if using ground truth system responses the right order of the intents + # is preserved + + complete_goal = {"intents": intents, "constraints": constraints} + return complete_goal + + def _init_user_status(self) -> dict: + """Initialise user status with intent and constraint + intent_idx: int, the index of current intent + constraint_idx: dict, intent as key, value is the constraint index used to record which value is used + in the slot value list + :return: + """ + intent_idx = 0 # -1 + # constraint_idx = {intent: 0 for intent in self.complete_goal["intents"]} + constraint_idx = { + intent: {self.slot_types["search"]: 0, self.slot_types["book"]: 0} + for intent in self.complete_goal["intents"] + } + # TODO: entity provide records, one of the criteria to move to the next intents + entity_provided = {intent: False for intent in self.complete_goal["intents"]} + return { + "intent_idx": intent_idx, + "constraint_idx": constraint_idx, + "dialogue_terminate": False, + "entity_provided": entity_provided, + } + + def _get_scenario_str(self) -> None: + """Get a scenario str from a intent list + + Description + convert a list of intents, aka scenario, into string with special marks + the scenario is determined at the start of dialogue and static during interaction + """ + intents = self.complete_goal["intents"] + _str = [wrap_element("INTENT", intent) for intent in intents] + _str = " ".join(_str) + self.scenario_str = wrap_element("SCENARIO", _str) + + def _prepare_current_constraints( + self, + involved_intents: List[str], + involved_slot_types: List[str], + if_reset_reqt: bool, + ) -> None: + """Prepare the current constraints, copied the specified content from the complete goal + + the current constraints is used as condition in the model generation + its content comes from the "constraints" in "complete goal", + but the current constraints only allows one value for a slot at a time + the value is chosen from the value list by the "constraint_idx" in user status + + :param involved_intents: list[str], intent list + :return: + current_constraints: dict, similar format as constraints in the complete goal, + but a slot has only one value, e.g., + dict(intent: intent_constraints) + intent_constraints: {"informable": dict(slot: value), "requestable": slot_set} + """ + # iterate the 
involved intents + for intent in involved_intents: + constraints = {"informable": dict(), "requestable": set()} + # informable slots value pairs + for slot, value_list in self.complete_goal["constraints"][intent]["informable"].items(): + slot_type = self._get_slot_type(slot) + if slot_type not in involved_slot_types: + continue + value_idx = self.user_status["constraint_idx"][intent][slot_type] + if value_idx < len(value_list): + value = value_list[value_idx] + constraints["informable"][slot] = value + + # requestable + if if_reset_reqt: + constraints["requestable"] = copy.deepcopy(self.complete_goal["constraints"][intent]["requestable"]) + else: + constraints["requestable"] = copy.deepcopy(self.current_constraints[intent]["requestable"]) + + # overwrite intent constraints + self.current_constraints[intent] = constraints + + @staticmethod + def _map_intent_to_service(intent: str) -> str: + # TODO: make it not dataset dependent? + """map an intent into a service, multiwoz only""" + return intent.split()[1] + + @staticmethod + def _map_service_to_intent(service: str) -> str: + # TODO: make it not dataset dependent? + """map a service into an intent, multiwoz only""" + return f"find {service}" + + def _get_slot_type(self, slot: str) -> str: + """return search or book type of a slot""" + slot_type = "book" if "book" in slot else "search" + assert slot_type in self.slot_types.keys() + return slot_type + + def _get_goal_str(self, intent: str) -> str: + """prepare the proper goal sequence, same as used in training""" + goal_str = "" + # dialogue scenario + goal_str = add_str(goal_str, self.scenario_str) + + # current task + goal_str = add_str(goal_str, wrap_element("TASK", intent)) + + # task description + service = self._map_intent_to_service(intent) + description = self.service2meta[service]["intents"][intent]["description"] + goal_str = add_str(goal_str, wrap_element("DESC", description)) + + # intent_constraints = self.dynamic_constraints[intent] + intent_constraints = self.current_constraints[intent] + # informable slots + info_str = "" + # for slot, value in intent_constraints["informable"].items(): + for slot in sorted(intent_constraints["informable"].keys()): # sort by slot + value = intent_constraints["informable"][slot] + info_str = add_str(info_str, wrap_element("SLOT", slot)) + info_str = add_str(info_str, wrap_element("VALUE", value)) + goal_str = add_str(goal_str, wrap_element("INFORM", info_str)) + + # requestable slots + req_str = "" + for slot in sorted(list(intent_constraints["requestable"])): + req_str = add_str(req_str, wrap_element("SLOT", slot)) + goal_str = add_str(goal_str, wrap_element("REQUEST", req_str)) + return goal_str.strip() + + def _start_new_intent(self, SNT_flag: str) -> bool: + """decide whether to start a new intent""" + # SNT (start new task) is predicted as on + assert SNT_flag in list(self.bin_flags.values()) + # intent = self.intents[self.intent_idx] + intent = self.complete_goal["intents"][self.user_status["intent_idx"]] + + # TODO: need at least an entity provided (not really sure... + # if not self.intent_entity_provided[intent]: # no entities provided in the intent yet + # return False + + # TODO: think about the priority of SNT prediction. It's should be less prioritised than + # the number of left constraints. 
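+        # As implemented below, a new task only starts once both the informable and the requestable
+        # constraints of the current intent are exhausted; e.g. (illustrative) a goal that still has
+        # {"area": "south"} left to inform keeps the current intent even if the model predicts SNT = _True_.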
+ # if SNT_flag == self.bin_flags["true"]: # model prediction in first turn is true + # return True + + # current intent has empty constraints + if ( + len(self.current_constraints[intent]["informable"]) == 0 + and len(self.current_constraints[intent]["requestable"]) == 0 + ): + return True + return False + + def _check_entity_provided(self, sys_act, intent): + # TODO: + """Check if an entity provided in system response (act)""" + assert intent in [ + "find restaurant", + "find hotel", + "find attraction", + "find train", + "find taxi", + "find police", + "find hospital", + ] + if intent in ["find restaurant", "find hotel", "find attraction"]: + if " name " in sys_act: + self.intent_entity_provided[intent] = True + elif intent == "find train": + if " train id " in sys_act: + self.intent_entity_provided[intent] = True + else: # taxi + if " type " in sys_act: + self.intent_entity_provided[intent] = True + + def _activate_dialogue_terminate(self) -> None: + """Turn on the user status about dialogue termination""" + self.user_status["dialogue_terminate"] = True + + def prepare_turn_goal(self, prev_usr_act: str, sys_act: str, SNT_flag: str, GC_flag: str, RA_flag: str) -> str: + """prepare the goal sequence for the current turn""" + # TODO: more detailed instruction here + # TODO: Deal with empty intents (and figure out why they happen) + intent = self.complete_goal["intents"][self.user_status["intent_idx"]] + + # TODO: check if at least one entity is provided in system act + # First thing to do, check if the system provides an entity + # self._check_entity_provided(sys_act, intent) + + # update goal first then check if moves to next intent (task) + self._update_current_constraints(intent, "usr", prev_usr_act, sys_act) + self._update_current_constraints( + intent, "sys", prev_usr_act, sys_act + ) # impact of sys_act overwrites that of usr_act + + # check if new intent starts + if self._start_new_intent(SNT_flag): + self.user_status["intent_idx"] += 1 + if self.user_status["intent_idx"] < len(self.complete_goal["intents"]): + intent = self.complete_goal["intents"][self.user_status["intent_idx"]] + else: + self._activate_dialogue_terminate() + # TODO: request alternative by setting for sgd + # TODO: sample new goal if goal change for sgd + goal_str = self._get_goal_str(intent) + + # print("***** user status *****\n->", self.user_status, "\n") + # print("***** current intent *****\n->", intent, "\n") # BACK + # print("***** current intent constraint *****\n->", self.current_constraints[intent], "\n") + # print("***** corresponding goal str *****\n->", goal_str, "\n") + # print("***** current entities provided (empty) *****\n->", self.intent_entity_provided, "\n") + return goal_str + + def _use_next_constraints(self, intent: str, slot_type: str) -> None: + """move the constraint pointer to the next""" + # Another problem is that how to decide which slot type (search or book) to add when failure? + # one solution is that dont use act mapping to keep NoOffer and NoBook separate, if so, try use nl on act + self.user_status["constraint_idx"][intent][slot_type] += 1 + if self.user_status["constraint_idx"][intent][slot_type] >= self.n_max_value[slot_type]: + # TODO: ask Alex, usually how to deal with this warning case? 
And make it as warning rather than just print + print( + f"Failure times on {slot_type} is more than the given value candidates, \ + no new value to choose as alternative" + ) + print("A valid goal should not enter here!") + self.user_status["constraint_idx"][intent][slot_type] = ( + self.n_max_value[slot_type] - 1 + ) # let user use last values as they are supposed to be fine + + def _update_current_constraints(self, intent: str, spk: str, usr_act: str, sys_act: str) -> None: + # TODO: complete instruction here + """Update current constraints used for generation based on either previous usr or sys act + + :param act: + :param spk: + :param intent: + :return: + """ + assert spk in ["usr", "sys"] + # act_dict = parse_act(act) + intent_constraints = self.current_constraints[intent] + + if spk == "sys": + act_dict = self.parse_act(sys_act, self.print_intermediary_info) + # When the system provides information (in the act_dict) then remove it from the user's + # requestable constraints as the user has been provided with the info! + # NB: This was added by Alistair after the original code was shared by Andy, + # as it seems the original implementation missed this critical step. + if self.const_act_str["inform"] in act_dict: + for slot in act_dict[self.const_act_str["inform"]]: + if slot in intent_constraints["requestable"]: + intent_constraints["requestable"].remove(slot) + elif self.const_act_str["recommend"] in act_dict: + for slot in act_dict[self.const_act_str["recommend"]]: + if slot in intent_constraints["requestable"]: + intent_constraints["requestable"].remove(slot) + + # when the system informs failure (search or book), use next set of constraints given in goal ##### + # if "_NOTIFY_FAILURE_" in act_dict: + if self.const_act_str["fail_search"] in act_dict: + slot_type = self.slot_types["search"] + self._use_next_constraints(intent, slot_type) + keep_slot_types = [ + self.slot_types["search"], + self.slot_types["book"], + ] # still in search phase, book slots should be kept + self._prepare_current_constraints( + [intent], keep_slot_types, if_reset_reqt=False + ) # only change constraints for this intent + + elif self.const_act_str["fail_book"] in act_dict: + slot_type = self.slot_types["book"] + self._use_next_constraints(intent, slot_type) + keep_slot_types = [self.slot_types["book"]] # already found entities, no need to keep search slots + self._prepare_current_constraints([intent], keep_slot_types, if_reset_reqt=False) + + # when the system request # + elif self.const_act_str["request"] in act_dict: + requested_slots = act_dict[self.const_act_str["request"]] + for slot in requested_slots.keys(): + # requested slot in current constraint, do nothing + if slot in intent_constraints["informable"].keys(): + continue + + # slots that are beyond the current goal enter the following section + # case 1: requested slot in the complete goal, + # this should be entered if the system requests the informed slots + # if slot in self.complete_constraints[intent]["informable"].keys(): + # value = self.complete_constraints[intent]["informable"][slot] + if ( + slot in self.complete_goal["constraints"][intent]["informable"].keys() + ): # dict of slot to value_list + slot_type = self._get_slot_type(slot) + value_idx = self.user_status["constraint_idx"][intent][slot_type] + value = self.complete_goal["constraints"][intent]["informable"][slot][value_idx] + + # case 2: requested slot not in the complete goal, set the value to "dontcare" + # can sample a new value here for more interesting interactions + 
else: + value = "dontcare" # "no preference" # TODO: play around to see nlg output + intent_constraints["informable"][slot] = value + + else: # usr + act_dict = self.parse_act(usr_act, self.print_intermediary_info) + # remove informed slot/value pair, if informed # + if self.const_act_str["inform"] in act_dict: + for slot, value_list in act_dict[self.const_act_str["inform"]].items(): + # value = value_list[0] + for value in value_list: # possible to have multi-value slots in user act in corpus + if self.finish_inform == "loose" and slot in intent_constraints["informable"]: + del intent_constraints["informable"][slot] + if ( + self.finish_inform == "strict" + and slot in intent_constraints["informable"] + and value == intent_constraints["informable"][slot] + ): + del intent_constraints["informable"][slot] + + # remove requested slot, if requested + if self.const_act_str["request"] in act_dict: + sys_act_dict = self.parse_act(sys_act, self.print_intermediary_info) # auxiliary check + for slot in act_dict[self.const_act_str["request"]].keys(): + # if slot in intent_constraints["requestable"]: # one choice + if self.const_act_str["inform"] in sys_act_dict: + if ( + slot in intent_constraints["requestable"] + and slot in sys_act_dict[self.const_act_str["inform"]].keys() + ): # another choice, more strict + intent_constraints["requestable"].remove(slot) + + def _add_info(self, slot_to_value_list, slot, value) -> None: + # print(slot) + # assert slot in self.schema_slots # SLOT_FORMAT + # constraints[intent]["informable"][slot] = value + if slot not in slot_to_value_list: + slot_to_value_list[slot] = [] + # assert value not in slot_to_value_list[slot] + if value not in slot_to_value_list[slot]: + slot_to_value_list[slot].append(value) + slot_type = self._get_slot_type(slot) + if len(slot_to_value_list) > self.n_max_value[slot_type]: + self.n_max_value[slot_type] = len(slot_to_value_list) + + def _add_reqt(self, slot_set, slot) -> None: + # assert slot in self.schema_slots # SLOT_FORMAT + # constraints[intent]["requestable"].add(slot) + slot_set.add(slot) + + def _validate_input_goal(self): + """validate the input goal""" + # TODO: finish the method + # assert all([intent in self.schema_intents for intent in intents]) # ensure intents are in schema + pass + + @staticmethod + def parse_act(act_seq: str, print_intermediary_info: bool) -> dict: + """parse usr/sys act string into dict(act: {slot=value_list}) (slots in act_request have '_Empty_' value)""" + act_dict = {} + assert isinstance(act_seq, str) + act_seq = act_seq.split("") + for act_seg in act_seq: + if act_seg == "": + continue + + act_seg = act_seg.strip() # remove space at the start/end + act_seg = act_seg.split() + # get act in special token format # + # act = act_seg[0] # e.g., _INFORM_, _REQUEST_ + # assert act[0] == "_" and act[-1] == "_" + # act_seg = " ".join(act_seg[2:]) # discard first two tokens, "_ACT_ " + + # get act in natural language format # + end_idx = act_seg.index("") + act = " ".join(act_seg[:end_idx]) + act_seg = " ".join(act_seg[end_idx + 1 :]) # act arguments (slot/value pairs) + # print(f"act: {act}\n", act_seg, "\n") + + assert act not in act_dict + act_dict[act] = {} + + # Sometimes the model bugs out and puts or where there should be or + if "ACT" in act_seg: + continue + + for sv_seg in act_seg.split(""): + if sv_seg == "": + continue + + try: + sv_seg = sv_seg.replace("", "") + sv_seg = sv_seg.strip() # remove spaces at begin and end + # print("|{}|".format(sv_seg)) + slot, value = sv_seg.split(" ") + slot, 
value = slot.strip(), value.strip() + # print("act: |{}|, slot: |{}|, value: |{}|".format(act, slot, value)) + # one slot one value + # act_dict[act][slot] = value + # one slot, multi-value is possible by system + if slot not in act_dict[act]: + act_dict[act][slot] = [] + if value not in act_dict[act][slot]: + act_dict[act][slot].append(value) + + except Exception: + if print_intermediary_info: + print( + bcolors.YELLOW + + "!The User Agent got messed up the intermediate syntax! Exception:" + + bcolors.ENDC + ) + traceback.print_exc() + continue + + # print(act_dict) + return act_dict + + def convert_into_system_act_format(self): + # TODO + pass + + # below methods need be implemented for convlab-2 to work # + def init_session(self, **kwargs): + """Use this method to reset the agent state after each dialogue, if necessary. + This gets called before each dialogue. + + Examples + -------- + In `simulate_corpus_interaction.py` you will see that this is used, for example, to pass + the dialogue to the corpus agent so it knows what to talk about. + + An example here would be to reset the dialogue context. + """ + # dialogue goal in MultiWOZ2.1-like format + self.current_goal = kwargs.get("ini_goal", {}) + self.policy.init_session(ini_goal=self.current_goal) + self.current_goal = self.policy.get_goal() + # TODO: ANYTHING ELSE THAT NEEDS TO HAPPEN BEFORE EACH DIALOGUE? + self.context = [] + self.input_action = [] + self.output_action = [] + + # init internal data + self._context_str = "" # context string with special tags used in generation + self._prev_usr_act = "" # user act string used in generation + + # goal process + self.complete_goal = self._format_complete_goal(self.current_goal) + self.user_status = self._init_user_status() + self._get_scenario_str() + self.current_constraints = {} # init + self._prepare_current_constraints( + self.complete_goal["intents"], + list(self.slot_types.keys()), + if_reset_reqt=True, + ) + + # print("input goal:\n", self.current_goal, "\n") + # print("complete goal:\n", self.complete_goal, "\n") + # print("current constraints:\n", self.current_constraints, "\n") + # sys.exit(1) + + def response(self, sys_utterance: str) -> str: + """Generate natural language response given the system response. + + Parameters + --------- + sys_utterance + Last system utterance. For first turn, sys_utterance is the empty string. + + Returns + ------- + response + A natural language response. + + """ + + # TODO: MAKE APPROPRIATE USE OF THE HISTORY, BEHAVIOUR_PARAMS, CURRENT_GOAL, UPDATE_GOAL TO GENERATE A RESPONSE + # TODO: DON'T FORGET TO UPDATE INPUT AND OUTPUT ACTIONS STATES + # response = "I want Italian." + gen_parse, gen_str = self.generate_whole_sequence(sys_utterance) + self.update_internal_data(gen_parse) # prepare for next turn + if self.print_intermediary_info: + segment_gen(gen_str, "example dialogue") # crazyusermodel + # TODO: update lists of context, da_in, da_out here + return gen_parse["USR_UTT"] + + def get_in_da(self) -> List[List[str]]: + """Used by clients to retrieve the user model NLU. + + Returns + ------- + NLU output, assumed to be a list of lists, each formatted as:: + + [[intention, domain, slot, value], ...] + + Here ``intention`` refers to a dialogue act and the ``intention``, ``domain`` and ``slot`` strings should + follow the same convention as the corpus dialogue act annotations (i.e., capitalised, and using the correct + set of slot names). 
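+
+        An illustrative (made-up) entry would be ``["Inform", "Restaurant", "Food", "italian"]``.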
+ """ + return self.input_action + + def get_out_da(self) -> List[List[str]]: + """Used by clients to retrieve the user model policy output. + + Returns + ------- + Policy output, following the same convention as the NLU output. + """ + return self.output_action + + def get_reward(self) -> float: + """Dummy method, used for API consistency.""" + return -1 + + def is_terminated(self) -> bool: + """This should tell an external client whether the user model considers they have completed the task.""" + # return False + return self.user_status["dialogue_terminate"] + + +def parse_complete_gen(gen): + """parse the complete generation output, return predictions of system act, user act and user utterance""" + output = {} + for key in ["SYS_ACT", "SNT", "GC", "RA", "USR_ACT", "USR_UTT"]: + value = find_segment(gen, key) + output[key] = value + # print("***** complete generation output *****\n->", gen, "\n") # BACK + # print("***** parse output *****\n->", output, "\n") + return output + + +def generate_example_goal() -> dict: + """create an example goal for testing""" + # {service: service_meta}, + # service_mate: {"info": {slot: value}, "fail_info": {slot: value}, + # "book": {slot}: value, "fail_book": {slot: value}, "reqt": set(slot)} + goal = {} + services = ["restaurant", "hotel"] + # services = ["train", "attraction"] + # services = ["restaurant"] + + # # restaurant + service = services[0] + goal[service] = {} + goal[service]["fail_info"] = {"food": "eastern european", "area": "south", "price range": "expensive"} + goal[service]["info"] = {"food": "chinese", "area": "south", "price range": "cheap"} + goal[service]["fail_book"] = {} + goal[service]["book"] = {"book day": "monday", "book people": "8", "book time": "13:15"} + goal[service]["reqt"] = {"address": "?"} + + # hotel + service = services[1] + goal[service] = {} + goal[service]["fail_info"] = {"stars": "3", "price range": "cheap", "area": "centre", "internet": "_True_"} + goal[service]["info"] = {"stars": "5", "price range": "expensive", "area": "centre", "internet": "_True_"} + goal[service]["fail_book"] = {"book day": "sunday", "book stay": 3, "book people": 2} + goal[service]["book"] = {"book day": "monday", "book stay": 1, "book people": 2} + goal[service]["reqt"] = {"phone": "?", "postcode": "?"} + + # # train + # service = services[1] + # goal[service] = {} + # goal[service]["info"] = { + # "destination": "ely", + # "day": "monday", + # "arrive by": "19:00", + # "departure": "cambridge", + # "book people": "8" + # } + # goal[service]["reqt"] = {"duration": "?", "leave at": "?", "train id": "?"} + + # # attraction + # service = services[1] + # goal[service] = {} + # goal[service]["info"] = { + # "type": "college", + # "area": "west" + # } + # goal[service]["reqt"] = {"phone": "?", "postcode": "?"} + + # taxi + # service = services[0] + # goal[service] = {} + # goal[service]["info"] = { + # "arrive by": "17:30", + # "departure": "city stop restaurant", + # "destination": "the cambridge punter" + # } + # goal[service]["reqt"] = {"phone": "?", "type": "?"} + # more services... 
+    return goal
+
+
+def set_sorted_services_for_current_goal(goal, goal_idx, df_raw_mwoz):
+    # Get the list of services in the goal as they appear in the data so they can be processed correctly
+
+    current_dialogue_services = []
+    for service_name in goal:
+        current_dialogue_services.append(service_name)
+
+    message = df_raw_mwoz.iloc[:, goal_idx].goal["message"]
+
+    ordered_current_dialogue_services = []
+
+    for instruction in message:
+        instruction_split = re.split(" |<|>", instruction)
+        for word in instruction_split:
+            if word in current_dialogue_services:
+                ordered_current_dialogue_services.append(word)
+                current_dialogue_services.remove(word)
+
+    # Make sure any words not mentioned in the message (e.g. it happens for 'police' in the second goal) are not missed
+    for word in current_dialogue_services:
+        if word not in ordered_current_dialogue_services:
+            ordered_current_dialogue_services.append(word)
+
+    return ordered_current_dialogue_services
+
+
+def read_multiWOZ_20_goals(file_path, n_goals):
+    df_raw_mwoz = pd.read_json(file_path)
+
+    goals = []
+    for i in range(n_goals):
+        parsed_goal = {}
+        goal = df_raw_mwoz.iloc[:, i].goal
+
+        # Determine relevant keys
+        for _ in goal.keys():
+            relevant_goals = {k: v for k, v in goal.items() if v != {} and k != "topic" and k != "message"}
+            services = [key for key in relevant_goals.keys()]
+            for service in services:
+                parsed_goal[service] = relevant_goals[service]
+
+        ordered_services = set_sorted_services_for_current_goal(parsed_goal, i, df_raw_mwoz)
+        parsed_goal["ordered_services"] = ordered_services
+
+        # Update the format of those relevant keys to match the format of this code
+        for service in parsed_goal.keys():
+            if service == "ordered_services":
+                continue
+
+            for service_key, service_value in parsed_goal[service].items():
+
+                # Handle 'reqt' key which is a list. Convert it to a dict. (and do the same for similar keys).
+                if type(parsed_goal[service][service_key]) is list and parsed_goal[service][service_key] != []:
+                    replacement_dict = {}
+                    for item in parsed_goal[service][service_key]:
+                        replacement_dict[item] = "?"
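+                    # e.g. (illustrative) a 'reqt' list like ["address", "phone"] becomes {"address": "?", "phone": "?"}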
+ parsed_goal[service][service_key] = replacement_dict + + # Handle 'hotel' key which has a string value + # with the name of hotel - or other similar situations + elif type(parsed_goal[service][service_key]) is str: + continue + + # Make sure the dictionary we are adding is not empty + if not parsed_goal[service][service_key]: + continue + + # Remove any attributes that are "invalid" or "preinvalid" + # Also check if 'arriveBy' or 'leaveAt' or 'pricerange' is inside the attributes of the service_key + # If so reformat it according to the code in this file + + list_of_attribute_keys = [k for k in parsed_goal[service][service_key].keys()] + for k in list_of_attribute_keys: + if k == "invalid" or k == "pre_invalid": + parsed_goal[service][service_key].pop(k) + if k == "arriveBy": + parsed_goal[service][service_key]["arrive by"] = parsed_goal[service][service_key].pop(k) + elif k == "leaveAt": + parsed_goal[service][service_key]["leave at"] = parsed_goal[service][service_key].pop(k) + elif k == "pricerange": + parsed_goal[service][service_key]["price range"] = parsed_goal[service][service_key].pop(k) + elif k == "car type": + parsed_goal[service][service_key]["type"] = parsed_goal[service][service_key].pop(k) + elif k == "trainID": + parsed_goal[service][service_key]["train id"] = parsed_goal[service][service_key].pop(k) + + # Check if "book" is in the service info dict ("book" or "fail_book") then prepend + # 'book' to the keys inside the service (the attributes of the service) + if "book" in service_key: + list_of_attribute_keys = [k for k in parsed_goal[service][service_key].keys()] + for k in list_of_attribute_keys: + parsed_goal[service][service_key]["book {}".format(k)] = parsed_goal[service][service_key].pop( + k + ) + + # If True or False is in service.values, convert to "_True_" or "_False_" + for k, v in parsed_goal[service][service_key].items(): + if v is True: + parsed_goal[service][service_key][k] = "_True_" + elif v is False: + parsed_goal[service][service_key][k] = "_False_" + + goals.append(parsed_goal) + + return goals + + +def interact(checkpoint_path): + user_model = NeuralAgent("user", checkpoint_path, "scripts/user_model_code/interaction/config.yaml") + + # TODO: fix the hardcoded variables here + file_path = "data/raw/UBAR/multi-woz/data.json" + user_model.print_intermediary_info = True + n_goals = 50 + + for dialogue_number, goal in enumerate(read_multiWOZ_20_goals(file_path, n_goals)): + try: + # goal = generate_example_goal() + user_model.init_session(ini_goal=goal) + sys_utt = "" + + for turn_id in range(100): + user_model.response(sys_utt) + + if user_model.is_terminated(): + print("Dialogue terminates!") + break + + # next turn materials + sys_utt = input("Enter system response here: ") + if sys_utt == "Goodbye": + break + + except Exception: + print("Error in dialogue {}".format(dialogue_number)) + traceback.print_exc() + continue + + +if __name__ == "__main__": + if len(sys.argv) == 1: + print("Wrong argument!") + print("Usage: python multiwoz_interact.py checkpoint_path") + sys.exit(1) + + checkpoint_path = sys.argv[1] + interact(checkpoint_path) diff --git a/scripts/user_model_code/interaction/schema.json b/scripts/user_model_code/interaction/schema.json new file mode 100644 index 0000000000000000000000000000000000000000..213881a6deb1d4aa7226f2f7969ad535e8aeb445 --- /dev/null +++ b/scripts/user_model_code/interaction/schema.json @@ -0,0 +1,712 @@ +[ + { + "service_name": "hotel", + "slots": [ + { + "name": "hotel-pricerange", + "description": "price budget of 
the hotel", + "possible_values": [ + "expensive", + "cheap", + "moderate" + ], + "is_categorical": true + }, + { + "name": "hotel-type", + "description": "what is the type of the hotel", + "possible_values": [ + "guesthouse", + "hotel" + ], + "is_categorical": true + }, + { + "name": "hotel-parking", + "description": "whether the hotel has parking", + "possible_values": [ + "free", + "no", + "yes" + ], + "is_categorical": true + }, + { + "name": "hotel-bookday", + "description": "day of the hotel booking", + "possible_values": [ + "monday", + "tuesday", + "wednesday", + "thursday", + "friday", + "saturday", + "sunday" + ], + "is_categorical": true + }, + { + "name": "hotel-bookpeople", + "description": "number of people for the hotel booking", + "possible_values": [ + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8" + ], + "is_categorical": true + }, + { + "name": "hotel-bookstay", + "description": "length of stay at the hotel", + "possible_values": [ + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8" + ], + "is_categorical": true + }, + { + "name": "hotel-stars", + "description": "star rating of the hotel", + "possible_values": [ + "0", + "1", + "2", + "3", + "4", + "5" + ], + "is_categorical": true + }, + { + "name": "hotel-internet", + "description": "whether the hotel has internet", + "possible_values": [ + "free", + "no", + "yes" + ], + "is_categorical": true + }, + { + "name": "hotel-name", + "description": "name of the hotel", + "possible_values": [], + "is_categorical": false + }, + { + "name": "hotel-area", + "description": "area or place of the hotel", + "possible_values": [ + "centre", + "east", + "north", + "south", + "west" + ], + "is_categorical": true + }, + { + "name": "hotel-address", + "description": "address of the hotel", + "is_categorical": false + }, + { + "name": "hotel-phone", + "description": "phone number of the hotel", + "is_categorical": false + }, + { + "name": "hotel-postcode", + "description": "postal code of the hotel", + "is_categorical": false + }, + { + "name": "hotel-ref", + "description": "reference number of the hotel booking", + "is_categorical": false + } + ], + "description": "hotel reservations and vacation stays", + "intents": [ + { + "name": "find_hotel", + "description": "search for a hotel to stay in", + "is_transactional": false, + "required_slots": [], + "optional_slots": { + "hotel-pricerange": "dontcare", + "hotel-type": "dontcare", + "hotel-parking": "dontcare", + "hotel-bookday": "dontcare", + "hotel-bookpeople": "dontcare", + "hotel-bookstay": "dontcare", + "hotel-stars": "dontcare", + "hotel-internet": "dontcare", + "hotel-name": "dontcare", + "hotel-area": "dontcare" + } + }, + { + "name": "book_hotel", + "description": "book a hotel to stay in", + "is_transactional": true, + "required_slots": [], + "optional_slots": { + "hotel-pricerange": "dontcare", + "hotel-type": "dontcare", + "hotel-parking": "dontcare", + "hotel-bookday": "dontcare", + "hotel-bookpeople": "dontcare", + "hotel-bookstay": "dontcare", + "hotel-stars": "dontcare", + "hotel-internet": "dontcare", + "hotel-name": "dontcare", + "hotel-area": "dontcare" + } + } + ] + }, + { + "service_name": "train", + "slots": [ + { + "name": "train-arriveby", + "description": "arrival time of the train", + "possible_values": [], + "is_categorical": false + }, + { + "name": "train-departure", + "description": "departure location of the train", + "possible_values": [ + "birmingham new street", + "bishops stortford", + "broxbourne", + "cambridge", + "ely", + "kings lynn", + 
"leicester", + "london kings cross", + "london liverpool street", + "norwich", + "peterborough", + "stansted airport", + "stevenage" + ], + "is_categorical": true + }, + { + "name": "train-day", + "description": "day of the train", + "possible_values": [ + "monday", + "tuesday", + "wednesday", + "thursday", + "friday", + "saturday", + "sunday" + ], + "is_categorical": true + }, + { + "name": "train-bookpeople", + "description": "how many train tickets you need", + "possible_values": [ + "0", + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "15" + ], + "is_categorical": true + }, + { + "name": "train-leaveat", + "description": "leaving time for the train", + "possible_values": [], + "is_categorical": false + }, + { + "name": "train-destination", + "description": "destination of the train", + "possible_values": [ + "birmingham new street", + "bishops stortford", + "broxbourne", + "cambridge", + "ely", + "kings lynn", + "leicester", + "london kings cross", + "london liverpool street", + "norwich", + "peterborough", + "stansted airport", + "stevenage" + ], + "is_categorical": true + }, + { + "name": "train-trainid", + "description": "id of the train", + "is_categorical": false + }, + { + "name": "train-ref", + "description": "reference number of the train booking", + "is_categorical": false + }, + { + "name": "train-price", + "description": "price of the train", + "is_categorical": false + }, + { + "name": "train-duration", + "description": "duration of the travel", + "is_categorical": false + } + ], + "description": "find trains that take you to places", + "intents": [ + { + "name": "find_train", + "description": "search for trains that take you places", + "is_transactional": false, + "required_slots": [], + "optional_slots": { + "train-destination": "dontcare", + "train-arriveby": "dontcare", + "train-departure": "dontcare", + "train-day": "dontcare", + "train-bookpeople": "dontcare", + "train-leaveat": "dontcare" + } + }, + { + "name": "book_train", + "description": "book train tickets", + "is_transactional": true, + "required_slots": [], + "optional_slots": { + "train-destination": "dontcare", + "train-arriveby": "dontcare", + "train-departure": "dontcare", + "train-day": "dontcare", + "train-bookpeople": "dontcare", + "train-leaveat": "dontcare" + } + } + ] + }, + { + "service_name": "attraction", + "slots": [ + { + "name": "attraction-area", + "description": "area to search for attractions", + "possible_values": [ + "centre", + "east", + "north", + "south", + "west" + ], + "is_categorical": true + }, + { + "name": "attraction-name", + "description": "name of the attraction", + "possible_values": [], + "is_categorical": false + }, + { + "name": "attraction-type", + "description": "type of the attraction", + "possible_values": [ + "architecture", + "boat", + "cinema", + "college", + "concerthall", + "entertainment", + "museum", + "multiple sports", + "nightclub", + "park", + "swimmingpool", + "theatre" + ], + "is_categorical": true + }, + { + "name": "attraction-entrancefee", + "description": "how much is the entrance fee", + "is_categorical": false + }, + { + "name": "attraction-openhours", + "description": "open hours of the attraction", + "is_categorical": false + }, + { + "name": "attraction-address", + "description": "address of the attraction", + "is_categorical": false + }, + { + "name": "attraction-phone", + "description": "phone number of the attraction", + "is_categorical": false + }, + { + "name": "attraction-postcode", + "description": "postal code of 
the attraction", + "is_categorical": false + } + ], + "description": "find touristy stuff to do around you", + "intents": [ + { + "name": "find_attraction", + "description": "search for places to see for leisure", + "is_transactional": false, + "required_slots": [], + "optional_slots": { + "attraction-area": "dontcare", + "attraction-name": "dontcare", + "attraction-type": "dontcare" + } + } + ] + }, + { + "service_name": "restaurant", + "slots": [ + { + "name": "restaurant-pricerange", + "description": "price budget for the restaurant", + "possible_values": [ + "cheap", + "expensive", + "moderate" + ], + "is_categorical": true + }, + { + "name": "restaurant-area", + "description": "area or place of the restaurant", + "possible_values": [ + "centre", + "east", + "north", + "south", + "west" + ], + "is_categorical": true + }, + { + "name": "restaurant-food", + "description": "the cuisine of the restaurant you are looking for", + "is_categorical": false + }, + { + "name": "restaurant-name", + "description": "name of the restaurant", + "possible_values": [], + "is_categorical": false + }, + { + "name": "restaurant-bookday", + "description": "day of the restaurant booking", + "possible_values": [ + "monday", + "tuesday", + "wednesday", + "thursday", + "friday", + "saturday", + "sunday" + ], + "is_categorical": true + }, + { + "name": "restaurant-bookpeople", + "description": "how many people for the restaurant reservation", + "possible_values": [ + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8" + ], + "is_categorical": true + }, + { + "name": "restaurant-booktime", + "description": "time of the restaurant booking", + "possible_values": [], + "is_categorical": false + }, + { + "name": "restaurant-address", + "description": "address of the restaurant", + "is_categorical": false + }, + { + "name": "restaurant-phone", + "description": "phone number of the restaurant", + "is_categorical": false + }, + { + "name": "restaurant-postcode", + "description": "postal code of the restaurant", + "is_categorical": false + }, + { + "name": "restaurant-ref", + "description": "reference number of the restaurant booking", + "is_categorical": false + } + ], + "description": "find places to dine and whet your appetite", + "intents": [ + { + "name": "find_restaurant", + "description": "search for places to wine and dine", + "is_transactional": false, + "required_slots": [], + "optional_slots": { + "restaurant-pricerange": "dontcare", + "restaurant-area": "dontcare", + "restaurant-food": "dontcare", + "restaurant-name": "dontcare", + "restaurant-bookday": "dontcare", + "restaurant-bookpeople": "dontcare", + "restaurant-booktime": "dontcare" + } + }, + { + "name": "book_restaurant", + "description": "book a table at a restaurant", + "is_transactional": true, + "required_slots": [], + "optional_slots": { + "restaurant-pricerange": "dontcare", + "restaurant-area": "dontcare", + "restaurant-food": "dontcare", + "restaurant-name": "dontcare", + "restaurant-bookday": "dontcare", + "restaurant-bookpeople": "dontcare", + "restaurant-booktime": "dontcare" + } + } + ] + }, + { + "service_name": "hospital", + "slots": [ + { + "name": "hospital-department", + "description": "type of medical care", + "possible_values": [], + "is_categorical": false + }, + { + "name": "hospital-address", + "description": "address of the hospital", + "is_categorical": false + }, + { + "name": "hospital-phone", + "description": "phone number of the hospital", + "is_categorical": false + }, + { + "name": "hospital-postcode", + "description": 
"postal code of the hospital", + "is_categorical": false + } + ], + "description": "making you feel better when you are ill", + "intents": [ + { + "name": "find_hospital", + "description": "search for a medical facility or a doctor", + "is_transactional": false, + "required_slots": [], + "optional_slots": { + "hospital-department": "dontcare" + } + } + ] + }, + { + "service_name": "taxi", + "slots": [ + { + "name": "taxi-leaveat", + "description": "leaving time of taxi", + "possible_values": [], + "is_categorical": false + }, + { + "name": "taxi-destination", + "description": "destination of taxi", + "possible_values": [], + "is_categorical": false + }, + { + "name": "taxi-departure", + "description": "departure location of taxi", + "possible_values": [], + "is_categorical": false + }, + { + "name": "taxi-arriveby", + "description": "arrival time of taxi", + "possible_values": [], + "is_categorical": false + }, + { + "name": "taxi-type", + "description": "car type of the taxi", + "is_categorical": false + }, + { + "name": "taxi-phone", + "description": "phone number of the taxi", + "is_categorical": false + } + ], + "description": "rent cheap cabs to avoid traffic", + "intents": [ + { + "name": "book_taxi", + "description": "book taxis to travel between places", + "is_transactional": true, + "required_slots": [], + "optional_slots": { + "taxi-leaveat": "dontcare", + "taxi-destination": "dontcare", + "taxi-departure": "dontcare", + "taxi-arriveby": "dontcare" + } + } + ] + }, + { + "service_name": "bus", + "slots": [ + { + "name": "bus-departure", + "description": "departure location of bus", + "possible_values": [ + "cambridge" + ], + "is_categorical": false + }, + { + "name": "bus-destination", + "description": "destination of bus", + "possible_values": [ + "london kings cross", + "bishops stortford", + "cambridge", + "kohinoor" + ], + "is_categorical": false + }, + { + "name": "bus-leaveat", + "description": "leaving time of bus", + "is_categorical": false + }, + { + "name": "bus-day", + "description": "day to use the bus tickets", + "possible_values": [ + "wednesday" + ], + "is_categorical": true + } + ], + "description": "bus service for traveling", + "intents": [ + { + "name": "find_bus", + "description": "search for a bus", + "is_transactional": false, + "required_slots": [], + "optional_slots": { + "bus-departure": "dontcare", + "bus-destination": "dontcare", + "bus-day": "dontcare", + "bus-leaveat": "dontcare" + } + } + ] + }, + { + "service_name": "police", + "slots": [ + { + "name": "police-address", + "description": "address of the police station", + "is_categorical": false + }, + { + "name": "police-phone", + "description": "phone number of the police station", + "is_categorical": false + }, + { + "name": "police-postcode", + "description": "postal code of the police station", + "is_categorical": false + }, + { + "name": "police-name", + "description": "name of the police station", + "possible_values": [ + "parkside police station" + ], + "is_categorical": true + } + ], + "description": "police station", + "intents": [ + { + "name": "police", + "description": "search for police station", + "is_transactional": false, + "required_slots": [], + "optional_slots": { + "police-name": "dontcare" + } + } + ] + } +] diff --git a/scripts/user_model_code/interaction/utils.py b/scripts/user_model_code/interaction/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..19ced3d04372ba30c60d591b14ebab3992881b8f --- /dev/null +++ 
b/scripts/user_model_code/interaction/utils.py @@ -0,0 +1,308 @@ +import json +import re + + +def segment_gen(gen, dial_id): + def _color(_segment): + if tag == "CTX": + _segment = _segment.replace(" ", f"{bcolors.ENDC}") + _segment = _segment.replace(" ", f"{bcolors.ENDC}") + _segment = _segment.replace(" ", f"USR: {bcolors.OKCYAN}") + _segment = _segment.replace(" ", f"SYS: {bcolors.OKBLUE}") + if tag == "SYS_UTT": + _segment = f"{bcolors.OKBLUE}" + _segment + f"{bcolors.ENDC}" + if tag == "USR_UTT": + _segment = f"{bcolors.OKCYAN}" + _segment + f"{bcolors.ENDC}" + if tag in ["SYS_ACT", "USR_ACT", "GOAL"]: + _segment = _segment.replace(" ", f"{bcolors.RED}") + _segment = _segment.replace(" ", f"{bcolors.ENDC}") + _segment = _segment.replace(" ", f"{bcolors.YELLOW}") + _segment = _segment.replace(" ", f"{bcolors.ENDC}") + _segment = _segment.replace(" ", f"{bcolors.GREEN}") + _segment = _segment.replace(" ", f"{bcolors.ENDC}") + if tag == "GOAL": + _segment = _segment.replace( + "", f"{bcolors.UNDERLINE}" + ) + _segment = _segment.replace("", f"{bcolors.ENDC}") + _segment = _segment.replace("", f"{bcolors.UNDERLINE}") + _segment = _segment.replace("", f"{bcolors.ENDC}") + # if tag in ["SNT", "GC"]: + # segment = segment.replace("<{}/> ".format(tag), "<{}/> *".format(tag)) + # segment = segment.replace(" ".format(tag), "* <{}/>".format(tag)) + return _segment + + assert isinstance(gen, str) + # gen = gen.split() + # print(gen) + print("*** Dial_id: {} ***".format(dial_id)) + for tag in [ + "CTX", + "SYS_UTT", + "SYS_ACT", + "GOAL", + "SNT", + "RA", + "GC", + "USR_ACT", + "USR_UTT", + ]: + segment = find_segment(gen, tag) + if segment is not None: + print('{} -> "{}"'.format(tag, _color(segment))) + else: + print("Fail to find the segment...") + print("GEN:", gen) + print("---" * 30) + + +# input("press...") + + +def get_original_act_set(): + # full act vocab: + # https://github.com/ConvLab/ConvLab/blob/master/data/multiwoz/annotation/Multiwoz%20data%20analysis.md#dialog-act + acts = set() + acts.add("Inform") + acts.add("Request") + acts.add( + "NoOffer" + ) # equivalent to the concept of `no matching`, `cannot find` in database + acts.add("Recommend") + acts.add("Select") + acts.add( + "OfferBook" + ) # only for `train` domain, ask if book is needed, equivalent to `Booking-Inform` with [[none, none]] + # args in restaurant/hotel domain + acts.add( + "OfferBooked" + ) # only for `train` domain, inform booking is complete, with corresponding info (such as ref number) + acts.add("Book") # inform booking is successful, equivalent to `OfferBooked` above + acts.add( + "NoBook" + ) # inform booking fails, might because of no availability, usually come together act `request` + acts.add("bye") + acts.add("greet") + acts.add("reqmore") + acts.add("welcome") + acts.add("thank") + return acts + + +def get_act_natural_language(act): + if act in ["bye", "greet", "reqmore", "welcome", "thank"]: + return act + + assert act[0].isupper() + tokens = re.findall("[A-Z][^A-Z]*", act) # e.g., `FindEvents` -> `Find Events` + tokens = list(map(str.lower, tokens)) # lower case, -> `find events` + act_nl = " ".join(tokens) + return act_nl + + +def convert_act_into_sgd(act, SPECIAL_TOKENS): + # TODO: check inference result to see if mapping on NoOffer, OfferBook and NoBook are fine + """ + convert multiwoz acts (w/o domain info) into sgd acts ensure that acts with same concept use one name + e.g., Book (OfferBooked) -> NOTIFY_SUCCESS, NoBook -> NOTIFY_FAILURE + """ + if act == "NoOffer": + act = "NOTIFY_FAILURE" + + 
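+    # The remaining branches complete the mapping, e.g. (illustrative): "NoBook" -> "NOTIFY_FAILURE",
+    # "OfferBooked"/"Book" -> "NOTIFY_SUCCESS", "bye" -> "GOODBYE", "reqmore" -> "REQ_MORE".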
elif act == "Recommend": + act = "OFFER" + + # technically, `OfferBook` is equivalent to (`act=OFFER_INTENT, slot=intent, value=ReserveRestaurant`) + # on system side in sgd since (1) the conversion is not trivial (completely different representations) + # and (2) multiwoz has no slot called `intent` one cannot simply convert `OfferBook` to `OFFER_INTENT` + # we thus keep the act as is + # note that there is no slot `intent` and value conveying intents in multiwoz + elif act == "OfferBook": + act = "Offer_Book" + + elif act == "OfferBooked": + act = "NOTIFY_SUCCESS" + + elif act == "Book": # same as `OfferBooked` + act = "NOTIFY_SUCCESS" + + elif act == "NoBook": + act = "NOTIFY_FAILURE" + + elif act == "bye": + act = "GOODBYE" + + elif act == "reqmore": + act = "REQ_MORE" + + elif act == "thank": + act = "THANK_YOU" + # elif act == "greet": + # elif act == "welcome": + act = act.upper() # align with sgd acts, e.g., `Inform` -> `INFORM` + + # check if valid + assert "_{}_".format(act) in SPECIAL_TOKENS["additional_special_tokens"] + return act + + +def load_schema(schema_file): + def _update(key, value, mapping): + if key in mapping: + assert ( + value == mapping[key] + ) # ensure service meta is the same between data splits + else: + mapping[key] = value + + def _restructure_service_meta(service_meta, attribute): + """ "convert slot/intent metadata list into dict(slot/intent=metadata)""" + assert attribute in ["slots", "intents"] + mapping = {} + for value in service_meta[attribute]: + key = value["name"] + if attribute == "slots": # domain-slot in multiwoz + assert "-" in key + _, key = key.split("-") # domain, slot + key = normalise_slot(key) + else: # intent + key = normalise_intent(key) + mapping[key] = value + service_meta[attribute] = mapping + + with open(schema_file) as f: + data = json.load(f) + + SERVICE2META = {} + SLOTS, INTENTS = set(), set() + for service_meta in data: + service = service_meta["service_name"] + _restructure_service_meta(service_meta, "slots") + _restructure_service_meta(service_meta, "intents") + _update(service, service_meta, SERVICE2META) + + # collect domain-independent slots + # for domain_slot in service_meta["slots"]: + # assert "-" in domain_slot + # domain, slot = domain_slot.split("-") + # slot = normalise_slot(slot) + # SLOTS.add(slot) + for slot in service_meta["slots"]: + SLOTS.add(slot) + + for intent in service_meta["intents"]: + # intent = normalise_intent(intent) + INTENTS.add(intent) + + print("Load schema, intents: {}, slots: {}".format(len(INTENTS), len(SLOTS))) + return SERVICE2META, INTENTS, SLOTS + + +def normalise_intent(intent): + """convert intent into natural language, e.g., find_hotel -> find hotel""" + if intent == "police": + intent = "find_police" + if intent == "book_taxi": + intent = "find_taxi" + assert "_" in intent + return " ".join(intent.split("_")) + + +def normalise_slot(slot): + if slot == "pricerange": + return "price range" + + elif slot == "bookday": + return "book day" + + elif slot == "bookpeople": + return "book people" + + elif slot == "booktime": + return "book time" + + elif slot == "bookstay": + return "book stay" + + elif slot == "ref": + return "reference" + + elif slot == "arriveby": + return "arrive by" + + elif slot == "leaveat": + return "leave at" + + elif slot == "trainid": + return "train id" + + elif slot == "openhours": + return "open hours" + + elif slot == "entrancefee": + return "entrance fee" + + elif slot in ["none", "?"]: + # return "_Empty_" # special token mark will be added during sequence 
linearlisation + return "Empty" + + else: + return slot + + +def normalise_value(value): + # deal with binary and empty values + if value == "yes": + # return "_True_" + return "True" + + elif value == "no": + # return "_False_" + return "False" + + elif value in ["none", "?"]: + # return "_Empty_" + return "Empty" + + # if value == "swimmingpool": # for simplicity, dont split + # return "swimming pool" + + else: + return value + + +def wrap_element(content_type, content): + """ + wrap elements such as slot, value, e.g., slot + """ + assert "/" not in content_type + return "<{}/> {} ".format(content_type, content, content_type) + + +def add_str(str1, str2): + return str1 + " " + str2 + + +def find_segment(gen, tag): + assert isinstance(gen, str) + gen = gen.split() + try: + start = gen.index("<{}/>".format(tag)) + 1 + end = gen.index("".format(tag)) + segment = " ".join(gen[start:end]) + except Exception: + print("Missing {} tag in generated sequence".format(tag)) + segment = None + return segment + + +class bcolors: + HEADER = "\033[95m" + OKBLUE = "\033[94m" + OKCYAN = "\033[96m" + GREEN = "\033[92m" + YELLOW = "\033[93m" + RED = "\033[91m" + ENDC = "\033[0m" + BOLD = "\033[1m" + UNDERLINE = "\033[4m" diff --git a/scripts/user_model_code/main_user_model.py b/scripts/user_model_code/main_user_model.py new file mode 100644 index 0000000000000000000000000000000000000000..120c16523bd673782b66f73f2cba40d66642fe40 --- /dev/null +++ b/scripts/user_model_code/main_user_model.py @@ -0,0 +1,347 @@ +import json +import random +import sys +import time + +import numpy as np +import torch +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler +from tqdm import tqdm +from transformers import ( + AdamW, + GPT2Config, + GPT2LMHeadModel, + GPT2Tokenizer, + get_linear_schedule_with_warmup, +) + +import wandb +from crazyneuraluser.user_model_code.argument import get_args + +# from interact import interact +from crazyneuraluser.user_model_code.dataset import SGD_Dataset +from crazyneuraluser.user_model_code.utils_generation import decode_e2e +from crazyneuraluser.user_model_code.utils_sgd import get_special_tokens + + +def print_loss(epoch, data_type, LOSS, t0): + print( + "Epoch: {} | {} loss: {:.3f} | time: {:.1f}".format( + epoch, data_type, LOSS, time.time() - t0 + ) + ) + + +def print_score(epoch, data_type, res, t0): + print( + "Epoch: {} | {}: joint_acc: {:.2f}%, slot_acc: {:.2f}% | time: {:.1f}".format( + epoch, + data_type, + res["avg_joint_acc"], + res["avg_slot_acc"], + time.time() - t0, + ) + ) + + +def run_one_epoch(data_type, dataloader, trainer, epoch, run_type, collector=None): + t0 = time.time() + assert data_type in ["dev", "test"] + assert run_type in ["teacher_force", "generation"] + model, optimizer, scheduler, tokenizer = trainer + + LOSS = 0 + # result = {"slot_acc": [], "joint_acc": []} + # mention_match = 0 + # coref_lines = [] + iterator = enumerate( + tqdm( + dataloader, + desc="Epoch {} {}".format(epoch, run_type), + disable=args.disable_display, + ) + ) + for step, batch in iterator: + if run_type == "teacher_force": + loss, logits, _ = model( + input_ids=batch["input_ids"], + attention_mask=batch["attention_mask"], + token_type_ids=batch["token_type_ids"], + labels=batch["label_ids"], + ).values() + LOSS += loss + else: + decode_e2e(args, batch, model, tokenizer, collector=collector) + + # print log + if run_type == "teacher_force": + LOSS /= step + 1 + print_loss(epoch, data_type, LOSS, t0) + return LOSS + else: # generation + # TODO: add evaluation code 
here + return None + + +def set_dataloader(args, tokenizer, data_type, run_type, data_size=-1): + dataset = SGD_Dataset( + args, tokenizer, data_type, run_type == "generation", data_size + ) + # sys.exit(1) + if data_type == "train": + sampler = RandomSampler( + dataset + ) # if args.local_rank == -1 else DistributedSampler(train_dataset) + else: + sampler = SequentialSampler(dataset) + + dataloader = DataLoader( + dataset, + sampler=sampler, + batch_size=args.train_batch_size + if data_type == "train" + else args.eval_batch_size, + collate_fn=dataset.collate_fn, + ) + return dataloader + + +def train(args, tokenizer, model): + + wandb.init( + # Set the project where this run will be logged + project="E2E User Simulator (Alistair)", + entity="byrne-lab", + # We pass a run name (otherwise it’ll be randomly assigned, like sunshine-lollypop-10) + name=args.wandb_train_run_name, + # Track hyperparameters and run metadata + config={ + "data_dir": args.data_dir, + "model_name": args.model_name, + "learning_rate": args.learning_rate, + "gradient_accumulation_steps": args.gradient_accumulation_steps, + "train_batch_size": args.train_batch_size, + "eval_batch_size": args.eval_batch_size, + }, + ) + + # load data + train_dataloader = set_dataloader( + args, tokenizer, "train", "teacher_force", data_size=args.train_size + ) + dev_dataloader = set_dataloader( + args, tokenizer, "dev", "teacher_force", data_size=args.eval_size + ) + + optimizer = AdamW(model.parameters(), lr=args.learning_rate, eps=args.adam_epsilon) + if args.use_scheduler: + t_total = ( + len(train_dataloader) // args.gradient_accumulation_steps * args.max_epoch + ) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) + else: + scheduler = None + trainer = (model, optimizer, scheduler, tokenizer) + + print("Do evaluation before training!") + model.eval() + with torch.no_grad(): + _ = run_one_epoch("dev", dev_dataloader, trainer, -1, "teacher_force") + + print("Start training!\n{}".format("***" * 30)) + eval_step = args.eval_interval // args.train_batch_size + best_score = -100 + global_step = 0 + no_improve_count = 0 + for epoch in range(args.max_epoch): + # initialize for each epoch training + t0 = time.time() + model.train() + model.zero_grad() + LOSS = 0 + iterator = enumerate( + tqdm( + train_dataloader, + desc="Epoch {}".format(epoch), + disable=args.disable_display, + ) + ) + for local_step, batch in iterator: + loss, logits, _ = model( + input_ids=batch["input_ids"], + attention_mask=batch["attention_mask"], + token_type_ids=batch["token_type_ids"], + labels=batch["label_ids"], + ).values() + LOSS += loss + global_step += 1 + + wandb.log({"loss": loss}) + + # update model + if loss != 0: + loss = loss / args.gradient_accumulation_steps + loss.backward() + + if global_step % args.gradient_accumulation_steps == 0: + # norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) + optimizer.step() + if args.use_scheduler: + scheduler.step() + optimizer.zero_grad() + + # evaluate model + if global_step % eval_step == 0: + model.eval() + with torch.no_grad(): + loss = run_one_epoch( + "dev", dev_dataloader, trainer, epoch, "teacher_force" + ) + score = -loss # dev loss as criterion for early training + wandb.log({"dev_loss": loss}) + model.train() + + save_checkpoint( + args, tokenizer, model, global_step * args.train_batch_size + ) + if score > best_score: + best_score = score + print("Best score: {:.2f}".format(best_score)) + 
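+                        # an improvement in (negative) dev loss resets the patience counter below;
+                        # otherwise it is incremented and training stops once it reaches args.no_improve_max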
no_improve_count = 0 + else: + no_improve_count += 1 + + # early stop + if no_improve_count == args.no_improve_max: + print("Early stop!") + return + + LOSS /= local_step + 1 + print_loss(epoch, "train", LOSS, t0) + print("***" * 30) + + wandb.log({"epoch": epoch, "epoch_loss": LOSS}) + + # Mark the run as finished on wandb + wandb.finish() + + +def test(args, tokenizer, model): + # load data + test_gen_dataloader = set_dataloader(args, tokenizer, "test", "generation") + + trainer = (model, None, None, tokenizer) + model.eval() + collector = {"decode-dev": {}, "decode-test": {}} + with torch.no_grad(): + # # evaluate on dev + # _ = run_one_epoch('dev', dev_dataloader, trainer, 'Eval', 'teacher_force') + + # # generate on dev + # res_dev = run_one_epoch('dev', dev_gen_dataloader, trainer, 'Dev', 'generation', + # collector=collector['decode-dev']) + # collector['result-dev'] = res_dev + # print_qr_result(res_dev['qr'], 'dev') + + # generate on test + res_test = run_one_epoch( + "test", + test_gen_dataloader, + trainer, + "Test", + "generation", + collector=collector["decode-test"], + ) + collector["result-test"] = res_test + + out_file = args.decode_file + with open(out_file, "w") as f: + json.dump(collector, f, indent=4, sort_keys=True) + print("Decode file is saved at {}".format(out_file)) + print("Done decoding!") + + +def save_checkpoint(args, tokenizer, model, step): + save_path = args.checkpoint + "_step" + str(step) + print("Save model in {}!".format(save_path)) + tokenizer.save_pretrained(save_path) + model.save_pretrained(save_path) + + +def load_checkpoint(args): + save_path = args.checkpoint # + '_step' + str(args.step) + print("Load model, tokenizer from {}".format(save_path)) + tokenizer = GPT2Tokenizer.from_pretrained(save_path) + model = GPT2LMHeadModel.from_pretrained(save_path) + model.to(args.device) + return tokenizer, model + + +def load_pretrained_model(args): + save_path = args.pre_checkpoint + print("Load model, tokenizer from {}".format(save_path)) + tokenizer = GPT2Tokenizer.from_pretrained(save_path) + model = GPT2LMHeadModel.from_pretrained(save_path) + model.to(args.device) + return tokenizer, model + + +def set_model(args, SPECIAL_TOKENS): + """initiate config, tokenizer and model""" + # add special tokens into tokenizer + config = GPT2Config.from_pretrained(args.model_name_or_path) + tokenizer = GPT2Tokenizer.from_pretrained(args.model_name_or_path) + tokenizer.add_special_tokens(SPECIAL_TOKENS) + model = GPT2LMHeadModel.from_pretrained( + args.model_name_or_path, config=config + ) # GPT2LMHeadModel + model.resize_token_embeddings(len(tokenizer)) + model.to(args.device) + print("Done setting model") + return config, tokenizer, model + + +def set_seed(args): + """for reproduction""" + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.enabled = False + torch.backends.cudnn.benchmark = False + + +if __name__ == "__main__": + # Load arguments + args = get_args() + + # Set seed, device + set_seed(args) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + args.device = device + + # Load special tokens + SPECIAL_TOKENS = get_special_tokens() + + if args.mode == "training": + config, tokenizer, model = set_model(args, SPECIAL_TOKENS) + train(args, tokenizer, model) + + elif args.mode == "finetune": + tokenizer, model = load_pretrained_model(args) + train(args, tokenizer, 
model) + + elif args.mode == "testing": + tokenizer, model = load_checkpoint(args) + test(args, tokenizer, model) + + # elif args.mode == 'interact': + # tokenizer, model = load_checkpoint(args) + # interact(args, tokenizer, model) + + else: + sys.exit(1) diff --git a/scripts/user_model_code/preprocess_multiwoz.py b/scripts/user_model_code/preprocess_multiwoz.py new file mode 100644 index 0000000000000000000000000000000000000000..cf708402a9f3bf3320e97b52a5a73df0398d8f36 --- /dev/null +++ b/scripts/user_model_code/preprocess_multiwoz.py @@ -0,0 +1,528 @@ +import json +import os +import sys + +from tqdm import tqdm + +from crazyneuraluser.user_model_code.analysis_multiwoz import DATA_SPLIT, collect_data +from crazyneuraluser.user_model_code.utils_multiwoz import ( + get_act_natural_language, + get_original_act_set, + load_schema, + normalise_intent, + normalise_slot, + normalise_value, +) +from crazyneuraluser.user_model_code.utils_sgd import ( + add_str, + compare_slot_values_in_state, + conv_special_token, + dict2list, + get_special_tokens, + wrap_element, +) + +""" pre-process script for MultiWOZ v2.2 """ + + +class DialMetaData: + def __init__(self, dial_id, dial_meta, dial_act, unify_act): + self.dial_id = dial_id + self.unify_act = unify_act + self.turn_meta_list, self.scenario = self.parse( + dial_meta, dial_act + ) # None for system turn + self.linearise_turns() + + def parse(self, dial_meta, dial_act): + global n, act_intent, non_intent + assert len(dial_meta["turns"]) == len(dial_act) + + turn_meta_list = [] + scenario = [] + sys_turn = None # dummy sys turn for first usr turn + prev_intent = "" + prev_usr_turn, prev_usr_turn_meta = ( + None, + None, + ) # dummpy for tracing goal change at first turn + for turn_id, turn in enumerate(dial_meta["turns"]): + assert turn_id == int(turn["turn_id"]) + + if turn["speaker"] == "SYSTEM": + sys_turn = turn + turn_meta_list.append(None) + continue + + # init turn meta + turn_meta = TurnMetaData( + prev_intent, sys_turn, turn, self.dial_id, self.unify_act + ) + + # get goal change label + turn_meta.get_goal_change_label(prev_usr_turn, prev_usr_turn_meta) + + # update previous goal + for prev_turn_meta in reversed(turn_meta_list): + if prev_turn_meta is None: + continue + prev_turn_meta.accumulate_constraints(turn_meta) # TODO: check goal + + # record task (intent) in scenario + if turn_meta.usr_intent not in scenario: + scenario.append(turn_meta.usr_intent) + + turn_meta_list.append(turn_meta) + prev_intent = turn_meta.usr_intent + prev_usr_turn, prev_usr_turn_meta = turn, turn_meta + assert len(turn_meta_list) == len(dial_meta["turns"]) + return turn_meta_list, scenario + + def linearise_turns(self): + # linearise necessary meterials + for turn_meta in self.turn_meta_list: + if turn_meta is None: + continue + turn_meta._linearise(self.scenario, SERVICE2META) + + +class TurnMetaData: + def __init__(self, prev_intent, sys_turn, usr_turn, dial_id, unify_act): + self.dial_id = dial_id + self.unify_act = unify_act + self.original_act_set = get_original_act_set() # act set w/o domain information + self.sys_turn, self.usr_turn = sys_turn, usr_turn + + # turn id + self.sys_turn_id, self.usr_turn_id = self._get_turn_id(sys_turn, usr_turn) + + # intent + self.usr_intent = normalise_intent(self._get_intent(usr_turn, prev_intent)) + if remove_book_intent: + self.usr_intent = self.usr_intent.replace("book", "find") + assert self.usr_intent in INTENTS # or self.usr_intent == "temp temp" + self.service = self.usr_intent.split()[1] + + # utterances + 
self.utt = {} + self.utt["sys"], self.utt["usr"] = self._get_utt(sys_turn), self._get_utt( + usr_turn + ) + + # act + self.act2sv = {} + self.act2sv["sys"], _ = self._parse_action(self.sys_turn_id, self.sys_turn) + self.act2sv["usr"], self.usr_constraints = self._parse_action( + self.usr_turn_id, self.usr_turn + ) + + # task boundary + self._get_new_task_label(prev_intent) + + # req_alts + self._get_req_alts_label() + + def _get_turn_id(self, sys_turn, usr_turn): + usr_turn_id = int(usr_turn["turn_id"]) # 0, 2, 4 ... + sys_turn_id = int(sys_turn["turn_id"]) if sys_turn is not None else -1 + assert sys_turn_id == (usr_turn_id - 1) + return sys_turn_id, usr_turn_id + + def _get_utt(self, turn): + if turn is None: + return "" + return turn["utterance"] + + def accumulate_constraints(self, new_turn_meta): + """ + Add slot, slot-value pairs from a given following turn + This function forms the user goal by accumulating constraints backward + """ + # only accumulate constraints with the same task/intent + if new_turn_meta.usr_intent != self.usr_intent: + return + + if ( + new_turn_meta.goal_change + ): # if goal changes at a new turn, these constraints should not be put in previous turns + return + + # only accumulate constraints without goal change + # if the value of a slot is changed (goal change) in a new turn, + # this slot-value pair is not part of initial goal and should not be added into the goal of previous turns + new_constraints = new_turn_meta.usr_constraints + self.usr_constraints["requestable"] = self.usr_constraints["requestable"].union( + new_constraints["requestable"] + ) + for slot, value_list in new_constraints["informable"].items(): + if slot not in self.usr_constraints["informable"]: + self.usr_constraints["informable"][slot] = value_list + + def get_goal_change_label(self, prev_usr_turn, prev_turn_meta): + """check if goal changed (value of slot changes) between two turn states""" + # first usr turn + if prev_usr_turn is None: + assert self.usr_turn_id == 0 + self.goal_change = False + return + + # last usr turn + if "GOODBYE" in self.act2sv["usr"] or "THANK_YOU" in self.act2sv["usr"]: + self.goal_change = False + return + + assert self.usr_turn_id != 0 + assert prev_usr_turn["speaker"] == "USER" + + # new task + if self.usr_intent != prev_turn_meta.usr_intent: + self.goal_change = False + return + + # compare two states to obtain goal change flag + curr_state, prev_state = None, None + for frame in self.usr_turn["frames"]: + if frame["service"] == self.service: + curr_state = frame["state"]["slot_values"] + + for frame in prev_usr_turn["frames"]: + if frame["service"] == prev_turn_meta.service: + prev_state = frame["state"]["slot_values"] + + # check if slot value has changed at current turn (new slot is not counted) + assert curr_state is not None and prev_state is not None + self.goal_change = compare_slot_values_in_state(curr_state, prev_state) + + def _get_domain_from_act(self, dialogue_act): + """ + parse the raw dialouge act annotation to get domain info + number of doamin can be more than 1 for multi-domain turns + """ + domains = set() + book_flag = False + for dact, sv_pairs in dialogue_act.items(): + assert "-" in dact + domain, _ = dact.split("-") + if domain not in ["Booking", "general"]: + domains.add(domain) + for slot, value in sv_pairs: + if "book" in slot: # e.g., bookday + book_flag = True + return domains, book_flag + + def _get_intent(self, usr_turn, prev_intent): + intents = [] + for frame in usr_turn["frames"]: + # service = frame["service"] + intent 
= frame["state"]["active_intent"] + if intent != "NONE": + intents.append(intent) + + if len(intents) == 1: + intent = intents[0] + if intent == "find_taxi": + intent = "book_taxi" + return intent # tackle 51.5k out of 71.5k user turns + + # if above fails (e.g., due to wrong label), leverage usr act to help determine main intent/service + # possible domains in da: {'Hospital', 'Taxi', 'Train', 'Police', 'Restaurant', 'Booking', 'general', + # 'Attraction', 'Hotel'} + usr_act = data_act[self.dial_id][str(self.usr_turn_id)]["dialog_act"] + domains, book_flag = self._get_domain_from_act(usr_act) + if len(domains) == 1: + domain = list(domains)[0].lower() + if book_flag and domain in ["restaurant", "hotel", "train"]: + intent = "book_{}".format(domain) + elif domain == "taxi": + intent = "book_{}".format(domain) + else: + intent = "find_{}".format(domain) + return intent # tackle 58.1k out of 71.5k user turns + + if "Taxi" in domains: + return "book_taxi" # tackle 58.8k out of 71.5k user turns + + if ( + self.usr_turn_id == 0 + ): # wrong label at first turn, no previous intent to use, only 136 user turns here + utt = usr_turn["utterance"] + if ( + "restaurant" in utt + or "Restaurant" in utt + or "eat" in utt + or "din" in utt + ): + return "find_restaurant" + elif ( + "hotel" in utt + or "room" in utt + or "house" in utt + or "stay" in utt + or "live" in utt + ): + return "find_hotel" + else: + return "find_attraction" # tackle 58.9k out of 71.5k user turns + + else: # not first turn, leverage sys act to help decide intent + sys_act = data_act[self.dial_id][str(self.sys_turn_id)]["dialog_act"] + sys_domains, _ = self._get_domain_from_act(sys_act) + if len(sys_domains) == 1: + domain = list(sys_domains)[0].lower() + if book_flag and domain in ["restaurant", "hotel", "train"]: + intent = "book_{}".format(domain) + elif domain == "taxi": + intent = "book_{}".format(domain) + else: + intent = "find_{}".format(domain) + return intent # tackle 67.3k out of 71.5k user turns + + # two cases left enter here + # 1. turns with only general act, e.g., bye + # 2. 
turns have multiple intents (very few) + # both will be handled using previous intent + assert prev_intent != "" + intent = "_".join( + prev_intent.split() + ) # as prev_intent has been normalised already + return intent + + def _parse_action(self, turn_id, turn): + """parse the `dialog_act` field in `dialog_acts.json` + + Returns: + act2sv: act to slot value pairs, {act=sv}; sv: slot to value list, {slot=[v1, v2]} + """ + act2sv = dict() + constraints = {"informable": dict(), "requestable": set()} + if turn is None: + return None, constraints + + # get da from data_act + dialogue_act = data_act[self.dial_id][str(turn_id)]["dialog_act"] + # domains = set() + for dact, svs in dialogue_act.items(): + assert "-" in dact + if self.unify_act: # will use only act part without domain info + domain, act = dact.split( + "-" + ) # split `domain-act`, e.g., `hotel-inform` -> hotel, inform + else: # keep original mwoz act + act = dact # use act with domain info + + if self.unify_act: + # unify act: `Booking-Inform` with no args is equivalent to `OfferBook` in train domain + if dact == "Booking-Inform" and svs == [["none", "none"]]: + act = "OfferBook" + + # deal with act + if self.unify_act: + assert act in self.original_act_set + if turn["speaker"] == "USER": + assert act in ["Inform", "Request", "bye", "thank", "greet"] + act = get_act_natural_language(act) + + if act not in act2sv: + act2sv[act] = dict() + + # iterate slot value pairs + for slot, value in svs: + slot = normalise_slot(slot) + value = normalise_value(value) + + # act to slot value pairs + # NOTE: same slot might appear more than once per turn, e.g., when the system informs two hotels with + # their addresses so a value list is stored for each slot + if slot not in act2sv[act]: + act2sv[act][slot] = [] + act2sv[act][slot].append(value) + + # collect constraints + if act in ["REQUEST", "Request", "request"]: + constraints["requestable"].add(slot) + else: + if slot != "Empty": + if ( + slot not in constraints["informable"] + ): # NOTE: same reason as act, value list per slot + constraints["informable"][slot] = [] + constraints["informable"][slot].append(value) + return act2sv, constraints + + def _linearise(self, scenario, service2meta): + self.linear_act = {} + self.linear_act["sys"] = self._linearise_act(self.act2sv["sys"]) + self.linear_act["usr"] = self._linearise_act(self.act2sv["usr"]) + self.linear_goal = self._linearise_goal( + self.usr_constraints, scenario, service2meta + ) + + def _linearise_goal(self, constraints, scenario, service2meta): + """ + linearise goal representation which consists of several parts: + scenario, task (intent), task description, constraints with informable and requestable + e.g., task1 task2 .. + current task task description + slot1 value1 .. + slot1 slot2 .. 
+ """ + res = "" + # scenario + assert isinstance(scenario, list) and len(scenario) > 0 + scenario = " ".join( + [wrap_element("INTENT", intent) for intent in scenario] + ) # treat intent as nl + scenario_wrap = wrap_element("SCENARIO", scenario) + res = add_str(res, scenario_wrap) + + # task name + intent = self.usr_intent + assert intent in scenario + intent_wrap = wrap_element("TASK", intent) + res = add_str(res, intent_wrap) + + # task description + description = service2meta[self.service]["intents"][intent]["description"] + description_warp = wrap_element("DESC", description) + res = add_str(res, description_warp) + + # informable + informable = dict2list( + constraints["informable"] + ) # sorted sv pair list [slot=value] + res = add_str(res, "") + for sv_pair in informable: + slot, value = sv_pair.split("=") + if value in ["True", "False", "Empty"]: + value = conv_special_token(value, SPECIAL_TOKENS) + if slot in ["Empty"]: + slot = conv_special_token(slot, SPECIAL_TOKENS) + # slot + slot_wrap = wrap_element("SLOT", slot) + res = add_str(res, slot_wrap) + # value + value_wrap = wrap_element("VALUE", value) + res = add_str(res, value_wrap) + res = add_str(res, "") + + # requestable + requestable = sorted( + list(constraints["requestable"]) + ) # sorted slot list [slot] + res = add_str(res, "") + for slot in requestable: + slot_wrap = wrap_element("SLOT", slot) + res = add_str(res, slot_wrap) + res = add_str(res, "") + return res[1:] # remove first space + + def _linearise_act(self, act2sv): + """ + NOTE: 1) split slot/value if "_"; 2) special tokens of acts; 3) empty slot or empty value + NOTE: filer too many values (e.g., 10 movie names) but make sure the one the user chose is present + + Return: ordered (slots sorted within act, acts sorted) linearised act sequence, + e.g., area Cambridge ... 
+ e.g., _Empty_ _Empty_ + """ + res = "" + if act2sv is None: + return res + + for act in sorted(act2sv.keys()): # sort act + sv = act2sv[act] # dict{slot: value_list} + act_wrap = wrap_element("ACT", act) + res = add_str(res, act_wrap) + + sorted_sv = dict2list( + sv + ) # sorted sv list, [s1=v1, s2=v2], note slot can repeat + for sv_pair in sorted_sv: + slot, value = sv_pair.split("=") + if value in ["True", "False", "Empty"]: + value = conv_special_token(value, SPECIAL_TOKENS) + if slot in ["Empty"]: + slot = conv_special_token(slot, SPECIAL_TOKENS) + + # slot + slot_wrap = wrap_element("SLOT", slot) + res = add_str(res, slot_wrap) + + # value + value_wrap = wrap_element("VALUE", value) + res = add_str(res, value_wrap) + + return res[1:] # remove first space + + def _get_new_task_label(self, prev_intent): + """ + get a binary label indicating if a turn starts a new task (intent) in dialogue + """ + assert prev_intent != "NONE" and self.usr_intent != "NONE" + if self.usr_intent != prev_intent: + self.start_new_task = True + else: + self.start_new_task = False + + def _get_req_alts_label(self): + self.req_alts = False # no request alternative in mwoz + + +def collect_examples(dial_id, dial_meta, examples): + num = 0 + examples[dial_id] = {} + for turn_meta in dial_meta.turn_meta_list: + if turn_meta is None: # sys turn + continue + + example_id = "{}-{}".format(dial_id, num) + example = { + "utterances": turn_meta.utt, + "actions": turn_meta.linear_act, + "goal": turn_meta.linear_goal, + "service": turn_meta.service, + "intent": turn_meta.usr_intent, + "goal_change": turn_meta.goal_change, + "start_new_task": turn_meta.start_new_task, + "req_alts": turn_meta.req_alts, + } + examples[dial_id][example_id] = example + num += 1 + + +def prepare_data_seq(unify_act, out_data_path): + for split in DATA_SPLIT: + examples = {} + for dial_num, dial_id in enumerate(tqdm(sorted(data[split].keys()))): + dial = data[split][dial_id] + dial_act = data_act[dial_id] + + dial_meta = DialMetaData(dial_id, dial, dial_act, unify_act) + collect_examples(dial_id, dial_meta, examples) + + with open("{}/{}.json".format(out_data_path, split), "w") as f: + json.dump(examples, f, sort_keys=True, indent=4) + print("Done process {} {} dialogues".format(split, len(examples))) + + +if __name__ == "__main__": + if len(sys.argv) == 1: + print("Wrong argument!") + print("usage: python utils/preprocess_multiwoz.py multiwoz2.2-data-path") + sys.exit(1) + + # Set data path + data_path = sys.argv[1] + out_data_path = "./data/preprocessed/user_model" + os.makedirs(out_data_path, exist_ok=True) + + # Control flags + unify_act = True + remove_book_intent = True + + # Load data and material as global var + SERVICE2META, INTENTS, SLOTS = load_schema(os.path.join(data_path, "schema.json")) + SPECIAL_TOKENS = get_special_tokens() + data, data_act = collect_data(data_path, remove_dial_switch=False) + + prepare_data_seq(unify_act, out_data_path) diff --git a/scripts/user_model_code/preprocess_sgd.py b/scripts/user_model_code/preprocess_sgd.py new file mode 100644 index 0000000000000000000000000000000000000000..451af9d8341b1ad7eae17966f3291f73cefc677f --- /dev/null +++ b/scripts/user_model_code/preprocess_sgd.py @@ -0,0 +1,431 @@ +import json +import os +import sys + +from tqdm import tqdm + +from crazyneuraluser.user_model_code.analysis_sgd import DATA_SPLIT, collect_data +from crazyneuraluser.user_model_code.utils_sgd import ( + add_str, + compare_slot_values_in_state, + dict2list, + get_special_tokens, + get_turn_intent, + 
load_schema, + split_intent, + wrap_element, +) + +"""pre-processing script for SGD + +The annotations for a turn are grouped into frames, where each frame corresponds to a single service +The values of "slot_values" in user "state" is a list, where spoken variations are considered, e.g., tomorrow, 8/2 +""" + + +class DialMetaData: + def __init__(self, dial_id, dial): + self.dial_id = dial_id + self.turn_meta_list, self.scenario = self.parse(dial) # None for system turn + self.linearise_turns() + + def parse(self, dial): + turn_meta_list = [] + scenario = [] + sys_turn = None # dummy sys turn for first usr turn + prev_intent = "" + prev_usr_turn, prev_usr_turn_meta = ( + None, + None, + ) # dummpy for tracing goal change at first turn + for turn_id, turn in enumerate(dial["turns"]): + if turn["speaker"] == "SYSTEM": + sys_turn = turn + turn_meta_list.append(None) + continue + + # init turn meta + turn_meta = TurnMetaData(prev_intent, sys_turn, turn, self.dial_id) + + # get goal change label + turn_meta.get_goal_change_label(prev_usr_turn, prev_usr_turn_meta) + + # update previous goal + for prev_turn_meta in reversed(turn_meta_list): + if prev_turn_meta is None: + continue + prev_turn_meta.accumulate_constraints(turn_meta) + + # record task (intent) in scenario + prev_intent = turn_meta.usr_intent + if turn_meta.usr_intent not in scenario: + scenario.append(turn_meta.usr_intent) + + turn_meta_list.append(turn_meta) + prev_usr_turn, prev_usr_turn_meta = turn, turn_meta + + assert len(turn_meta_list) == len(dial["turns"]) + return turn_meta_list, scenario + + def linearise_turns(self): + # linearise necessary meterials + for turn_meta in self.turn_meta_list: + if turn_meta is None: + continue + turn_meta._linearise(self.scenario) + + +class TurnMetaData: + def __init__(self, prev_intent, sys_turn, usr_turn, dial_id): + self.dial_id = dial_id + self.sys_turn, self.usr_turn = sys_turn, usr_turn + self.empty_token = "_Empty_" + assert self.empty_token in SPECIAL_TOKENS["additional_special_tokens"] + + # intent + self.usr_intent, self.service = self._get_intent(usr_turn, prev_intent) + + # utterances + self.utt = {} + self.utt["sys"], self.utt["usr"] = self._get_utt(sys_turn), self._get_utt( + usr_turn + ) + + # action + self.act2sv = {} + self.act2sv["sys"], _ = self._parse_action(sys_turn) + self.act2sv["usr"], self.usr_constraints = self._parse_action(usr_turn) + + # task boundary + self._get_new_task_label(prev_intent) + + # req_alts + self._get_req_alts_label(self.act2sv["usr"]) + + def _get_intent(self, turn, prev_intent): + """manually set the `NONE` intent to the intent of previous turn""" + active_intent, service = get_turn_intent( + turn + ) # intent annotation (migt be `NONE`) + if active_intent == "NONE": + active_intent = prev_intent + return active_intent, service + + def _get_utt(self, turn): + if turn is None: + return "" + return turn["utterance"] + + def _parse_action(self, turn): + """ + parse action annotation to collect turn level information + 1) act to slot-value pairs, dict{act: {slot: value}} + 2) turn level constraints, dict{'informable': dict{slot: value}, 'requestable': set(slot)} + """ + # get mapping from act to slot-value pairs + act2sv = {} + info_req = {"informable": dict(), "requestable": set()} # constraints + + if turn is None: + return None, info_req + + for frame in turn["frames"]: + for action in frame["actions"]: + act, slot, values = action["act"], action["slot"], action["values"] + + # deal with empty slot or value + if turn["speaker"] == "USER": + 
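+                    # user-side actions are expected to carry at most one value per slot (asserted below)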
assert len(values) in [0, 1] + if slot == "": + slot = self.empty_token + value = values[0] if len(values) > 0 else self.empty_token + + # act to slot-value pairs + if act not in act2sv: + act2sv[act] = {} + assert slot not in act2sv[act] + act2sv[act][slot] = value + + # collect constraints + if slot in [ + "", + self.empty_token, + ]: # only act but no constraints, e.g., AFFIRM, NEGATE + continue + + # turn level informalable and requestable info + if act == "REQUEST": + assert slot != "" + info_req["requestable"].add(slot) + else: + if turn["speaker"] == "USER": + assert act in [ + "INFORM_INTENT", + "INFORM", + "SELECT", + ] # not apply to system side + if ( + act != "SELECT" + ): # result offered by system is part of initial user goal + assert slot not in info_req["informable"] + info_req["informable"][slot] = value + return act2sv, info_req + + def accumulate_constraints(self, new_turn_meta): + """ + Add slot, slot-value pairs from a given following turn + This function is used to form user goal by accumulating constraints backward + """ + # only accumulate constraints with the same task/intent + if new_turn_meta.usr_intent != self.usr_intent: + return + + if ( + new_turn_meta.goal_change + ): # if goal changes at a new turn, these constraints should not be put in previous turns + return + + # only accumulate constraints without goal change + # if the value of a slot is changed (goal change) in a new turn, + # this slot-value pair is not part of initial goal and should not be added into the goal of previous turns + new_constraints = new_turn_meta.usr_constraints + self.usr_constraints["requestable"] = self.usr_constraints["requestable"].union( + new_constraints["requestable"] + ) + for slot, value in new_constraints["informable"].items(): + if slot not in self.usr_constraints["informable"]: + self.usr_constraints["informable"][slot] = value + + def _get_new_task_label(self, prev_intent): + """get a binary label indicating if a turn starts a new task (intent) in dialogue""" + assert prev_intent != "NONE" and self.usr_intent != "NONE" + if self.usr_intent != prev_intent: + self.start_new_task = True + else: + self.start_new_task = False + + def _get_req_alts_label(self, act2sv): + """get a binary label indicating if usr requests alternatives""" + if "REQUEST_ALTS" in act2sv: + self.req_alts = True + else: + self.req_alts = False + + def get_goal_change_label(self, prev_usr_turn, prev_turn_meta): + """check if goal changed (value of slot changes) between two turn states""" + if prev_usr_turn is None: # first usr turn + self.goal_change = False + return + + if ( + len(self.usr_turn["frames"]) == 1 + and self.usr_turn["frames"][0]["state"]["active_intent"] == "NONE" + ): # `NONE` intent + self.goal_change = False + return + + if self.usr_intent != prev_turn_meta.usr_intent: # new task + self.goal_change = False + return + + assert prev_usr_turn["speaker"] == "USER" + prev_state_sv, curr_state_sv = None, None + for frame in prev_usr_turn["frames"]: + if frame["state"]["active_intent"] == self.usr_intent: + prev_state_sv = frame["state"]["slot_values"] + + # fix some weird cases (count very few, around 30 turns) + if prev_state_sv is None: + assert ( + len(prev_usr_turn["frames"]) == 1 + and prev_usr_turn["frames"][0]["state"]["active_intent"] == "NONE" + ) + prev_state_sv = prev_usr_turn["frames"][0]["state"]["slot_values"] + + for frame in self.usr_turn["frames"]: + if frame["state"]["active_intent"] == self.usr_intent: + curr_state_sv = frame["state"]["slot_values"] + + assert 
prev_state_sv is not None and curr_state_sv is not None + self.goal_change = compare_slot_values_in_state( + prev_state_sv, curr_state_sv + ) # True if goal changes + + def _linearise(self, scenario): + self.linear_act = {} + self.linear_act["sys"] = self._linearise_act(self.act2sv["sys"]) + self.linear_act["usr"] = self._linearise_act(self.act2sv["usr"]) + self.linear_goal = self._linearise_goal(self.usr_constraints, scenario) + + def _linearise_act(self, act2sv): + """ + NOTE: 1) split slot/value if "_"; 2) special tokens of acts; 3) empty slot or empty value + NOTE: filer too many values (e.g., 10 movie names) but make sure the one the user chose is present + + Return: ordered (slots sorted within act, acts sorted) linearised act sequence, + e.g., area Cambridge ... + e.g., _Empty_ _Empty_ + """ + res = "" + if act2sv is None: + return res + + for act in sorted(act2sv.keys()): # sort act + sv = act2sv[act] # dict{slot: value} + + act = "_{}_".format(act) # act is special token + assert act in SPECIAL_TOKENS["additional_special_tokens"] + act_wrap = wrap_element("ACT", act) + res = add_str(res, act_wrap) + + sorted_sv = dict2list(sv) # sorted sv list, [slot=value] + for sv_pair in sorted_sv: + slot, value = sv_pair.split("=") + slot, value = self._basic_normalise_slot( + slot + ), self._basic_normalise_value(value, slot) + + # slot + slot_wrap = wrap_element("SLOT", slot) + res = add_str(res, slot_wrap) + + # value + value_wrap = wrap_element("VALUE", value) + res = add_str(res, value_wrap) + return res[1:] # remove first space + + def _basic_normalise_value(self, value, slot): + # intent value + if slot == "intent": + value = split_intent(value) + return value + + # special token value + if value in ["True", "False"]: # Empty is already in the form of "_Empty_" + value = "_{}_".format(value) + assert value in SPECIAL_TOKENS["additional_special_tokens"] + return value + return value + + def _basic_normalise_slot(self, slot): + if slot not in SPECIAL_TOKENS["additional_special_tokens"]: + slot = slot.replace( + "_", " " + ) # e.g., `date_of_journey` -> `date of journey` + return slot + + def _linearise_goal(self, constraints, scenario): + """ + linearise goal representation which consists of several parts: + scenario, task (intent), task description, constraints with informable and requestable + e.g., task1 task2 .. + current task task description + slot1 value1 .. + slot1 slot2 .. 
+ """ + res = "" + # scenario + assert isinstance(scenario, list) and len(scenario) > 0 + scenario = " ".join( + [wrap_element("INTENT", split_intent(intent)) for intent in scenario] + ) + scenario_wrap = wrap_element("SCENARIO", scenario) + res = add_str(res, scenario_wrap) + + # task name + intent = split_intent(self.usr_intent) + assert intent in scenario + intent_wrap = wrap_element("TASK", intent) + res = add_str(res, intent_wrap) + + # task description + description = SERVICE2META[self.service]["intents"][self.usr_intent][ + "description" + ] + description_warp = wrap_element("DESC", description) + res = add_str(res, description_warp) + + # informable + informable = dict2list( + constraints["informable"] + ) # sorted sv pair list [slot=value] + res = add_str(res, "") + for sv_pair in informable: + slot, value = sv_pair.split("=") + slot, value = self._basic_normalise_slot(slot), self._basic_normalise_value( + value, slot + ) + # slot + slot_wrap = wrap_element("SLOT", slot) + res = add_str(res, slot_wrap) + # value + value_wrap = wrap_element("VALUE", value) + res = add_str(res, value_wrap) + res = add_str(res, "") + + # requestable + requestable = sorted( + list(constraints["requestable"]) + ) # sorted slot list [slot] + res = add_str(res, "") + for slot in requestable: + slot = self._basic_normalise_slot(slot) + slot_wrap = wrap_element("SLOT", slot) + res = add_str(res, slot_wrap) + res = add_str(res, "") + return res[1:] # remove first space + + +def collect_examples(dial_id, dial_meta, examples): + num = 0 + examples[dial_id] = {} + for turn_meta in dial_meta.turn_meta_list: + if turn_meta is None: # sys turn + continue + + example_id = "{}-{}".format(dial_id, num) + example = { + "utterances": turn_meta.utt, + "actions": turn_meta.linear_act, + "goal": turn_meta.linear_goal, + "service": turn_meta.service, + "intent": turn_meta.usr_intent, + "goal_change": turn_meta.goal_change, + "start_new_task": turn_meta.start_new_task, + "req_alts": turn_meta.req_alts, + } + examples[dial_id][example_id] = example + num += 1 + + +def prepare_data_seq(data, out_data_path): + for split in DATA_SPLIT: + examples = {} + for dial_num, dial_id in enumerate(tqdm(sorted(data[split].keys()))): + dial = data[split][dial_id] + dial_meta = DialMetaData(dial_id, dial) + collect_examples(dial_id, dial_meta, examples) + + with open("{}/{}.json".format(out_data_path, split), "w") as f: + json.dump(examples, f, sort_keys=True, indent=4) + print("Done process {} {} dialogues".format(split, len(examples))) + + +if __name__ == "__main__": + if len(sys.argv) == 1: + print("wrong arguments!") + print("usage: python utils/preprocess_sgd.py sgd-data-path") + sys.exit(1) + + # Set data path + data_path = sys.argv[1] + out_data_path = "./processed_data/sgd/" + os.makedirs(out_data_path, exist_ok=True) + + # Load data and material as global var + SERVICE2META, INTENTS, SLOTS = load_schema(data_path) + SPECIAL_TOKENS = get_special_tokens() + data = collect_data(data_path, remove_dial_switch=True) + + # Process data + prepare_data_seq(data, out_data_path) diff --git a/scripts/user_model_code/train.sh b/scripts/user_model_code/train.sh new file mode 100644 index 0000000000000000000000000000000000000000..922b069e358fe579fd43f315fe3a8525edc17293 --- /dev/null +++ b/scripts/user_model_code/train.sh @@ -0,0 +1,51 @@ +experiment=$1 + +# common setup +wandb_train_run_name="Full-user-model-training" +bs=16 # batch size for training +grad_step=2 # accumulated gradient steps +max_epoch=8 # max epoch for training 
+data_dir="./data/preprocessed/user_model" +train_size=-1 # number of examples used for training, -1 means all +eval_size=-1 # number of examples ued for evaluation, -1 means all + + + +if [[ "$experiment" == "SGD" ]]; then + echo "Conduct experiment with SGD dataset" + job_name='SGD-full' + data_list="sgd" # 165k training examples + eval_interval=50000 # evaluation interval + +elif [[ "$experiment" == "MultiWOZ" ]]; then + echo "Conduct experiment with MulwiWOZ dataset" + job_name='MultiWOZ-full' + data_list="multiwoz" # 56k training examples + eval_interval=20000 + +elif [[ "$experiment" == "Joint" ]]; then + echo "Conduct experiment with SGD + MulwiWOZ dataset" + job_name='Joint-full' + data_list="sgd multiwoz" # 221k training examples + eval_interval=70000 + +else + echo "Unrecognised argument" + exit +fi + +mkdir -p checkpoint log +checkpoint='checkpoint/'$job_name +log='log/'$job_name'.log' +python ./scripts/user_model_code/main_user_model.py --mode='training' \ + --wandb_train_run_name=$wandb_train_run_name \ + --model_name=$job_name \ + --checkpoint=$checkpoint \ + --data_dir=$data_dir \ + --data_list $data_list \ + --train_size=$train_size \ + --eval_size=$eval_size \ + --eval_interval=$eval_interval \ + --gradient_accumulation_steps=$grad_step \ + --train_batch_size=$bs \ + --max_epoch=$max_epoch diff --git a/src/crazyneuraluser.egg-info/PKG-INFO b/src/crazyneuraluser.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..773c581ad2fbd36df72e24a20c5dc118528a8744 --- /dev/null +++ b/src/crazyneuraluser.egg-info/PKG-INFO @@ -0,0 +1,173 @@ +Metadata-Version: 2.1 +Name: crazyneuraluser +Version: 0.0.post1.dev47+g049b138.d20220509 +Summary: Add a short description here! +Home-page: https://github.com/pyscaffold/pyscaffold/ +Author: Extended by Alistair McLeay, original code by Alexandru Coca +Author-email: am@alistairmcleay.com and alexcoca23@yahoo.co.uk +License: MIT +Project-URL: Documentation, https://pyscaffold.org/ +Platform: any +Classifier: Development Status :: 4 - Beta +Classifier: Programming Language :: Python +Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM +Provides-Extra: testing +License-File: LICENSE.txt +License-File: AUTHORS.md + +# Cambridge Masters Project +Joint Learning of Practical Dialogue Systems and User Simulators + +## Environment setup + +1. Create an environment `crazyneuraluser` with the help of [conda] + ``` + conda env create -f environment.yml + ``` +2. Activate the new environment with: + ``` + conda activate crazyneuraluser + ``` +3. Install a version of `pytorch` compatible with your hardware (see the [pytorch website](https://pytorch.org/get-started/previous-versions/)). E.g.: + ``` + pip install torch --extra-index-url https://download.pytorch.org/whl/cu113 + ``` + +4. Install `spacy` and download the tokenization tool in spacy: + ``` + pip install spacy' + python -m spacy download en_core_web_sm + ``` + +### Generating dialogues through agent-agent interaction + +To generate dialogues, first change working directory to the `baselines` directory. Run the command + ``` + python baselines_setup.py + ``` +to prepare `convlab2` for running the baselines. + +#### Generating dialogues conditioned on randomly sampled goals + +Select one of the available configurations in the `configs` directory and run the command + ``` + python simulate_agent_interaction.py --config /rel/path/to/chosen/config + ``` +to generate dialogues conditioned on randomly sampled goals according to the `convlab2` goal model. 
The dialogues will be saved automatically in the `models` directory, under a directory whose name depends on the configuration run. The `models` directory is located in the parent directory of the `baselines` directory. The `metadata.json` file saved with the dialogues contains information about the data generation process.
+
+#### Generating dialogues conditioned on `MultiWOZ2.1` goals
+
+To generate the entire corpus, simply pass the `--goals-path /path/to/multiwoz2.1/data.json/file` flag to `simulate_agent_interaction.py`. To generate the `test/val` split, additionally pass the `--filter-path /path/to/multiwoz2.1/test-or-valListFile` argument to `simulate_agent_interaction.py`. You can use the `generate_multiwoz21_train_id_file` function in `baselines/utils.py` to generate `trainListFile`, which can then be passed via the `--filter-path` argument to the dialogue generation script in order to generate dialogues conditioned on the `MultiWOZ2.1` training goals. A sketch of an example invocation is given below, after the Installation instructions.
+
+### Converting the generated dialogues to SGD-like format
+
+The `create_data_from_multiwoz.py` script can be used to convert the generated dialogues to SGD format, which is necessary for evaluation. It is based on the script provided by Google for DSTC8, but with additional functionality such as:
+
+ - conversion of slot names as annotated in the MultiWOZ 2.1 dialogue acts to different slot names, specified through the `--slots_convention` argument. Options are `multiwoz22`, which converts the slots to the same slots as defined in the MultiWOZ 2.2 dataset, whereas `multiwoz_goals` converts the slot names to the names used in the dialogue goal and state tracking annotations.
+
+ - addition of system and user `nlu` fields for every turn
+
+ - option to perform cleaning operations on the goals to ensure a standard format is received by the evaluator.
+
+The conversion is done according to the `schema.json` file in the `baselines` directory, which is the same as used by the `DSTC8` conversion except for the addition of the `police` domain. Type ``python create_data_from_multiwoz.py --helpfull`` to see a full list of flags and usage.
+
+## Installation
+
+The recommended way to use this repository is to develop the core code under `src/crazyneuraluser`. The experiments/exploratory analysis making use of the core package code should be placed outside the library and imported. See more guidance under the [Project Organisation](#project-organization) section below.
+
+To create an environment for the package, make sure you have deactivated all `conda` environments. Then:
+
+1. Create an environment `crazyneuraluser` with the help of [conda]:
+   ```
+   conda env create -f environment.yml
+   ```
+2. Add the developer dependencies to this environment with the help of [conda]:
+   ```
+   conda env update -f dev_environment.yml
+   ```
+
+Optional and needed only once after `git clone`:
+
+3. Install several [pre-commit] git hooks with:
+   ```bash
+   pre-commit install
+   # You _are encouraged_ to run `pre-commit autoupdate`
+   ```
+   and check out the configuration under `.pre-commit-config.yaml`.
+   The `-n, --no-verify` flag of `git commit` can be used to deactivate pre-commit hooks temporarily.
+
+4. Install [nbstripout] git hooks to remove the output cells of committed notebooks with:
+   ```bash
+   nbstripout --install --attributes notebooks/.gitattributes
+   ```
+   This is useful to avoid large diffs due to plots in your notebooks.
+   A simple `nbstripout --uninstall` will revert these changes.
+
+Then take a look into the `scripts` and `notebooks` folders.
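+For reference, an end-to-end invocation of the generation and conversion steps described earlier in this README might look as follows. This is only a sketch: the config name and all data paths are placeholders, and the remaining flags of `create_data_from_multiwoz.py` (e.g. input/output locations) are omitted; run `python create_data_from_multiwoz.py --helpfull` for the full flag list.
+
+```bash
+cd baselines
+
+# generate dialogues conditioned on the MultiWOZ 2.1 test-split goals
+python simulate_agent_interaction.py \
+    --config configs/agent_agent.yaml \
+    --goals-path /path/to/MultiWOZ_2.1/data.json \
+    --filter-path /path/to/MultiWOZ_2.1/testListFile
+
+# convert the generated dialogues to SGD-like format for evaluation,
+# mapping slot names to the MultiWOZ 2.2 convention
+python create_data_from_multiwoz.py --slots_convention multiwoz22
+```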
+ +## Dependency Management & Reproducibility + +1. Always keep your abstract (unpinned) dependencies updated in `environment.yml` and eventually + in `setup.cfg` if you want to ship and install your package via `pip` later on. +2. Create concrete dependencies as `environment.lock.yml` for the exact reproduction of your + environment with: + ```bash + conda env export -n crazyneuraluser -f environment.lock.yml + ``` + For multi-OS development, consider using `--no-builds` during the export. +3. Update your current environment with respect to a new `environment.lock.yml` using: + ```bash + conda env update -f environment.lock.yml --prune + ``` +## Project Organization + +``` +├── AUTHORS.md <- List of developers and maintainers. +├── CHANGELOG.md <- Changelog to keep track of new features and fixes. +├── LICENSE.txt <- License as chosen on the command-line. +├── README.md <- The top-level README for developers. +├── configs <- Directory for configurations of model & application. +├── data +│ ├── external <- Data from third party sources. +│ ├── interim <- Intermediate data that has been transformed. +│ ├── processed <- The final, canonical data sets for modeling. +│ └── raw <- The original, immutable data dump. +├── docs <- Directory for Sphinx documentation in rst or md. +├── environment.yml <- The conda environment file for reproducibility. +├── models <- Trained and serialized models, model predictions, +│ or model summaries. +├── notebooks <- Jupyter notebooks. Naming convention is a number (for +│ ordering), the creator's initials and a description, +│ e.g. `1.0-fw-initial-data-exploration`. +├── pyproject.toml <- Build system configuration. Do not change! +├── references <- Data dictionaries, manuals, and all other materials. +├── reports <- Generated analysis as HTML, PDF, LaTeX, etc. +│ └── figures <- Generated plots and figures for reports. +├── scripts <- Analysis and production scripts which import the +│ actual Python package, e.g. train_model.py. +├── setup.cfg <- Declarative configuration of your project. +├── setup.py <- Use `pip install -e .` to install for development or +| or create a distribution with `tox -e build`. +├── src +│ └── crazyneuraluser <- Actual Python package where the main functionality goes. +├── tests <- Unit tests which can be run with `py.test`. +├── .coveragerc <- Configuration for coverage reports of unit tests. +├── .isort.cfg <- Configuration for git hook that sorts imports. +└── .pre-commit-config.yaml <- Configuration of pre-commit git hooks. +``` + + + +## Note + +This project has been set up using [PyScaffold] 4.0.1 and the [dsproject extension] 0.6.1. 
+ +[conda]: https://docs.conda.io/ +[pre-commit]: https://pre-commit.com/ +[Jupyter]: https://jupyter.org/ +[nbstripout]: https://github.com/kynan/nbstripout +[Google style]: http://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings +[PyScaffold]: https://pyscaffold.org/ +[dsproject extension]: https://github.com/pyscaffold/pyscaffoldext-dsproject + + diff --git a/src/crazyneuraluser.egg-info/SOURCES.txt b/src/crazyneuraluser.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..ae755e948d12729a5886812a1ae63d4da58a8946 --- /dev/null +++ b/src/crazyneuraluser.egg-info/SOURCES.txt @@ -0,0 +1,76 @@ +.coveragerc +.gitignore +.isort.cfg +.pre-commit-config.yaml +.readthedocs.yml +AUTHORS.md +CHANGELOG.md +CONVLAB_README.md +LICENSE.txt +README.md +baselines_environment.lock.yml +baselines_environment.yml +dev_environment.yml +environment.yml +pyproject.toml +setup.cfg +setup.py +tox.ini +baselines/__init__.py +baselines/_preprocess_raw_canonical_map.py +baselines/baseline_setup.py +baselines/canonical_map.json +baselines/canonical_map.py +baselines/correct_categorical_state_values.tsv +baselines/create_data_from_multiwoz.py +baselines/create_dbleu_reference_map.py +baselines/goal_new_values.json +baselines/sanity_checks.py +baselines/schema.json +baselines/simulate_agent_interaction.py +baselines/simulate_corpus_interaction.py +baselines/system_models.py +baselines/user_models.py +baselines/utils.py +baselines/configs/agent_agent.yaml +configs/.gitignore +data/.gitignore +data/external/.gitignore +data/interim/.gitignore +data/preprocessed/.gitignore +data/raw/.gitignore +docs/Makefile +docs/authors.md +docs/changelog.md +docs/conf.py +docs/index.md +docs/license.rst +docs/readme.md +docs/requirements.txt +docs/_static/.gitignore +models/.gitignore +notebooks/1.0-ac-goals_consistency_check.ipynb +notebooks/template.ipynb +references/.gitignore +reports/figures/.gitignore +scripts/data_analysis.py +scripts/preprocess.py +scripts/preprocess2.1.py +scripts/template_train_model.py +scripts/train_ubar.py +src/crazyneuraluser/__init__.py +src/crazyneuraluser/clean_dataset.py +src/crazyneuraluser/config.py +src/crazyneuraluser/config21.py +src/crazyneuraluser/db_ops.py +src/crazyneuraluser/eval.py +src/crazyneuraluser/ontology.py +src/crazyneuraluser/reader.py +src/crazyneuraluser/utils.py +src/crazyneuraluser.egg-info/PKG-INFO +src/crazyneuraluser.egg-info/SOURCES.txt +src/crazyneuraluser.egg-info/dependency_links.txt +src/crazyneuraluser.egg-info/not-zip-safe +src/crazyneuraluser.egg-info/requires.txt +src/crazyneuraluser.egg-info/top_level.txt +tests/conftest.py \ No newline at end of file diff --git a/src/crazyneuraluser.egg-info/dependency_links.txt b/src/crazyneuraluser.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/src/crazyneuraluser.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/src/crazyneuraluser.egg-info/not-zip-safe b/src/crazyneuraluser.egg-info/not-zip-safe new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/src/crazyneuraluser.egg-info/not-zip-safe @@ -0,0 +1 @@ + diff --git a/src/crazyneuraluser.egg-info/requires.txt b/src/crazyneuraluser.egg-info/requires.txt new file mode 100644 index 0000000000000000000000000000000000000000..76d0937d5858b90bcdf751972eda6e6fd4ee3243 --- /dev/null +++ b/src/crazyneuraluser.egg-info/requires.txt @@ 
-0,0 +1,15 @@ +transformers==4.18.0 +tqdm==4.64.0 +wandb==0.12.16 +nltk==3.7 +sklearn==0.0 +tensorboard==2.9.0 +spacy==3.3.0 + +[:python_version < "3.8"] +importlib-metadata + +[testing] +setuptools +pytest +pytest-cov diff --git a/src/crazyneuraluser.egg-info/top_level.txt b/src/crazyneuraluser.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..23688a9c911232254dc98bb735cd279b9fb842ba --- /dev/null +++ b/src/crazyneuraluser.egg-info/top_level.txt @@ -0,0 +1 @@ +crazyneuraluser diff --git a/src/crazyneuraluser/UBAR_code/__init__.py b/src/crazyneuraluser/UBAR_code/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e451f103c4b557af9c3e33c60ada99aa3eb655c3 --- /dev/null +++ b/src/crazyneuraluser/UBAR_code/__init__.py @@ -0,0 +1,16 @@ +import sys + +if sys.version_info[:2] >= (3, 8): + # TODO: Import directly (no need for conditional) when `python_requires = >= 3.8` + from importlib.metadata import PackageNotFoundError, version # pragma: no cover +else: + from importlib_metadata import PackageNotFoundError, version # pragma: no cover + +try: + # Change here if project is renamed and does not equal the package name + dist_name = __name__ + __version__ = version(dist_name) +except PackageNotFoundError: # pragma: no cover + __version__ = "unknown" +finally: + del version, PackageNotFoundError diff --git a/src/crazyneuraluser/UBAR_code/clean_dataset.py b/src/crazyneuraluser/UBAR_code/clean_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..db57001f7f88c5b05e0926d82b54fed8c3cdf983 --- /dev/null +++ b/src/crazyneuraluser/UBAR_code/clean_dataset.py @@ -0,0 +1,334 @@ +# -*- coding: utf-8 -*- +import re + +from crazyneuraluser.UBAR_code import ontology + + +def my_clean_text(text): + text = re.sub(r"([a-zT]+)\.([a-z])", r"\1 . \2", text) # 'abc.xyz' -> 'abc . xyz' + text = re.sub(r"(\w+)\.\.? ", r"\1 . ", text) # if 'abc. ' -> 'abc . ' + return text + + +def clean_text(text): + text = text.strip() + text = text.lower() + text = text.replace("’", "'") + text = text.replace("‘", "'") + text = text.replace(";", ",") + text = text.replace('"', " ") + text = text.replace("/", " and ") + text = text.replace("don't", "do n't") + text = clean_time(text) + baddata = { + r"c\.b (\d), (\d) ([a-z])\.([a-z])": r"cb\1\2\3\4", + "c.b. 1 7 d.y": "cb17dy", + "c.b.1 7 d.y": "cb17dy", + "c.b 25, 9 a.q": "cb259aq", + "isc.b 25, 9 a.q": "is cb259aq", + "c.b2, 1 u.f": "cb21uf", + "c.b 1,2 q.a": "cb12qa", + "0-122-336-5664": "01223365664", + "postcodecb21rs": "postcode cb21rs", + r"i\.d": "id", + " i d ": "id", + "Telephone:01223358966": "Telephone: 01223358966", + "depature": "departure", + "depearting": "departing", + "-type": " type", + r"b[\s]?&[\s]?b": "bed and breakfast", + "b and b": "bed and breakfast", + r"guesthouse[s]?": "guest house", + r"swimmingpool[s]?": "swimming pool", + "wo n't": "will not", + " 'd ": " would ", + " 'm ": " am ", + " 're' ": " are ", + " 'll' ": " will ", + " 've ": " have ", + r"^\'": "", + r"\'$": "", + } + for tmpl, good in baddata.items(): + text = re.sub(tmpl, good, text) + + text = re.sub(r"([a-zT]+)\.([a-z])", r"\1 . \2", text) # 'abc.xyz' -> 'abc . xyz' + text = re.sub(r"(\w+)\.\.? ", r"\1 . ", text) # if 'abc. ' -> 'abc . 
' + + with open("data/raw/UBAR/multi-woz/mapping.pair", "r") as fin: + for line in fin.readlines(): + fromx, tox = line.replace("\n", "").split("\t") + text = " " + text + " " + text = text.replace(" " + fromx + " ", " " + tox + " ")[1:-1] + + return text + + +def clean_time(utter): + utter = re.sub( + r"(\d+) ([ap]\.?m)", lambda x: x.group(1) + x.group(2), utter + ) # 9 am -> 9am + utter = re.sub(r"((?3"} + else: + nummap = {0: "0", 1: "1-5", 2: "6-10", 3: ">10"} + if vector[:4] == [0, 0, 0, 0]: + report = "" + else: + num = vector.index(1) + report = domain + ": " + nummap[num] + "; " + + if vector[-2] == 0 and vector[-1] == 1: + report += "booking: ok" + if vector[-2] == 1 and vector[-1] == 0: + report += "booking: unable" + + return report + + def queryJsons(self, domain, constraints, exactly_match=True, return_name=False): + """Returns the list of entities for a given domain + based on the annotation of the belief state + constraints: dict e.g. {'pricerange': 'cheap', 'area': 'west'} + """ + # query the db + if domain == "taxi": + return [ + { + "taxi_colors": random.choice(self.dbs[domain][0]["taxi_colors"]), + "taxi_types": random.choice(self.dbs[domain][0]["taxi_types"]), + "taxi_phone": "".join(random.choices(string.digits, k=10)), + } + ] + if domain == "police": + return self.dbs["police"] + if domain == "hospital": + if constraints.get("department"): + for entry in self.dbs["hospital"]: + if entry.get("department") == constraints.get("department"): + return [entry] + else: + # Instead of returning an empty list which breaks lexicalisation, when is no department constraint, + # return the first entry from the hospital db so the user still gets hospital information. + return [self.dbs["hospital"][0]] + + valid_cons = False + for v in constraints.values(): + if v not in ["not mentioned", ""]: + valid_cons = True + if not valid_cons: + return [] + + match_result = [] + + if "name" in constraints: + for db_ent in self.dbs[domain]: + if "name" in db_ent: + cons = constraints["name"] + dbn = db_ent["name"] + if cons == dbn: + db_ent = db_ent if not return_name else db_ent["name"] + match_result.append(db_ent) + return match_result + + for db_ent in self.dbs[domain]: + match = True + for s, v in constraints.items(): + if s == "name": + continue + if ( + s in ["people", "stay"] + or (domain == "hotel" and s == "day") + or (domain == "restaurant" and s in ["day", "time"]) + ): + continue + + skip_case = { + "don't care": 1, + "do n't care": 1, + "dont care": 1, + "not mentioned": 1, + "dontcare": 1, + "": 1, + } + if skip_case.get(v): + continue + + if s not in db_ent: + # logging.warning('Searching warning: slot %s not in %s db'%(s, domain)) + match = False + break + + # v = 'guesthouse' if v == 'guest house' else v + # v = 'swimmingpool' if v == 'swimming pool' else v + v = "yes" if v == "free" else v + + if s in ["arrive", "leave"]: + try: + h, m = v.split(":") # raise error if time value is not xx:xx format + v = int(h) * 60 + int(m) + except Exception: + match = False + break + time = int(db_ent[s].split(":")[0]) * 60 + int(db_ent[s].split(":")[1]) + if s == "arrive" and v > time: + match = False + if s == "leave" and v < time: + match = False + else: + if exactly_match and v != db_ent[s]: + match = False + break + elif v not in db_ent[s]: + match = False + break + + if match: + match_result.append(db_ent) + + if not return_name: + return match_result + else: + if domain == "train": + match_result = [e["id"] for e in match_result] + else: + match_result = [e["name"] for e in 
match_result] + return match_result + + def querySQL(self, domain, constraints): + if not self.sql_dbs: + for dom in db_domains: + db = "db/{}-dbase.db".format(dom) + conn = sqlite3.connect(db) + c = conn.cursor() + self.sql_dbs[dom] = c + + sql_query = "select * from {}".format(domain) + + flag = True + for key, val in constraints.items(): + if ( + val == "" + or val == "dontcare" + or val == "not mentioned" + or val == "don't care" + or val == "dont care" + or val == "do n't care" + ): + pass + else: + if flag: + sql_query += " where " + val2 = val.replace("'", "''") + # val2 = normalize(val2) + if key == "leaveAt": + sql_query += r" " + key + " > " + r"'" + val2 + r"'" + elif key == "arriveBy": + sql_query += r" " + key + " < " + r"'" + val2 + r"'" + else: + sql_query += r" " + key + "=" + r"'" + val2 + r"'" + flag = False + else: + val2 = val.replace("'", "''") + # val2 = normalize(val2) + if key == "leaveAt": + sql_query += r" and " + key + " > " + r"'" + val2 + r"'" + elif key == "arriveBy": + sql_query += r" and " + key + " < " + r"'" + val2 + r"'" + else: + sql_query += r" and " + key + "=" + r"'" + val2 + r"'" + + try: # "select * from attraction where name = 'queens college'" + print(sql_query) + return self.sql_dbs[domain].execute(sql_query).fetchall() + except Exception: + return [] # TODO test it + + +if __name__ == "__main__": + dbPATHs = { + "attraction": "db/attraction_db_processed.json", + "hospital": "db/hospital_db_processed.json", + "hotel": "db/hotel_db_processed.json", + "police": "db/police_db_processed.json", + "restaurant": "db/restaurant_db_processed.json", + "taxi": "db/taxi_db_processed.json", + "train": "db/train_db_processed.json", + } + db = MultiWozDB(dbPATHs) + while True: + constraints = {} + inp = input("input belief state in fomat: domain-slot1=value1;slot2=value2...\n") + domain, cons = inp.split("-") + for sv in cons.split(";"): + s, v = sv.split("=") + constraints[s] = v + # res = db.querySQL(domain, constraints) + res = db.queryJsons(domain, constraints, return_name=True) + report = [] + reidx = { + "hotel": 8, + "restaurant": 6, + "attraction": 5, + "train": 1, + } + # for ent in res: + # if reidx.get(domain): + # report.append(ent[reidx[domain]]) + # for ent in res: + # if 'name' in ent: + # report.append(ent['name']) + # if 'trainid' in ent: + # report.append(ent['trainid']) + print(constraints) + print(res) + print("count:", len(res), "\nnames:", report) diff --git a/src/crazyneuraluser/UBAR_code/eval.py b/src/crazyneuraluser/UBAR_code/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..fb312d6d3ce5c05eb07a49e3532f5984bceefd02 --- /dev/null +++ b/src/crazyneuraluser/UBAR_code/eval.py @@ -0,0 +1,932 @@ +import copy +import logging +import math +from collections import Counter, OrderedDict + +from nltk.util import ngrams + +from crazyneuraluser.UBAR_code import ontology +from crazyneuraluser.UBAR_code.clean_dataset import clean_slot_values +from crazyneuraluser.UBAR_code.config import global_config as cfg + + +class BLEUScorer(object): + # BLEU score calculator via GentScorer interface + # it calculates the BLEU-4 by taking the entire corpus in + # Calulate based multiple candidates against multiple references + def __init__(self): + pass + + def score(self, parallel_corpus): + + # containers + count = [0, 0, 0, 0] + clip_count = [0, 0, 0, 0] + r = 0 + c = 0 + weights = [0.25, 0.25, 0.25, 0.25] + + # accumulate ngram statistics + for hyps, refs in parallel_corpus: + hyps = [hyp.split() for hyp in hyps] + refs = [ref.split() 
for ref in refs] + for hyp in hyps: + + for i in range(4): + # accumulate ngram counts + hypcnts = Counter(ngrams(hyp, i + 1)) + cnt = sum(hypcnts.values()) + count[i] += cnt + + # compute clipped counts + max_counts = {} + for ref in refs: + refcnts = Counter(ngrams(ref, i + 1)) + for ng in hypcnts: + max_counts[ng] = max(max_counts.get(ng, 0), refcnts[ng]) + clipcnt = dict( + (ng, min(count, max_counts[ng])) + for ng, count in hypcnts.items() + ) + clip_count[i] += sum(clipcnt.values()) + + # accumulate r & c + bestmatch = [1000, 1000] + for ref in refs: + if bestmatch[0] == 0: + break + diff = abs(len(ref) - len(hyp)) + if diff < bestmatch[0]: + bestmatch[0] = diff + bestmatch[1] = len(ref) + r += bestmatch[1] + c += len(hyp) + + # computing bleu score + p0 = 1e-7 + bp = 1 if c > r else math.exp(1 - float(r) / float(c)) + p_ns = [float(clip_count[i]) / float(count[i] + p0) + p0 for i in range(4)] + s = math.fsum(w * math.log(p_n) for w, p_n in zip(weights, p_ns) if p_n) + bleu = bp * math.exp(s) + return bleu * 100 + + +class MultiWozEvaluator(object): + def __init__(self, reader): + self.reader = reader + self.domains = ontology.all_domains + self.domain_files = self.reader.domain_files + self.all_data = self.reader.data + self.test_data = self.reader.test + + self.bleu_scorer = BLEUScorer() + + self.all_info_slot = [] + for d, s_list in ontology.informable_slots.items(): + for s in s_list: + self.all_info_slot.append(d + "-" + s) + + # only evaluate these slots for dialog success + self.requestables = ["phone", "address", "postcode", "reference", "id"] + + def pack_dial(self, data): + dials = {} + for turn in data: + dial_id = turn["dial_id"] + if dial_id not in dials: + dials[dial_id] = [] + dials[dial_id].append(turn) + return dials + + def run_metrics(self, data): + if "all" in cfg.exp_domains: + metric_results = [] + metric_result = self._get_metric_results(data) + metric_results.append(metric_result) + + if cfg.eval_per_domain: + # all domain experiments, sub domain evaluation + domains = [d + "_single" for d in ontology.all_domains] + domains = domains + [ + "restaurant_train", + "restaurant_hotel", + "restaurant_attraction", + "hotel_train", + "hotel_attraction", + "attraction_train", + "restaurant_hotel_taxi", + "restaurant_attraction_taxi", + "hotel_attraction_taxi", + ] + for domain in domains: + file_list = self.domain_files.get(domain, []) + if not file_list: + print("No sub domain [%s]" % domain) + metric_result = self._get_metric_results(data, domain, file_list) + if metric_result: + metric_results.append(metric_result) + + else: + # sub domain experiments + metric_results = [] + for domain, file_list in self.domain_files.items(): + if domain not in cfg.exp_domains: + continue + metric_result = self._get_metric_results(data, domain, file_list) + if metric_result: + metric_results.append(metric_result) + + return metric_results + + def validation_metric(self, data): + bleu = self.bleu_metric(data) + # accu_single_dom, accu_multi_dom, multi_dom_num = self.domain_eval(data) + success, match, req_offer_counts, dial_num = self.context_to_response_eval( + data, same_eval_as_cambridge=cfg.same_eval_as_cambridge + ) + return bleu, success, match + + def _get_metric_results(self, data, domain="all", file_list=None): + metric_result = {"domain": domain} + bleu = self.bleu_metric(data, file_list) + if cfg.bspn_mode == "bspn" or cfg.enable_dst: + ( + jg, + slot_f1, + slot_acc, + slot_cnt, + slot_corr, + ) = self.dialog_state_tracking_eval(data, file_list) + jg_nn, sf1_nn, sac_nn, _, 
_ = self.dialog_state_tracking_eval( + data, file_list, no_name=True, no_book=False + ) + jg_nb, sf1_nb, sac_nb, _, _ = self.dialog_state_tracking_eval( + data, file_list, no_name=False, no_book=True + ) + jg_nnnb, sf1_nnnb, sac_nnnb, _, _ = self.dialog_state_tracking_eval( + data, file_list, no_name=True, no_book=True + ) + metric_result.update( + {"joint_goal": jg, "slot_acc": slot_acc, "slot_f1": slot_f1} + ) + if cfg.bspn_mode == "bsdx": + ( + jg_, + slot_f1_, + slot_acc_, + slot_cnt, + slot_corr, + ) = self.dialog_state_tracking_eval(data, file_list, bspn_mode="bsdx") + jg_nn_, sf1_nn_, sac_nn_, _, _ = self.dialog_state_tracking_eval( + data, file_list, bspn_mode="bsdx", no_name=True, no_book=False + ) + metric_result.update( + { + "joint_goal_delex": jg_, + "slot_acc_delex": slot_acc_, + "slot_f1_delex": slot_f1_, + } + ) + + info_slots_acc = {} + for slot in slot_cnt: + correct = slot_corr.get(slot, 0) + info_slots_acc[slot] = correct / slot_cnt[slot] * 100 + info_slots_acc = OrderedDict(sorted(info_slots_acc.items(), key=lambda x: x[1])) + + act_f1 = self.aspn_eval(data, file_list) + avg_act_num, avg_diverse_score = self.multi_act_eval(data, file_list) + accu_single_dom, accu_multi_dom, multi_dom_num = self.domain_eval( + data, file_list + ) + + success, match, req_offer_counts, dial_num = self.context_to_response_eval( + data, file_list, same_eval_as_cambridge=cfg.same_eval_as_cambridge + ) + req_slots_acc = {} + for req in self.requestables: + acc = req_offer_counts[req + "_offer"] / ( + req_offer_counts[req + "_total"] + 1e-10 + ) + req_slots_acc[req] = acc * 100 + req_slots_acc = OrderedDict(sorted(req_slots_acc.items(), key=lambda x: x[1])) + + if dial_num: + metric_result.update( + { + "act_f1": act_f1, + "success": success, + "match": match, + "bleu": bleu, + "req_slots_acc": req_slots_acc, + "info_slots_acc": info_slots_acc, + "dial_num": dial_num, + "accu_single_dom": accu_single_dom, + "accu_multi_dom": accu_multi_dom, + "avg_act_num": avg_act_num, + "avg_diverse_score": avg_diverse_score, + } + ) + if domain == "all": + logging.info( + "-------------------------- All DOMAINS --------------------------" + ) + else: + logging.info( + "-------------------------- %s (# %d) -------------------------- " + % (domain.upper(), dial_num) + ) + if cfg.bspn_mode == "bspn" or cfg.enable_dst: + logging.info( + "[DST] joint goal:%2.1f slot acc: %2.1f slot f1: %2.1f act f1: %2.1f" + % (jg, slot_acc, slot_f1, act_f1) + ) + logging.info( + "[DST] [not eval name slots] joint goal:%2.1f slot acc: %2.1f slot f1: %2.1f" + % (jg_nn, sac_nn, sf1_nn) + ) + logging.info( + "[DST] [not eval book slots] joint goal:%2.1f slot acc: %2.1f slot f1: %2.1f" + % (jg_nb, sac_nb, sf1_nb) + ) + logging.info( + "[DST] [not eval name & book slots] joint goal:%2.1f slot acc: %2.1f slot f1: %2.1f" + % (jg_nnnb, sac_nnnb, sf1_nnnb) + ) + if cfg.bspn_mode == "bsdx": + logging.info( + "[BDX] joint goal:%2.1f slot acc: %2.1f slot f1: %2.1f act f1: %2.1f" + % (jg_, slot_acc_, slot_f1_, act_f1) + ) + logging.info( + "[BDX] [not eval name slots] joint goal:%2.1f slot acc: %2.1f slot f1: %2.1f" + % (jg_nn_, sac_nn_, sf1_nn_) + ) + logging.info( + "[CTR] match: %2.1f success: %2.1f bleu: %2.1f" + % (match, success, bleu) + ) + logging.info( + "[CTR] " + + "; ".join( + ["%s: %2.1f" % (req, acc) for req, acc in req_slots_acc.items()] + ) + ) + logging.info( + "[DOM] accuracy: single %2.1f / multi: %2.1f (%d)" + % (accu_single_dom, accu_multi_dom, multi_dom_num) + ) + if self.reader.multi_acts_record is not None: + 
logging.info( + "[MA] avg acts num %2.1f avg slots num: %2.1f " + % (avg_act_num, avg_diverse_score) + ) + return metric_result + else: + return None + + def bleu_metric(self, data, eval_dial_list=None): + gen, truth = [], [] + for row in data: + if eval_dial_list and row["dial_id"] + ".json" not in eval_dial_list: + continue + gen.append(row["resp_gen"]) + truth.append(row["resp"]) + wrap_generated = [[_] for _ in gen] + wrap_truth = [[_] for _ in truth] + if gen and truth: + sc = self.bleu_scorer.score(zip(wrap_generated, wrap_truth)) + else: + sc = 0.0 + return sc + + def value_similar(self, a, b): + return True if a == b else False + + # the value equal condition used in "Sequicity" is too loose + if ( + a in b + or b in a + or a.split()[0] == b.split()[0] + or a.split()[-1] == b.split()[-1] + ): + return True + return False + + def _bspn_to_dict(self, bspn, no_name=False, no_book=False, bspn_mode="bspn"): + constraint_dict = self.reader.bspan_to_constraint_dict( + bspn, bspn_mode=bspn_mode + ) + constraint_dict_flat = {} + for domain, cons in constraint_dict.items(): + for s, v in cons.items(): + key = domain + "-" + s + if no_name and s == "name": + continue + if no_book: + if s in ["people", "stay"] or key in [ + "hotel-day", + "restaurant-day", + "restaurant-time", + ]: + continue + constraint_dict_flat[key] = v + return constraint_dict_flat + + def _constraint_compare( + self, truth_cons, gen_cons, slot_appear_num=None, slot_correct_num=None + ): + tp, fp, fn = 0, 0, 0 + false_slot = [] + for slot in gen_cons: + v_gen = gen_cons[slot] + if slot in truth_cons and self.value_similar( + v_gen, truth_cons[slot] + ): # v_truth = truth_cons[slot] + tp += 1 + if slot_correct_num is not None: + slot_correct_num[slot] = ( + 1 + if not slot_correct_num.get(slot) + else slot_correct_num.get(slot) + 1 + ) + else: + fp += 1 + false_slot.append(slot) + for slot in truth_cons: + v_truth = truth_cons[slot] + if slot_appear_num is not None: + slot_appear_num[slot] = ( + 1 + if not slot_appear_num.get(slot) + else slot_appear_num.get(slot) + 1 + ) + if slot not in gen_cons or not self.value_similar(v_truth, gen_cons[slot]): + fn += 1 + false_slot.append(slot) + acc = len(self.all_info_slot) - fp - fn + return tp, fp, fn, acc, list(set(false_slot)) + + def domain_eval(self, data, eval_dial_list=None): + dials = self.pack_dial(data) + corr_single, total_single, corr_multi, total_multi = 0, 0, 0, 0 + + dial_num = 0 + for dial_id in dials: + if eval_dial_list and dial_id + ".json" not in eval_dial_list: + continue + dial_num += 1 + dial = dials[dial_id] + wrong_pred = [] + + prev_constraint_dict = {} + prev_turn_domain = ["general"] + + for turn_num, turn in enumerate(dial): + if turn_num == 0: + continue + true_domains = self.reader.dspan_to_domain(turn["dspn"]) + if cfg.enable_dspn: + pred_domains = self.reader.dspan_to_domain(turn["dspn_gen"]) + else: + turn_dom_bs = [] + if ( + cfg.enable_bspn + and not cfg.use_true_bspn_for_ctr_eval + and (cfg.bspn_mode == "bspn" or cfg.enable_dst) + ): + constraint_dict = self.reader.bspan_to_constraint_dict( + turn["bspn_gen"] + ) + else: + constraint_dict = self.reader.bspan_to_constraint_dict( + turn["bspn"] + ) + for domain in constraint_dict: + if domain not in prev_constraint_dict: + turn_dom_bs.append(domain) + elif prev_constraint_dict[domain] != constraint_dict[domain]: + turn_dom_bs.append(domain) + aspn = "aspn" if not cfg.enable_aspn else "aspn_gen" + turn_dom_da = [] + for a in turn[aspn].split(): + if a[1:-1] in ontology.all_domains + ["general"]: + 
turn_dom_da.append(a[1:-1]) + + # get turn domain + turn_domain = turn_dom_bs + for dom in turn_dom_da: + if dom != "booking" and dom not in turn_domain: + turn_domain.append(dom) + if not turn_domain: + turn_domain = prev_turn_domain + if len(turn_domain) == 2 and "general" in turn_domain: + turn_domain.remove("general") + if len(turn_domain) == 2: + if ( + len(prev_turn_domain) == 1 + and prev_turn_domain[0] == turn_domain[1] + ): + turn_domain = turn_domain[::-1] + prev_turn_domain = copy.deepcopy(turn_domain) + prev_constraint_dict = copy.deepcopy(constraint_dict) + + turn["dspn_gen"] = " ".join(["[" + d + "]" for d in turn_domain]) + pred_domains = {} + for d in turn_domain: + pred_domains["[" + d + "]"] = 1 + + if len(true_domains) == 1: + total_single += 1 + if pred_domains == true_domains: + corr_single += 1 + else: + wrong_pred.append(str(turn["turn_num"])) + turn["wrong_domain"] = "x" + else: + total_multi += 1 + if pred_domains == true_domains: + corr_multi += 1 + else: + wrong_pred.append(str(turn["turn_num"])) + turn["wrong_domain"] = "x" + + # dialog inform metric record + dial[0]["wrong_domain"] = " ".join(wrong_pred) + accu_single = corr_single / (total_single + 1e-10) + accu_multi = corr_multi / (total_multi + 1e-10) + return accu_single * 100, accu_multi * 100, total_multi + + def dialog_state_tracking_eval( + self, data, eval_dial_list=None, bspn_mode="bspn", no_name=False, no_book=False + ): + dials = self.pack_dial(data) + total_turn, joint_match, total_tp, total_fp, total_fn, total_acc = ( + 0, + 0, + 0, + 0, + 0, + 0, + ) + slot_appear_num, slot_correct_num = {}, {} + dial_num = 0 + for dial_id in dials: + if eval_dial_list and dial_id + ".json" not in eval_dial_list: + continue + dial_num += 1 + dial = dials[dial_id] + missed_jg_turn_id = [] + for turn_num, turn in enumerate(dial): + if turn_num == 0: + continue + gen_cons = self._bspn_to_dict( + turn[bspn_mode + "_gen"], + no_name=no_name, + no_book=no_book, + bspn_mode=bspn_mode, + ) + truth_cons = self._bspn_to_dict( + turn[bspn_mode], + no_name=no_name, + no_book=no_book, + bspn_mode=bspn_mode, + ) + + if truth_cons == gen_cons: + joint_match += 1 + else: + missed_jg_turn_id.append(str(turn["turn_num"])) + + if eval_dial_list is None: + tp, fp, fn, acc, false_slots = self._constraint_compare( + truth_cons, gen_cons, slot_appear_num, slot_correct_num + ) + else: + tp, fp, fn, acc, false_slots = self._constraint_compare( + truth_cons, + gen_cons, + ) + + total_tp += tp + total_fp += fp + total_fn += fn + total_acc += acc + total_turn += 1 + if not no_name and not no_book: + turn["wrong_inform"] = "; ".join( + false_slots + ) # turn inform metric record + + # dialog inform metric record + if not no_name and not no_book: + dial[0]["wrong_inform"] = " ".join(missed_jg_turn_id) + + precision = total_tp / (total_tp + total_fp + 1e-10) + recall = total_tp / (total_tp + total_fn + 1e-10) + f1 = 2 * precision * recall / (precision + recall + 1e-10) * 100 + accuracy = total_acc / (total_turn * len(self.all_info_slot) + 1e-10) * 100 + joint_goal = joint_match / (total_turn + 1e-10) * 100 + + return joint_goal, f1, accuracy, slot_appear_num, slot_correct_num + + def aspn_eval(self, data, eval_dial_list=None): + def _get_tp_fp_fn(label_list, pred_list): + tp = len([t for t in pred_list if t in label_list]) + fp = max(0, len(pred_list) - tp) + fn = max(0, len(label_list) - tp) + return tp, fp, fn + + dials = self.pack_dial(data) + total_tp, total_fp, total_fn = 0, 0, 0 + + dial_num = 0 + for dial_id in dials: + if 
eval_dial_list and dial_id + ".json" not in eval_dial_list: + continue + dial_num += 1 + dial = dials[dial_id] + wrong_act = [] + for turn_num, turn in enumerate(dial): + if turn_num == 0: + continue + if cfg.same_eval_act_f1_as_hdsa: + pred_acts, true_acts = {}, {} + for t in turn["aspn_gen"]: + pred_acts[t] = 1 + for t in turn["aspn"]: + true_acts[t] = 1 + tp, fp, fn = _get_tp_fp_fn(true_acts, pred_acts) + else: + pred_acts = self.reader.aspan_to_act_list(turn["aspn_gen"]) + true_acts = self.reader.aspan_to_act_list(turn["aspn"]) + tp, fp, fn = _get_tp_fp_fn(true_acts, pred_acts) + if fp + fn != 0: + wrong_act.append(str(turn["turn_num"])) + turn["wrong_act"] = "x" + + total_tp += tp + total_fp += fp + total_fn += fn + + dial[0]["wrong_act"] = " ".join(wrong_act) + precision = total_tp / (total_tp + total_fp + 1e-10) + recall = total_tp / (total_tp + total_fn + 1e-10) + f1 = 2 * precision * recall / (precision + recall + 1e-10) + + return f1 * 100 + + def multi_act_eval(self, data, eval_dial_list=None): + + dials = self.pack_dial(data) + total_act_num, total_slot_num = 0, 0 + + dial_num = 0 + turn_count = 0 + for dial_id in dials: + if eval_dial_list and dial_id + ".json" not in eval_dial_list: + continue + dial_num += 1 + dial = dials[dial_id] + for turn_num, turn in enumerate(dial): + if turn_num == 0: + continue + target = ( + turn["multi_act_gen"] + if self.reader.multi_acts_record is not None + else turn["aspn_gen"] + ) + + # diversity + act_collect, slot_collect = {}, {} + act_type_collect = {} + slot_score = 0 + for act_str in target.split(" | "): + pred_acts = self.reader.aspan_to_act_list(act_str) + act_type = "" + for act in pred_acts: + d, a, s = act.split("-") + if d + "-" + a not in act_collect: + act_collect[d + "-" + a] = {s: 1} + slot_score += 1 + act_type += d + "-" + a + ";" + elif s not in act_collect: + act_collect[d + "-" + a][s] = 1 + slot_score += 1 + slot_collect[s] = 1 + act_type_collect[act_type] = 1 + total_act_num += len(act_collect) + total_slot_num += len(slot_collect) + turn_count += 1 + + total_act_num = total_act_num / (float(turn_count) + 1e-10) + total_slot_num = total_slot_num / (float(turn_count) + 1e-10) + return total_act_num, total_slot_num + + def context_to_response_eval( + self, data, eval_dial_list=None, same_eval_as_cambridge=False + ): + dials = self.pack_dial(data) + counts = {} + for req in self.requestables: + counts[req + "_total"] = 0 + counts[req + "_offer"] = 0 + + dial_num, successes, matches = 0, 0, 0 + + for dial_id in dials: + if eval_dial_list and dial_id + ".json" not in eval_dial_list: + continue + dial = dials[dial_id] + reqs = {} + goal = {} + if ".json" not in dial_id and ".json" in list(self.all_data.keys())[0]: + dial_id = dial_id + ".json" + for domain in ontology.all_domains: + if self.all_data[dial_id]["goal"].get(domain): + true_goal = self.all_data[dial_id]["goal"] + goal = self._parseGoal(goal, true_goal, domain) + # print(goal) + for domain in goal.keys(): + reqs[domain] = goal[domain]["requestable"] + + # print('\n',dial_id) + success, match, stats, counts = self._evaluateGeneratedDialogue( + dial, goal, reqs, counts, same_eval_as_cambridge=same_eval_as_cambridge + ) + + successes += success + matches += match + dial_num += 1 + + # for domain in gen_stats.keys(): + # gen_stats[domain][0] += stats[domain][0] + # gen_stats[domain][1] += stats[domain][1] + # gen_stats[domain][2] += stats[domain][2] + + # if 'SNG' in filename: + # for domain in gen_stats.keys(): + # sng_gen_stats[domain][0] += stats[domain][0] + # 
sng_gen_stats[domain][1] += stats[domain][1] + # sng_gen_stats[domain][2] += stats[domain][2] + + # self.logger.info(report) + succ_rate = successes / (float(dial_num) + 1e-10) * 100 + match_rate = matches / (float(dial_num) + 1e-10) * 100 + return succ_rate, match_rate, counts, dial_num + + def _evaluateGeneratedDialogue( + self, + dialog, + goal, + real_requestables, + counts, + soft_acc=False, + same_eval_as_cambridge=False, + ): + """Evaluates the dialogue created by the model. + First we load the user goal of the dialogue, then for each turn + generated by the system we look for key-words. + For the Inform rate we look whether the entity was proposed. + For the Success rate we look for requestables slots""" + # for computing corpus success 'id' + requestables = self.requestables + + # CHECK IF MATCH HAPPENED + provided_requestables = {} + venue_offered = {} + domains_in_goal = [] + bspans = {} + + for domain in goal.keys(): + venue_offered[domain] = [] + provided_requestables[domain] = [] + domains_in_goal.append(domain) + + for t, turn in enumerate(dialog): + if t == 0: + continue + sent_t = turn["resp_gen"] + # sent_t = turn['resp'] + for domain in goal.keys(): + # for computing success + if same_eval_as_cambridge: + # [restaurant_name], [hotel_name] instead of [value_name] + if cfg.use_true_domain_for_ctr_eval: + dom_pred = [d[1:-1] for d in turn["dspn"].split()] + else: + dom_pred = [d[1:-1] for d in turn["dspn_gen"].split()] + # else: + # raise NotImplementedError('Just use true domain label') + if domain not in dom_pred: # fail + continue + if "[value_name]" in sent_t or "[value_id]" in sent_t: + if domain in ["restaurant", "hotel", "attraction", "train"]: + # HERE YOU CAN PUT YOUR BELIEF STATE ESTIMATION + if ( + not cfg.use_true_curr_bspn + and not cfg.use_true_bspn_for_ctr_eval + ): + bspn = turn["bspn_gen"] + else: + bspn = turn["bspn"] + # bspn = turn['bspn'] + + constraint_dict = self.reader.bspan_to_constraint_dict(bspn) + if constraint_dict.get(domain): + venues = self.reader.db.queryJsons( + domain, constraint_dict[domain], return_name=True + ) + else: + venues = [] + + # if venue has changed + if len(venue_offered[domain]) == 0 and venues: + # venue_offered[domain] = random.sample(venues, 1) + venue_offered[domain] = venues + bspans[domain] = constraint_dict[domain] + else: + # flag = False + # for ven in venues: + # if venue_offered[domain][0] == ven: + # flag = True + # break + # if not flag and venues: + flag = False + for ven in venues: + if ven not in venue_offered[domain]: + # if ven not in venue_offered[domain]: + flag = True + break + # if flag and venues: + if ( + flag and venues + ): # sometimes there are no results so sample won't work + # print venues + # venue_offered[domain] = random.sample(venues, 1) + venue_offered[domain] = venues + bspans[domain] = constraint_dict[domain] + else: # not limited so we can provide one + venue_offered[domain] = "[value_name]" + + # ATTENTION: assumption here - we didn't provide phone or address twice! etc + for requestable in requestables: + if requestable == "reference": + if "[value_reference]" in sent_t: + if ( + "booked" in turn["pointer"] or "ok" in turn["pointer"] + ): # if pointer was allowing for that? 
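+                                # Only credit the "reference" requestable when the booking
+                                # pointer shows an actual booking ("booked"/"ok"); a
+                                # [value_reference] emitted without a completed booking is
+                                # not counted towards success.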
+ provided_requestables[domain].append("reference") + # provided_requestables[domain].append('reference') + else: + if "[value_" + requestable + "]" in sent_t: + provided_requestables[domain].append(requestable) + + # if name was given in the task + for domain in goal.keys(): + # if name was provided for the user, the match is being done automatically + if "name" in goal[domain]["informable"]: + venue_offered[domain] = "[value_name]" + + # special domains - entity does not need to be provided + if domain in ["taxi", "police", "hospital"]: + venue_offered[domain] = "[value_name]" + + if domain == "train": + if ( + not venue_offered[domain] + and "id" not in goal[domain]["requestable"] + ): + venue_offered[domain] = "[value_name]" + + """ + Given all inform and requestable slots + we go through each domain from the user goal + and check whether right entity was provided and + all requestable slots were given to the user. + The dialogue is successful if that's the case for all domains. + """ + # HARD EVAL + stats = { + "restaurant": [0, 0, 0], + "hotel": [0, 0, 0], + "attraction": [0, 0, 0], + "train": [0, 0, 0], + "taxi": [0, 0, 0], + "hospital": [0, 0, 0], + "police": [0, 0, 0], + } + + match = 0 + success = 0 + # MATCH + for domain in goal.keys(): + match_stat = 0 + if domain in ["restaurant", "hotel", "attraction", "train"]: + goal_venues = self.reader.db.queryJsons( + domain, goal[domain]["informable"], return_name=True + ) + if ( + type(venue_offered[domain]) is str + and "_name" in venue_offered[domain] + ): + match += 1 + match_stat = 1 + elif ( + len(venue_offered[domain]) > 0 + and len(set(venue_offered[domain]) & set(goal_venues)) > 0 + ): + match += 1 + match_stat = 1 + else: + if "_name]" in venue_offered[domain]: + match += 1 + match_stat = 1 + + stats[domain][0] = match_stat + stats[domain][2] = 1 + + if soft_acc: + match = float(match) / len(goal.keys()) + else: + if match == len(goal.keys()): + match = 1.0 + else: + match = 0.0 + + for domain in domains_in_goal: + for request in real_requestables[domain]: + counts[request + "_total"] += 1 + if request in provided_requestables[domain]: + counts[request + "_offer"] += 1 + + # SUCCESS + if match == 1.0: + for domain in domains_in_goal: + success_stat = 0 + domain_success = 0 + if len(real_requestables[domain]) == 0: + success += 1 + success_stat = 1 + stats[domain][1] = success_stat + continue + # if values in sentences are super set of requestables + # for request in set(provided_requestables[domain]): + # if request in real_requestables[domain]: + # domain_success += 1 + for request in real_requestables[domain]: + if request in provided_requestables[domain]: + domain_success += 1 + + # if domain_success >= len(real_requestables[domain]): + if domain_success == len(real_requestables[domain]): + success += 1 + success_stat = 1 + + stats[domain][1] = success_stat + + # final eval + if soft_acc: + success = float(success) / len(real_requestables) + else: + if success >= len(real_requestables): + success = 1 + else: + success = 0 + + return success, match, stats, counts + + def _parseGoal(self, goal, true_goal, domain): + """Parses user goal into dictionary format.""" + goal[domain] = {} + goal[domain] = {"informable": {}, "requestable": [], "booking": []} + if "info" in true_goal[domain]: + if domain == "train": + # we consider dialogues only where train had to be booked! 
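+                # For train goals, "reference" is only requestable when the goal
+                # contains a booking, and "id" only when it is explicitly listed
+                # under "reqt"; other train info slots are not counted for success.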
+ if "book" in true_goal[domain]: + goal[domain]["requestable"].append("reference") + if "reqt" in true_goal[domain]: + if "id" in true_goal[domain]["reqt"]: + goal[domain]["requestable"].append("id") + else: + if "reqt" in true_goal[domain]: + for s in true_goal[domain]["reqt"]: # addtional requests: + if s in ["phone", "address", "postcode", "reference", "id"]: + # ones that can be easily delexicalized + goal[domain]["requestable"].append(s) + if "book" in true_goal[domain]: + goal[domain]["requestable"].append("reference") + + for s, v in true_goal[domain]["info"].items(): + s_, v_ = clean_slot_values(domain, s, v) + if len(v_.split()) > 1: + v_ = " ".join([token.text for token in self.reader.nlp(v_)]).strip() + goal[domain]["informable"][s_] = v_ + + if "book" in true_goal[domain]: + goal[domain]["booking"] = true_goal[domain]["book"] + return goal + + +if __name__ == "__main__": + pass diff --git a/src/crazyneuraluser/UBAR_code/ontology.py b/src/crazyneuraluser/UBAR_code/ontology.py new file mode 100644 index 0000000000000000000000000000000000000000..d391b98ce69c453bf360bb6810f461e7a36fc0d8 --- /dev/null +++ b/src/crazyneuraluser/UBAR_code/ontology.py @@ -0,0 +1,328 @@ +all_domains = [ + "restaurant", + "hotel", + "attraction", + "train", + "taxi", + "police", + "hospital", +] +db_domains = ["restaurant", "hotel", "attraction", "train"] + +# original slot names in goals (including booking slots) +# requestable_slots_in_goals = { +# "taxi": ["car type", "phone"], +# "police": ["postcode", "address", "phone"], +# "hospital": ["address", "phone", "postcode"], +# "hotel": ["address", "postcode", "internet", "phone", "parking", +# "type", "pricerange", "stars", "area", "reference"], +# "attraction": ["entrance fee", "type", "address", "postcode", "phone", "area", "reference"], +# "train": ["duration", "leaveat", "price", "arriveby", "id", "reference"], +# "restaurant": ["phone", "postcode", "address", "pricerange", "food", "area", "reference"] +# } + +# informable_slots_in_goals = { +# "taxi": ["leaveat", "destination", "departure", "arriveby"], +# "police": [], +# "hospital": ["department"], +# "hotel": ["type", "parking", "pricerange", "internet", "stay", "day", "people", "area", "stars", "name"], +# "attraction": ["area", "type", "name"], +# "train": ["destination", "day", "arriveby", "departure", "people", "leaveat"], +# "restaurant": ["food", "pricerange", "area", "name", "time", "day", "people"] +# } + +normlize_slot_names = { + "car type": "car", + "entrance fee": "price", + "duration": "time", + "leaveat": "leave", + "arriveby": "arrive", + "trainid": "id", +} + +requestable_slots = { + "taxi": ["car", "phone"], + "police": ["postcode", "address", "phone"], + "hospital": ["address", "phone", "postcode"], + "hotel": [ + "address", + "postcode", + "internet", + "phone", + "parking", + "type", + "pricerange", + "stars", + "area", + "reference", + ], + "attraction": [ + "price", + "type", + "address", + "postcode", + "phone", + "area", + "reference", + ], + "train": ["time", "leave", "price", "arrive", "id", "reference"], + "restaurant": [ + "phone", + "postcode", + "address", + "pricerange", + "food", + "area", + "reference", + ], +} +all_reqslot = [ + "car", + "address", + "postcode", + "phone", + "internet", + "parking", + "type", + "pricerange", + "food", + "stars", + "area", + "reference", + "time", + "leave", + "price", + "arrive", + "id", +] +# count: 17 + +informable_slots = { + "taxi": ["leave", "destination", "departure", "arrive"], + "police": [], + "hospital": 
["department"], + "hotel": [ + "type", + "parking", + "pricerange", + "internet", + "stay", + "day", + "people", + "area", + "stars", + "name", + ], + "attraction": ["area", "type", "name"], + "train": ["destination", "day", "arrive", "departure", "people", "leave"], + "restaurant": ["food", "pricerange", "area", "name", "time", "day", "people"], +} +all_infslot = [ + "type", + "parking", + "pricerange", + "internet", + "stay", + "day", + "people", + "area", + "stars", + "name", + "leave", + "destination", + "departure", + "arrive", + "department", + "food", + "time", +] +# count: 17 + +all_slots = all_reqslot + [ + "stay", + "day", + "people", + "name", + "destination", + "departure", + "department", +] +get_slot = {} +for s in all_slots: + get_slot[s] = 1 +# count: 24 + + +# mapping slots in dialogue act to original goal slot names +da_abbr_to_slot_name = { + "addr": "address", + "fee": "price", + "post": "postcode", + "ref": "reference", + "ticket": "price", + "depart": "departure", + "dest": "destination", +} + +# slot merging: not used currently +# slot_name_to_value_token = { +# 'entrance fee': 'price', +# 'pricerange': 'price', +# 'arrive': 'time', +# 'leave': 'time', +# 'departure': 'name', +# 'destination': 'name', +# 'stay': 'count', +# 'people': 'count', +# 'stars': 'count', +# } +# dialog_act_dom = ['restaurant', 'hotel', 'attraction', 'train', 'taxi', 'police', 'hospital', 'general', 'booking'] +dialog_acts = { + "restaurant": [ + "inform", + "request", + "nooffer", + "recommend", + "select", + "offerbook", + "offerbooked", + "nobook", + ], + "hotel": [ + "inform", + "request", + "nooffer", + "recommend", + "select", + "offerbook", + "offerbooked", + "nobook", + ], + "attraction": ["inform", "request", "nooffer", "recommend", "select"], + "train": ["inform", "request", "nooffer", "offerbook", "offerbooked", "select"], + "taxi": ["inform", "request"], + "police": ["inform", "request"], + "hospital": ["inform", "request"], + # 'booking': ['book', 'inform', 'nobook', 'request'], + "general": ["bye", "greet", "reqmore", "welcome"], +} +all_acts = [] +for acts in dialog_acts.values(): + for act in acts: + if act not in all_acts: + all_acts.append(act) +# print(all_acts) + +dialog_act_params = { + "inform": all_slots + ["choice", "open"], + "request": all_infslot + ["choice", "price"], + "nooffer": all_slots + ["choice"], + "recommend": all_reqslot + ["choice", "open"], + "select": all_slots + ["choice"], + # 'book': ['time', 'people', 'stay', 'reference', 'day', 'name', 'choice'], + "nobook": ["time", "people", "stay", "reference", "day", "name", "choice"], + "offerbook": all_slots + ["choice"], + "offerbooked": all_slots + ["choice"], + "reqmore": [], + "welcome": [], + "bye": [], + "greet": [], +} + +# dialog_acts = ['inform', 'request', 'nooffer', 'recommend', 'select', 'book', 'nobook', 'offerbook', 'offerbooked', +# 'reqmore', 'welcome', 'bye', 'greet'] # thank +dialog_act_all_slots = all_slots + ["choice", "open"] +# act_span_vocab = ['['+i+']' for i in dialog_act_dom] + ['['+i+']' for i in dialog_acts] + all_slots + +# value_token_in_resp = ['address', 'name', 'phone', 'postcode', 'area', 'food', 'pricerange', 'id', +# 'department', 'place', 'day', 'count', 'car'] +# count: 12 + + +# special slot tokens in belief span +# no need of this, just covert slot to [slot] e.g. 
pricerange -> [pricerange] +slot_name_to_slot_token = {} + + +# special slot tokens in responses +# not use at the momoent +slot_name_to_value_token = { + # 'entrance fee': '[value_price]', + # 'pricerange': '[value_price]', + # 'arriveby': '[value_time]', + # 'leaveat': '[value_time]', + # 'departure': '[value_place]', + # 'destination': '[value_place]', + # 'stay': 'count', + # 'people': 'count' +} + + +db_tokens = [ + "", + "", + "[db_nores]", + "[db_0]", + "[db_1]", + "[db_2]", + "[db_3]", +] + +special_tokens = [ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", +] + db_tokens + +eos_tokens = { + "user": "", + "user_delex": "", + "resp": "", + "resp_gen": "", + "pv_resp": "", + "bspn": "", + "bspn_gen": "", + "pv_bspn": "", + "bsdx": "", + "bsdx_gen": "", + "pv_bsdx": "", + "aspn": "", + "aspn_gen": "", + "pv_aspn": "", + "dspn": "", + "dspn_gen": "", + "pv_dspn": "", +} + +sos_tokens = { + "user": "", + "user_delex": "", + "resp": "", + "resp_gen": "", + "pv_resp": "", + "bspn": "", + "bspn_gen": "", + "pv_bspn": "", + "bsdx": "", + "bsdx_gen": "", + "pv_bsdx": "", + "aspn": "", + "aspn_gen": "", + "pv_aspn": "", + "dspn": "", + "dspn_gen": "", + "pv_dspn": "", +} diff --git a/src/crazyneuraluser/UBAR_code/reader.py b/src/crazyneuraluser/UBAR_code/reader.py new file mode 100644 index 0000000000000000000000000000000000000000..fc33c244007a91383b22276659db3d47766e1ee7 --- /dev/null +++ b/src/crazyneuraluser/UBAR_code/reader.py @@ -0,0 +1,1262 @@ +import csv +import json +import logging +import os +import random +from collections import OrderedDict +from copy import deepcopy + +import numpy as np +import spacy +from transformers import GPT2Tokenizer + +from crazyneuraluser.UBAR_code import ontology, utils +from crazyneuraluser.UBAR_code.config import global_config as cfg +from crazyneuraluser.UBAR_code.db_ops import MultiWozDB + +# from config21 import global_config as cfg + + +class _ReaderBase(object): + def __init__(self): + self.train, self.dev, self.test = [], [], [] + self.vocab = None + self.db = None + self.set_stats = {} + + def _bucket_by_turn(self, encoded_data): + turn_bucket = {} + for dial in encoded_data: + turn_len = len(dial) + if turn_len not in turn_bucket: + turn_bucket[turn_len] = [] + turn_bucket[turn_len].append(dial) + del_l = [] + for k in turn_bucket: + if k >= 5: + del_l.append(k) + logging.debug("bucket %d instance %d" % (k, len(turn_bucket[k]))) + # for k in del_l: + # turn_bucket.pop(k) + return OrderedDict(sorted(turn_bucket.items(), key=lambda i: i[0])) + + def _construct_mini_batch(self, data): + all_batches = [] + batch = [] + for dial in data: + batch.append(dial) + if len(batch) == cfg.batch_size: + # print('batch size: %d, batch num +1'%(len(batch))) + all_batches.append(batch) + batch = [] + # if remainder > 1/2 batch_size, just put them in the previous batch, otherwise form a new batch + # print('last batch size: %d, batch num +1'%(len(batch))) + if (len(batch) % len(cfg.cuda_device)) != 0: + batch = batch[: -(len(batch) % len(cfg.cuda_device))] + if len(batch) > 0.5 * cfg.batch_size: + all_batches.append(batch) + elif len(all_batches): + all_batches[-1].extend(batch) + else: + all_batches.append(batch) + return all_batches + + def transpose_batch(self, batch): + dial_batch = [] + turn_num = len(batch[0]) + for turn in range(turn_num): + turn_l = {} + for dial in batch: + this_turn = dial[turn] + for k in this_turn: + if k not in turn_l: + turn_l[k] = [] + turn_l[k].append(this_turn[k]) + 
dial_batch.append(turn_l) + return dial_batch + + def inverse_transpose_turn(self, turn_list): + """ + eval, one dialog at a time + """ + dialogs = {} + turn_num = len(turn_list) + dial_id = turn_list[0]["dial_id"] + dialogs[dial_id] = [] + for turn_idx in range(turn_num): + dial_turn = {} + turn = turn_list[turn_idx] + for key, value in turn.items(): + if key == "dial_id": + continue + if key == "pointer" and self.db is not None: + turn_domain = turn["turn_domain"][-1] + value = self.db.pointerBack(value, turn_domain) + dial_turn[key] = value + dialogs[dial_id].append(dial_turn) + return dialogs + + def inverse_transpose_batch(self, turn_batch_list): + """ + :param turn_batch_list: list of transpose dial batch + """ + dialogs = {} + total_turn_num = len(turn_batch_list) + # initialize + for idx_in_batch, dial_id in enumerate(turn_batch_list[0]["dial_id"]): + dialogs[dial_id] = [] + for turn_n in range(total_turn_num): + dial_turn = {} + turn_batch = turn_batch_list[turn_n] + for key, v_list in turn_batch.items(): + if key == "dial_id": + continue + value = v_list[idx_in_batch] + if key == "pointer" and self.db is not None: + turn_domain = turn_batch["turn_domain"][idx_in_batch][-1] + value = self.db.pointerBack(value, turn_domain) + dial_turn[key] = value + dialogs[dial_id].append(dial_turn) + return dialogs + + def get_eval_data(self, set_name="dev"): + name_to_set = {"train": self.train, "test": self.test, "dev": self.dev} + dial = name_to_set[set_name] + + if set_name not in self.set_stats: + self.set_stats[set_name] = {} + num_turns = 0 + num_dials = len(dial) + for d in dial: + num_turns += len(d) + + self.set_stats[set_name]["num_turns"] = num_turns + self.set_stats[set_name]["num_dials"] = num_dials + + return dial + + def get_batches(self, set_name): + """ + compute dataset stats. 
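+        Dialogues are bucketed by turn count, each bucket is split into mini-batches of
+        cfg.batch_size, and per-set statistics (num_turns, num_dials,
+        num_training_steps_per_epoch) are recorded; batches are shuffled when
+        set_name == "train".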
+ """ + global dia_count + log_str = "" + name_to_set = {"train": self.train, "test": self.test, "dev": self.dev} + dial = name_to_set[set_name] + if cfg.low_resource and set_name == "train": + # dial = random.sample(dial, int(len(dial)*0.01)) + dial = random.sample(dial, 100) + logging.info("Low Resource setting, finetuning size: {}".format(len(dial))) + turn_bucket = self._bucket_by_turn(dial) + # self._shuffle_turn_bucket(turn_bucket) + all_batches = [] + + if set_name not in self.set_stats: + self.set_stats[set_name] = {} + num_training_steps = 0 + num_turns = 0 + num_dials = 0 + + for k in turn_bucket: + if set_name != "test" and k == 1 or k >= 17: + continue + batches = self._construct_mini_batch(turn_bucket[k]) + log_str += "turn num:%d, dial num: %d, batch num: %d last batch len: %d\n" % ( + k, + len(turn_bucket[k]), + len(batches), + len(batches[-1]), + ) + # print("turn num:%d, dial num:v%d, batch num: %d, "%(k, len(turn_bucket[k]), len(batches))) + num_training_steps += k * len(batches) + num_turns += k * len(turn_bucket[k]) + num_dials += len(turn_bucket[k]) + all_batches += batches + log_str += "total batch num: %d\n" % len(all_batches) + # print('total batch num: %d'%len(all_batches)) + # print('dialog count: %d'%dia_count) + # return all_batches + + # log stats + # logging.info(log_str) + # cfg.num_training_steps = num_training_steps * cfg.epoch_num + self.set_stats[set_name]["num_training_steps_per_epoch"] = num_training_steps + self.set_stats[set_name]["num_turns"] = num_turns + self.set_stats[set_name]["num_dials"] = num_dials + + if set_name == "train": + random.shuffle(all_batches) + return all_batches + + def get_nontranspose_data_iterator(self, all_batches): + for i, batch in enumerate(all_batches): + yield batch + + def get_data_iterator(self, all_batches): + for i, batch in enumerate(all_batches): + yield self.transpose_batch(batch) + + def save_result(self, write_mode, results, field, write_title=False): + with open(cfg.result_path, write_mode) as rf: + if write_title: + rf.write(write_title + "\n") + writer = csv.DictWriter(rf, fieldnames=field) + writer.writeheader() + writer.writerows(results) + return None + + def save_result_report(self, results): + # if 'joint_goal' in results[0]: + # with open(cfg.result_path[:-4] + '_report_dst.txt', 'w') as rf: + # rf.write('joint goal\tslot_acc\tslot_f1\tact_f1\n') + # for res in results: + # a,b,c,d = res['joint_goal'], res['slot_acc'], res['slot_f1'], res['act_f1'] + # rf.write('%2.1f\t%2.1f\t%2.1f\t%2.1f\n'%(a,b,c,d)) + # elif 'joint_goal_delex' in results[0]: + # with open(cfg.result_path[:-4] + '_report_bsdx.txt', 'w') as rf: + # rf.write('joint goal\tslot_acc\tslot_f1\tact_f1\n') + # for res in results: + # a,b,c,d = res['joint_goal_delex'], res['slot_acc_delex'], res['slot_f1_delex'], res['act_f1'] + # rf.write('%2.1f\t%2.1f\t%2.1f\t%2.1f\n'%(a,b,c,d)) + ctr_save_path = cfg.result_path[:-4] + "_report_ctr%s.csv" % cfg.seed + write_title = False if os.path.exists(ctr_save_path) else True + if cfg.aspn_decode_mode == "greedy": + setting = "" + elif cfg.aspn_decode_mode == "beam": + setting = "width=%s" % str(cfg.beam_width) + if cfg.beam_diverse_param > 0: + setting += ", penalty=%s" % str(cfg.beam_diverse_param) + elif cfg.aspn_decode_mode == "topk_sampling": + setting = "topk=%s" % str(cfg.topk_num) + elif cfg.aspn_decode_mode == "nucleur_sampling": + setting = "p=%s" % str(cfg.nucleur_p) + res = { + "exp": cfg.eval_load_path, + "true_bspn": cfg.use_true_curr_bspn, + "true_aspn": cfg.use_true_curr_aspn, + "decode": 
cfg.aspn_decode_mode, + "param": setting, + "nbest": cfg.nbest, + "selection_sheme": cfg.act_selection_scheme, + "match": results[0]["match"], + "success": results[0]["success"], + "bleu": results[0]["bleu"], + "act_f1": results[0]["act_f1"], + "avg_act_num": results[0]["avg_act_num"], + "avg_diverse": results[0]["avg_diverse_score"], + } + with open(ctr_save_path, "a") as rf: + writer = csv.DictWriter(rf, fieldnames=list(res.keys())) + if write_title: + writer.writeheader() + writer.writerows([res]) + + +class MultiWozReader(_ReaderBase): + def __init__(self, tokenizer): + super().__init__() + self.nlp = spacy.load("en_core_web_sm") + + self.db = MultiWozDB(cfg.dbs) + self.vocab_size = self._build_vocab() + + # self.tokenizer = GPT2Tokenizer.from_pretrained(cfg.gpt_path) # add special tokens later + self.tokenizer = tokenizer + if cfg.mode == "train": + self.add_sepcial_tokens() + + self.domain_files = json.loads(open(cfg.domain_file_path, "r").read()) + self.slot_value_set = json.loads(open(cfg.slot_value_set_path, "r").read()) + if cfg.multi_acts_training: + self.multi_acts = json.loads(open(cfg.multi_acts_path, "r").read()) + + test_list = [test_list.strip().lower() for test_list in open(cfg.test_list, "r").readlines()] + dev_list = [dev_list.strip().lower() for dev_list in open(cfg.dev_list, "r").readlines()] + self.dev_files, self.test_files = {}, {} + for fn in test_list: + self.test_files[fn.replace(".json", "")] = 1 + for fn in dev_list: + self.dev_files[fn.replace(".json", "")] = 1 + + # for domain expanse aka. Cross domain + self.exp_files = {} + # if 'all' not in cfg.exp_domains: + # for domain in cfg.exp_domains: + # fn_list = self.domain_files.get(domain) + # if not fn_list: + # raise ValueError( + # '[%s] is an invalid experiment setting' % domain) + # for fn in fn_list: + # self.exp_files[fn.replace('.json', '')] = 1 + all_domains_list = list(self.domain_files.keys()) + if "all" not in cfg.exp_domains: + domains = self.get_exp_domains(cfg.exp_domains, all_domains_list) + logging.info(domains) + for domain in domains: + fn_list = self.domain_files.get(domain) + if not fn_list: + raise ValueError("[%s] is an invalid experiment setting" % domain) + for fn in fn_list: + self.exp_files[fn.replace(".json", "")] = 1 + # + + self._load_data() + + if cfg.limit_bspn_vocab: + self.bspn_masks = self._construct_bspn_constraint() + if cfg.limit_aspn_vocab: + self.aspn_masks = self._construct_aspn_constraint() + + self.multi_acts_record = None + + def get_exp_domains(self, exp_domains, all_domains_list): + if "hotel" in exp_domains: + if "except" in exp_domains: + # ['except', 'hotel'] + domains = [d for d in all_domains_list if "hotel" not in d and "multi" not in d] + else: + # ['hotel'] + domains = ["hotel_single", "hotel_multi"] + if "train" in exp_domains: + if "except" in exp_domains: + # ['except', 'train'] + domains = [d for d in all_domains_list if "train" not in d and "multi" not in d] + else: + # ['train'] + domains = ["train_single", "train_multi"] + if "attraction" in exp_domains: + if "except" in exp_domains: + # ['except', 'attraction'] + domains = [d for d in all_domains_list if "attraction" not in d and "multi" not in d] + else: + # ['attraction'] + domains = ["attraction_single", "attraction_multi"] + if "restaurant" in exp_domains: + if "except" in exp_domains: + # ['except', 'restaurant'] + domains = [d for d in all_domains_list if "restaurant" not in d and "multi" not in d] + else: + # ['restaurant'] + domains = ["restaurant_single", "restaurant_multi"] + if "taxi" in 
exp_domains: + if "except" in exp_domains: + # ['except', 'taxi'] + domains = [d for d in all_domains_list if "taxi" not in d and "multi" not in d] + else: + # ['taxi'] + domains = ["taxi_single", "taxi_multi"] + return domains + + def add_sepcial_tokens(self): + """ + add special tokens to gpt tokenizer + serves a similar role of Vocab.construt() + make a dict of special tokens + """ + special_tokens = [] + for word in ontology.all_domains + ["general"]: + word = "[" + word + "]" + special_tokens.append(word) + for word in ontology.all_acts: + word = "[" + word + "]" + special_tokens.append(word) + # for word in ontology.all_slots: + # to be determine whether slot should be [slot] + # if slot, tokenizer having trouble decoding. + # special_tokens.append(word) + for word in self.vocab._word2idx.keys(): + if word.startswith("[value_") and word.endswith("]"): + special_tokens.append(word) + special_tokens.extend(ontology.special_tokens) + + special_tokens_dict = {"additional_special_tokens": special_tokens} + self.tokenizer.add_special_tokens(special_tokens_dict) + logging.info("Added special tokens to gpt tokenizer.") + + cfg.pad_id = self.tokenizer.encode("")[0] + + def _build_vocab(self): + self.vocab = utils.Vocab(cfg.vocab_size) + vp = cfg.vocab_path_train if cfg.mode == "train" or cfg.vocab_path_eval is None else cfg.vocab_path_eval + # vp = cfg.vocab_path+'.json.freq.json' + self.vocab.load_vocab(vp) + return self.vocab.vocab_size + + def _construct_bspn_constraint(self): + bspn_masks = {} + valid_domains = [ + "restaurant", + "hotel", + "attraction", + "train", + "taxi", + "hospital", + ] + all_dom_codes = [self.vocab.encode("[" + d + "]") for d in valid_domains] + all_slot_codes = [self.vocab.encode(s) for s in ontology.all_slots] + bspn_masks[self.vocab.encode("")] = all_dom_codes + [ + self.vocab.encode(""), + 0, + ] + bspn_masks[self.vocab.encode("")] = [self.vocab.encode("")] + bspn_masks[self.vocab.encode("")] = [self.vocab.encode("")] + for domain, slot_values in self.slot_value_set.items(): + if domain == "police": + continue + dom_code = self.vocab.encode("[" + domain + "]") + bspn_masks[dom_code] = [] + for slot, values in slot_values.items(): + slot_code = self.vocab.encode(slot) + if slot_code not in bspn_masks: + bspn_masks[slot_code] = [] + if slot_code not in bspn_masks[dom_code]: + bspn_masks[dom_code].append(slot_code) + for value in values: + for idx, v in enumerate(value.split()): + if not self.vocab.has_word(v): + continue + v_code = self.vocab.encode(v) + if v_code not in bspn_masks: + # print(self.vocab._word2idx) + bspn_masks[v_code] = [] + if idx == 0 and v_code not in bspn_masks[slot_code]: + bspn_masks[slot_code].append(v_code) + if idx == (len(value.split()) - 1): + for w in all_dom_codes + all_slot_codes: + if self.vocab.encode("") not in bspn_masks[v_code]: + bspn_masks[v_code].append(self.vocab.encode("")) + if w not in bspn_masks[v_code]: + bspn_masks[v_code].append(w) + break + if not self.vocab.has_word(value.split()[idx + 1]): + continue + next_v_code = self.vocab.encode(value.split()[idx + 1]) + if next_v_code not in bspn_masks[v_code]: + bspn_masks[v_code].append(next_v_code) + bspn_masks[self.vocab.encode("")] = list(bspn_masks.keys()) + + with open("data/processed/multi-woz-processed/bspn_masks.txt", "w") as f: + for i, j in bspn_masks.items(): + f.write(self.vocab.decode(i) + ": " + " ".join([self.vocab.decode(int(m)) for m in j]) + "\n") + return bspn_masks + + def _construct_aspn_constraint(self): + aspn_masks = {} + aspn_masks = {} + 
all_dom_codes = [self.vocab.encode("[" + d + "]") for d in ontology.dialog_acts.keys()] + all_act_codes = [self.vocab.encode("[" + a + "]") for a in ontology.dialog_act_params] + all_slot_codes = [self.vocab.encode(s) for s in ontology.dialog_act_all_slots] + aspn_masks[self.vocab.encode("")] = all_dom_codes + [ + self.vocab.encode(""), + 0, + ] + aspn_masks[self.vocab.encode("")] = [self.vocab.encode("")] + aspn_masks[self.vocab.encode("")] = [self.vocab.encode("")] + # for d in all_dom_codes: + # aspn_masks[d] = all_act_codes + for a in all_act_codes: + aspn_masks[a] = all_dom_codes + all_slot_codes + [self.vocab.encode("")] + for domain, acts in ontology.dialog_acts.items(): + dom_code = self.vocab.encode("[" + domain + "]") + aspn_masks[dom_code] = [] + for a in acts: + act_code = self.vocab.encode("[" + a + "]") + if act_code not in aspn_masks[dom_code]: + aspn_masks[dom_code].append(act_code) + # for a, slots in ontology.dialog_act_params.items(): + # act_code = self.vocab.encode('['+a+']') + # slot_codes = [self.vocab.encode(s) for s in slots] + # aspn_masks[act_code] = all_dom_codes + slot_codes + [self.vocab.encode('')] + for s in all_slot_codes: + aspn_masks[s] = all_dom_codes + all_slot_codes + [self.vocab.encode("")] + aspn_masks[self.vocab.encode("")] = list(aspn_masks.keys()) + + with open("processed/multi-woz-processed/aspn_masks.txt", "w") as f: + for i, j in aspn_masks.items(): + f.write(self.vocab.decode(i) + ": " + " ".join([self.vocab.decode(int(m)) for m in j]) + "\n") + return aspn_masks + + def _load_data(self, save_temp=True): + """ + load processed data and encode, or load already encoded data + """ + if save_temp: # save encoded data + if "all" in cfg.exp_domains: + encoded_file = os.path.join(cfg.data_path, "new_db_se_blank_encoded.data.json") + # encoded: no sos, se_encoded: sos and eos + # db: add db results every turn + else: + xdomain_dir = "./models/UBAR/experiments_Xdomain/data" + if not os.path.exists(xdomain_dir): + os.makedirs(xdomain_dir) + encoded_file = os.path.join( + xdomain_dir, + "{}-encoded.data.json".format("-".join(cfg.exp_domains)), + ) + + if os.path.exists(encoded_file): + logging.info("Reading encoded data from {}".format(encoded_file)) + self.data = json.loads(open(cfg.data_path + cfg.data_file, "r", encoding="utf-8").read().lower()) + encoded_data = json.loads(open(encoded_file, "r", encoding="utf-8").read()) + self.train = encoded_data["train"] + self.dev = encoded_data["dev"] + self.test = encoded_data["test"] + else: + logging.info("Encoding data now and save the encoded data in {}".format(encoded_file)) + # not exists, encode data and save + self.data = json.loads(open(cfg.data_path + cfg.data_file, "r", encoding="utf-8").read().lower()) + self.train, self.dev, self.test = [], [], [] + for fn, dial in self.data.items(): + if ".json" in fn: + fn = fn.replace(".json", "") + if "all" in cfg.exp_domains or self.exp_files.get(fn): + if self.dev_files.get(fn): + self.dev.append(self._get_encoded_data(fn, dial)) + elif self.test_files.get(fn): + self.test.append(self._get_encoded_data(fn, dial)) + else: + self.train.append(self._get_encoded_data(fn, dial)) + + # save encoded data + encoded_data = {"train": self.train, "dev": self.dev, "test": self.test} + json.dump(encoded_data, open(encoded_file, "w"), indent=2) + + else: # directly read processed data and encode + self.data = json.loads(open(cfg.data_path + cfg.data_file, "r", encoding="utf-8").read().lower()) + self.train, self.dev, self.test = [], [], [] + for fn, dial in 
self.data.items(): + if ".json" in fn: + fn = fn.replace(".json", "") + if "all" in cfg.exp_domains or self.exp_files.get(fn): + if self.dev_files.get(fn): + self.dev.append(self._get_encoded_data(fn, dial)) + elif self.test_files.get(fn): + self.test.append(self._get_encoded_data(fn, dial)) + else: + self.train.append(self._get_encoded_data(fn, dial)) + # if save_temp: + # json.dump(self.test, open( + # 'data/multi-woz-analysis/test.encoded.json', 'w'), indent=2) + # self.vocab.save_vocab('data/multi-woz-analysis/vocab_temp') + + random.shuffle(self.train) + # random.shuffle(self.dev) + # random.shuffle(self.test) + logging.info("train size:{}, dev size:{}, test size:{}".format(len(self.train), len(self.dev), len(self.test))) + + def _get_encoded_data(self, fn, dial): + encoded_dial = [] + for idx, t in enumerate(dial["log"]): # tokenize to list of ids + enc = {} + enc["dial_id"] = fn + + # enc['user'] = self.vocab.sentence_encode(t['user'].split() + ['']) + # enc['usdx'] = self.vocab.sentence_encode(t['user_delex'].split() + ['']) + # enc['resp'] = self.vocab.sentence_encode(t['resp'].split() + ['']) + # enc['bspn'] = self.vocab.sentence_encode(t['constraint'].split() + ['']) + # enc['bsdx'] = self.vocab.sentence_encode(t['cons_delex'].split() + ['']) + # enc['aspn'] = self.vocab.sentence_encode(t['sys_act'].split() + ['']) + # enc['dspn'] = self.vocab.sentence_encode(t['turn_domain'].split() + ['']) + + # use gpt tokenizer directly tokenize word list, prone to encode unknown words to |endoftext| + # enc['user'] = self.tokenizer.encode( + # t['user'].split() + ['']) + # enc['usdx'] = self.tokenizer.encode( + # t['user_delex'].split() + ['']) + # enc['resp'] = self.tokenizer.encode( + # t['resp'].split() + ['']) + # enc['bspn'] = self.tokenizer.encode( + # t['constraint'].split() + ['']) + # enc['bsdx'] = self.tokenizer.encode( + # t['cons_delex'].split() + ['']) + # enc['aspn'] = self.tokenizer.encode( + # t['sys_act'].split() + ['']) + # enc['dspn'] = self.tokenizer.encode( + # t['turn_domain'].split() + ['']) + + # gpt use bpe to encode strings, very very slow. 
~9min + # in tokenization_utils.encode I find encode can pad_to_max_length, and reutrn tensor + enc["user"] = self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize(" " + t["user"] + " ") + ) + enc["usdx"] = self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize(" " + t["user"] + " ") + ) + enc["resp"] = self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize(" " + t["resp"] + " ") + ) + enc["bspn"] = self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize(" " + t["constraint"] + " ") + ) + enc["bsdx"] = self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize(" " + t["cons_delex"] + " ") + ) + enc["aspn"] = self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize(" " + t["sys_act"] + " ") + ) + enc["dspn"] = self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize(" " + t["turn_domain"] + " ") + ) + + enc["pointer"] = [int(i) for i in t["pointer"].split(",")] + enc["turn_domain"] = t["turn_domain"].split() + enc["turn_num"] = t["turn_num"] + if cfg.multi_acts_training: + enc["aspn_aug"] = [] + if fn in self.multi_acts: + turn_ma = self.multi_acts[fn].get(str(idx), {}) + for act_type, act_spans in turn_ma.items(): + enc["aspn_aug"].append([self.tokenizer.encode(a.split() + [""]) for a in act_spans]) + + # add db results to enc, at every turn + db_pointer = self.bspan_to_DBpointer(t["constraint"], t["turn_domain"].split()) + # db_tokens = ['', '', '[db_nores]', '[db_0]', '[db_1]', '[db_2]', '[db_3]'] + enc["db"] = self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize(" " + db_pointer + " ") + ) + + encoded_dial.append(enc) + return encoded_dial + + def bspan_to_constraint_dict(self, bspan, bspn_mode="bspn"): + bspan = bspan.split() if isinstance(bspan, str) else bspan + constraint_dict = {} + domain = None + conslen = len(bspan) + for idx, cons in enumerate(bspan): + cons = self.vocab.decode(cons) if type(cons) is not str else cons + if cons == "": + break + if "[" in cons: + if cons[1:-1] not in ontology.all_domains: + continue + domain = cons[1:-1] + elif cons in ontology.get_slot: + if domain is None: + continue + if cons == "people": + # handle confusion of value name "people's portraits..." 
and slot people + try: + ns = bspan[idx + 1] + ns = self.vocab.decode(ns) if type(ns) is not str else ns + if ns == "'s": + continue + except Exception: + continue + if not constraint_dict.get(domain): + constraint_dict[domain] = {} + if bspn_mode == "bsdx": + constraint_dict[domain][cons] = 1 + continue + vidx = idx + 1 + if vidx == conslen: + break + vt_collect = [] + vt = bspan[vidx] + vt = self.vocab.decode(vt) if type(vt) is not str else vt + while vidx < conslen and vt != "" and "[" not in vt and vt not in ontology.get_slot: + vt_collect.append(vt) + vidx += 1 + if vidx == conslen: + break + vt = bspan[vidx] + vt = self.vocab.decode(vt) if type(vt) is not str else vt + if vt_collect: + constraint_dict[domain][cons] = " ".join(vt_collect) + + return constraint_dict + + def bspan_to_DBpointer(self, bspan, turn_domain): + constraint_dict = self.bspan_to_constraint_dict(bspan) + # print(constraint_dict) + matnums = self.db.get_match_num(constraint_dict) + match_dom = turn_domain[0] if len(turn_domain) == 1 else turn_domain[1] + match_dom = match_dom[1:-1] if match_dom.startswith("[") else match_dom + match = matnums[match_dom] + # vector = self.db.addDBPointer(match_dom, match) + vector = self.db.addDBIndicator(match_dom, match) + return vector + + def aspan_to_act_list(self, aspan): + aspan = aspan.split() if isinstance(aspan, str) else aspan + acts = [] + domain = None + conslen = len(aspan) + for idx, cons in enumerate(aspan): + cons = self.vocab.decode(cons) if type(cons) is not str else cons + if cons == "": + break + if "[" in cons and cons[1:-1] in ontology.dialog_acts: + domain = cons[1:-1] + + elif "[" in cons and cons[1:-1] in ontology.dialog_act_params: + if domain is None: + continue + vidx = idx + 1 + if vidx == conslen: + acts.append(domain + "-" + cons[1:-1] + "-none") + break + vt = aspan[vidx] + vt = self.vocab.decode(vt) if type(vt) is not str else vt + no_param_act = True + while vidx < conslen and vt != "" and "[" not in vt: + no_param_act = False + acts.append(domain + "-" + cons[1:-1] + "-" + vt) + vidx += 1 + if vidx == conslen: + break + vt = aspan[vidx] + vt = self.vocab.decode(vt) if type(vt) is not str else vt + if no_param_act: + acts.append(domain + "-" + cons[1:-1] + "-none") + + return acts + + def dspan_to_domain(self, dspan): + domains = {} + dspan = dspan.split() if isinstance(dspan, str) else dspan + for d in dspan: + dom = self.vocab.decode(d) if type(d) is not str else d + if dom != "": + domains[dom] = 1 + else: + break + return domains + + def convert_turn_eval(self, turn, pv_turn, first_turn=False): + """ + input: [all previous ubar, U_t, B_t, A_t] predict R_t + firts turn: [U_t, B_t, A_t] predict R_t + + regarding the context, all previous ubar is too slow, try the previous ubar + """ + inputs = {} + + context_list = [] + # predict_list = [] + prompt = "" + if cfg.use_true_curr_bspn: + if cfg.use_true_curr_aspn: # only predict resp + context_list = ["user", "bspn", "db", "aspn"] + # context_list = ['user','aspn'] # predict resp based on current aspn and bspn + # predict_list = ['resp'] + prompt = "" + else: # predicted aspn + context_list = ["user", "bspn", "db"] + # predict_list = ['aspn', 'resp'] + prompt = "" + else: # predict bspn aspn resp. db are not predicted. this part tbd. 
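To make the context assembly below concrete, here is a toy sketch (all ids are invented) of how the previous turn's labels, belief span, DB pointer, system act and response are prepended to the current user ids and then capped at 900 tokens, mirroring convert_turn_eval:

prev_labels = [11, 12, 13]                      # stand-in for pv_turn["labels"]
prev_bspn, prev_db, prev_aspn, prev_resp = [21], [22], [23], [24, 25]
curr_user = [31, 32]                            # stand-in for turn["user"]

pv_context = prev_labels + prev_bspn + prev_db + prev_aspn + prev_resp
context_ids = pv_context + curr_user            # the real code also appends an encoded prompt token
if len(context_ids) > 900:                      # same cap as applied further down
    context_ids = context_ids[-900:]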
+ context_list = ["user"] + # predict_list = ['bspn', 'db','aspn', 'resp'] + prompt = "" + + if first_turn: + context = [] + for c in context_list: + context += turn[c] + + inputs["context"] = context + self.tokenizer.encode([prompt]) + inputs["labels"] = context + # e43 with BABAU + # inputs['labels'] = [] + + else: + context = [] + for c in context_list: + context += turn[c] + + pv_context = pv_turn["labels"] + pv_turn["bspn"] + pv_turn["db"] + pv_turn["aspn"] + pv_turn["resp"] + # e43 with BABAU + # pv_context = pv_turn['labels'] + pv_turn['bspn'] + pv_turn['db'] + pv_turn['aspn'] + + # prompt response, add sos_r + inputs["context"] = pv_context + context + self.tokenizer.encode([prompt]) + # context just the current turn + # inputs['context'] = context + self.tokenizer.encode([prompt]) + # context just the current action + + if cfg.use_all_previous_context: + inputs["labels"] = pv_context + context # use all previous ubar history + else: + inputs["labels"] = context # use previous trun + + if len(inputs["context"]) > 900: + print("len exceeds 900") + inputs["context"] = inputs["context"][-900:] + + return inputs + + def convert_batch_session(self, dial_batch): + """ + convert the whole session for training + concat [U_0, B_0, A_0, R_0, ... , U_n, B_n, A_n, R_n] + + try: [user, bspn, aspn, resp] + or + try: [user, bspn, db, aspn, resp] + """ + inputs = {} + contexts = [] + cell_list = ["user", "bspn", "db", "aspn", "resp"] + for idx, dial in enumerate(dial_batch): + context = [] + for turn_num, turn in enumerate(dial): + for cell in cell_list: + context.extend(turn[cell]) + contexts.append(context) + + inputs["contexts"] = contexts + inputs["contexts_np"], inputs["lengths"] = utils.padSeqs_gpt(inputs["contexts"], cfg.pad_id) + return inputs + + def convert_batch_gpt(self, turn_batch, pv_batch, first_turn=False): + """ + convert the current and the last turn + concat [U_{t-1}, B_{t-1}, A_{t-1}, R_{t-1}, U_t, B_t, A_t, R_t] + firts turn: [U_t, B_t, A_t, R_t] + try: [usdx, bspn, aspn, resp] + + """ + inputs = {} + if first_turn: + contexts = [] + batch_zipped = zip( + turn_batch["usdx"], + turn_batch["bspn"], + turn_batch["aspn"], + turn_batch["resp"], + ) + for u, b, a, r in batch_zipped: + context = u + b + a + r + contexts.append(context) + inputs["contexts"] = contexts + # padSeqs to make [UBAR] the same length + inputs["contexts_np"], inputs["lengths"] = utils.padSeqs_gpt(inputs["contexts"], cfg.pad_id) + else: + contexts = [] + batch_zipped = zip( + pv_batch["pv_usdx"], + pv_batch["pv_bspn"], + pv_batch["pv_aspn"], + pv_batch["pv_resp"], + turn_batch["usdx"], + turn_batch["bspn"], + turn_batch["aspn"], + turn_batch["resp"], + ) + for pu, pb, pa, pr, u, b, a, r in batch_zipped: + context = pu + pb + pa + pr + u + b + a + r + contexts.append(context) + inputs["contexts"] = contexts + contexts_np, lengths = utils.padSeqs_gpt(inputs["contexts"], cfg.pad_id) + inputs["contexts_np"] = contexts_np + inputs["lengths"] = lengths + return inputs + + def convert_batch(self, py_batch, py_prev, first_turn=False): + inputs = {} + if first_turn: + for item, py_list in py_prev.items(): + batch_size = len(py_batch["user"]) + inputs[item + "_np"] = np.array([[1]] * batch_size) + inputs[item + "_unk_np"] = np.array([[1]] * batch_size) + else: + for item, py_list in py_prev.items(): + if py_list is None: + continue + if not cfg.enable_aspn and "aspn" in item: + continue + if not cfg.enable_bspn and "bspn" in item: + continue + if not cfg.enable_dspn and "dspn" in item: + continue + prev_np = 
utils.padSeqs(py_list, truncated=cfg.truncated, trunc_method="pre") + inputs[item + "_np"] = prev_np + if item in ["pv_resp", "pv_bspn"]: + inputs[item + "_unk_np"] = deepcopy(inputs[item + "_np"]) + # , restrict vocab size to 3k, map ids>3k to + inputs[item + "_unk_np"][inputs[item + "_unk_np"] >= self.vocab_size] = 2 + else: + inputs[item + "_unk_np"] = inputs[item + "_np"] + + for item in ["user", "usdx", "resp", "bspn", "aspn", "bsdx", "dspn"]: + if not cfg.enable_aspn and item == "aspn": + continue + if not cfg.enable_bspn and item == "bspn": + continue + + if not cfg.enable_dspn and item == "dspn": + continue + py_list = py_batch[item] + trunc_method = "post" if item == "resp" else "pre" + # max_length = cfg.max_nl_length if item in ['user', 'usdx', 'resp'] else cfg.max_span_length + inputs[item + "_np"] = utils.padSeqs(py_list, truncated=cfg.truncated, trunc_method=trunc_method) + if item in ["user", "usdx", "resp", "bspn"]: + inputs[item + "_unk_np"] = deepcopy(inputs[item + "_np"]) + inputs[item + "_unk_np"][inputs[item + "_unk_np"] >= self.vocab_size] = 2 # + else: + inputs[item + "_unk_np"] = inputs[item + "_np"] + + if cfg.multi_acts_training and cfg.mode == "train": + inputs["aspn_bidx"], multi_aspn = [], [] + for bidx, aspn_type_list in enumerate(py_batch["aspn_aug"]): + if aspn_type_list: + for aspn_list in aspn_type_list: + random.shuffle(aspn_list) + # choose one random act span in each act type + aspn = aspn_list[0] + multi_aspn.append(aspn) + inputs["aspn_bidx"].append(bidx) + if cfg.multi_act_sampling_num > 1: + for i in range(cfg.multi_act_sampling_num): + if len(aspn_list) >= i + 2: + # choose one random act span in each act type + aspn = aspn_list[i + 1] + multi_aspn.append(aspn) + inputs["aspn_bidx"].append(bidx) + + if multi_aspn: + inputs["aspn_aug_np"] = utils.padSeqs(multi_aspn, truncated=cfg.truncated, trunc_method="pre") + # [all available aspn num in the batch, T] + inputs["aspn_aug_unk_np"] = inputs["aspn_aug_np"] + + inputs["db_np"] = np.array(py_batch["pointer"]) + inputs["turn_domain"] = py_batch["turn_domain"] + + return inputs + + def wrap_result_lm(self, result_dict, eos_syntax=None): + results = [] + eos_syntax = ontology.eos_tokens if not eos_syntax else eos_syntax + sos_syntax = ontology.sos_tokens + # ground truth bs, as, ds.. generate response + field = [ + "dial_id", + "turn_num", + "user", + "bspn_gen", + "bsdx", + "resp_gen", + "resp", + "aspn_gen", + "aspn", + "dspn_gen", + "dspn", + "bspn", + "pointer", + ] + + for dial_id, turns in result_dict.items(): + entry = {"dial_id": dial_id, "trun_num": len(turns)} + for f in field[2:]: + entry[f] = "" # ??? 
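The clean-up performed in the loop below amounts to decoding a span, removing its sos/eos markers, and joining the surviving tokens into a string. A small hypothetical helper (not part of the code itself) that captures the idea:

def strip_span_markers(decoded_tokens, sos_token, eos_token):
    """Drop one sos/eos marker each from a decoded token list and rejoin it."""
    tokens = list(decoded_tokens)
    if sos_token in tokens:
        tokens.remove(sos_token)
    if eos_token in tokens:
        tokens.remove(eos_token)
    return " ".join(tokens)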
+ results.append(entry) + for turn_idx, turn in enumerate(turns): + entry = {"dial_id": dial_id} + for key in field: + if key in ["dial_id"]: + continue + v = turn.get(key, "") + if key == "turn_domain": + v = " ".join(v) + + if key in eos_syntax and v != "": + # remove eos tokens + v = self.tokenizer.decode(v) + v = v.split() + # remove eos/sos in span + if eos_syntax[key] in v: + v.remove(eos_syntax[key]) + if sos_syntax[key] in v: + v.remove(sos_syntax[key]) + # if key != 'resp_gen': + # # remove eos/sos in span + # if eos_syntax[key] in v: + # v.remove(eos_syntax[key]) + # if sos_syntax[key] in v: + # v.remove(sos_syntax[key]) + # else: # 'resp_gen' + # sos_index = 0 + # eos_index = -1 + # if sos_syntax[key] in v: + # sos_index = v.index(sos_syntax[key]) + # if eos_syntax[key] in v: + # eos_index = v.index(eos_syntax[key]) + # else: + # pass # take too long + # # no found, stop at any eos_tokens + # # for i in range(sos_index+1, len(v)): + # # if v[i] in sos_syntax.values() or v[i] in eos_syntax.values(): + # # eos_index = i + # v = v[sos_index+1: eos_index] + + # v = self.tokenizer.convert_tokens_to_string(v) + v = " ".join(v) + else: + pass # v = v + entry[key] = v + + results.append(entry) + + return results, field + + def wrap_result(self, result_dict, eos_syntax=None): + decode_fn = self.vocab.sentence_decode + results = [] + eos_syntax = ontology.eos_tokens if not eos_syntax else eos_syntax + + if cfg.bspn_mode == "bspn": + field = [ + "dial_id", + "turn_num", + "user", + "bspn_gen", + "bspn", + "resp_gen", + "resp", + "aspn_gen", + "aspn", + "dspn_gen", + "dspn", + "pointer", + ] + elif not cfg.enable_dst: # this + field = [ + "dial_id", + "turn_num", + "user", + "bsdx_gen", + "bsdx", + "resp_gen", + "resp", + "aspn_gen", + "aspn", + "dspn_gen", + "dspn", + "bspn", + "pointer", + ] + else: + field = [ + "dial_id", + "turn_num", + "user", + "bsdx_gen", + "bsdx", + "resp_gen", + "resp", + "aspn_gen", + "aspn", + "dspn_gen", + "dspn", + "bspn_gen", + "bspn", + "pointer", + ] + if self.multi_acts_record is not None: + field.insert(7, "multi_act_gen") + + for dial_id, turns in result_dict.items(): + entry = {"dial_id": dial_id, "turn_num": len(turns)} + for prop in field[2:]: + entry[prop] = "" + results.append(entry) + for turn_no, turn in enumerate(turns): + entry = {"dial_id": dial_id} + for key in field: + if key in ["dial_id"]: + continue + v = turn.get(key, "") + if key == "turn_domain": + v = " ".join(v) + entry[key] = decode_fn(v, eos=eos_syntax[key]) if key in eos_syntax and v != "" else v + results.append(entry) + return results, field + + def restore(self, resp, domain, constraint_dict, mat_ents): + restored = resp + + restored = restored.replace("[value_reference]", "53022") + restored = restored.replace("[value_car]", "BMW") + + # restored.replace('[value_phone]', '830-430-6666') + for d in domain: + constraint = constraint_dict.get(d, None) + if constraint: + if "stay" in constraint: + restored = restored.replace("[value_stay]", constraint["stay"]) + if "day" in constraint: + restored = restored.replace("[value_day]", constraint["day"]) + if "people" in constraint: + restored = restored.replace("[value_people]", constraint["people"]) + if "time" in constraint: + restored = restored.replace("[value_time]", constraint["time"]) + if "type" in constraint: + restored = restored.replace("[value_type]", constraint["type"]) + if d in mat_ents and len(mat_ents[d]) == 0: + for s in constraint: + if s == "pricerange" and d in ["hotel", "restaurant"] and "price]" in restored: + 
restored = restored.replace("[value_price]", constraint["pricerange"]) + if s + "]" in restored: + restored = restored.replace("[value_%s]" % s, constraint[s]) + + if "[value_choice" in restored and mat_ents.get(d): + restored = restored.replace("[value_choice]", str(len(mat_ents[d]))) + if "[value_choice" in restored: + restored = restored.replace("[value_choice]", "3") + + # restored.replace('[value_car]', 'BMW') + + try: + ent = mat_ents.get(domain[-1], []) + if ent: + ent = ent[0] + + for t in restored.split(): + if "[value" in t: + slot = t[7:-1] + if ent.get(slot): + if domain[-1] == "hotel" and slot == "price": + slot = "pricerange" + restored = restored.replace(t, ent[slot]) + elif slot == "price": + if ent.get("pricerange"): + restored = restored.replace(t, ent["pricerange"]) + else: + print(restored, domain) + except Exception: + print(resp) + print(restored) + quit() + + restored = restored.replace("[value_phone]", "62781111") + restored = restored.replace("[value_postcode]", "CG9566") + restored = restored.replace("[value_address]", "Parkside, Cambridge") + + # if '[value_' in restored: + + # print(domain) + # # print(mat_ents) + # print(resp) + # print(restored) + return restored + + def record_utterance(self, result_dict): + decode_fn = self.vocab.sentence_decode + + ordered_dial = {} + for dial_id, turns in result_dict.items(): + diverse = 0 + turn_count = 0 + for turn_no, turn in enumerate(turns): + act_collect = {} + act_type_collect = {} + slot_score = 0 + for i in range(cfg.nbest): + aspn = decode_fn(turn["multi_act"][i], eos=ontology.eos_tokens["aspn"]) + pred_acts = self.aspan_to_act_list(" ".join(aspn)) + act_type = "" + for act in pred_acts: + d, a, s = act.split("-") + if d + "-" + a not in act_collect: + act_collect[d + "-" + a] = {s: 1} + slot_score += 1 + act_type += d + "-" + a + ";" + elif s not in act_collect: + act_collect[d + "-" + a][s] = 1 + slot_score += 1 + act_type_collect[act_type] = 1 + turn_count += 1 + diverse += len(act_collect) * 3 + slot_score + ordered_dial[dial_id] = diverse / turn_count + + ordered_dial = sorted(ordered_dial.keys(), key=lambda x: -ordered_dial[x]) + + dialog_record = {} + + with open(cfg.eval_load_path + "/dialogue_record.csv", "w") as rf: + writer = csv.writer(rf) + + for dial_id in ordered_dial: + dialog_record[dial_id] = [] + turns = result_dict[dial_id] + writer.writerow([dial_id]) + for turn_no, turn in enumerate(turns): + user = decode_fn(turn["user"], eos=ontology.eos_tokens["user"]) + bspn = decode_fn(turn["bspn"], eos=ontology.eos_tokens["bspn"]) + aspn = decode_fn(turn["aspn"], eos=ontology.eos_tokens["aspn"]) + resp = decode_fn(turn["resp"], eos=ontology.eos_tokens["resp"]) + constraint_dict = self.bspan_to_constraint_dict(bspn) + # print(constraint_dict) + mat_ents = self.db.get_match_num(constraint_dict, True) + domain = [i[1:-1] for i in self.dspan_to_domain(turn["dspn"]).keys()] + restored = self.restore(resp, domain, constraint_dict, mat_ents) + writer.writerow([turn_no, user, turn["pointer"], domain, restored, resp]) + turn_record = { + "user": user, + "bspn": bspn, + "aspn": aspn, + "dom": domain, + "resp": resp, + "resp_res": restored, + } + + resp_col = [] + aspn_col = [] + resp_restore_col = [] + for i in range(cfg.nbest): + aspn = decode_fn(turn["multi_act"][i], eos=ontology.eos_tokens["aspn"]) + resp = decode_fn(turn["multi_resp"][i], eos=ontology.eos_tokens["resp"]) + + restored = self.restore(resp, domain, constraint_dict, mat_ents) + resp_col.append(resp) + resp_restore_col.append(restored) + 
aspn_col.append(aspn) + + zipped = list(zip(resp_restore_col, resp_col, aspn_col)) + zipped.sort(key=lambda s: len(s[0])) + resp_restore_col = list(list(zip(*zipped))[0]) + aspn_col = list(list(zip(*zipped))[2]) + resp_col = list(list(zip(*zipped))[1]) + turn_record["aspn_col"] = aspn_col + turn_record["resp_col"] = resp_col + turn_record["resp_res_col"] = resp_restore_col + for i in range(cfg.nbest): + # aspn = decode_fn(turn['multi_act'][i], eos=ontology.eos_tokens['aspn']) + resp = resp_col[i] + aspn = aspn_col[i] + resp_restore = resp_restore_col[i] + + writer.writerow(["", resp_restore, resp, aspn]) + + dialog_record[dial_id].append(turn_record) + + # json.dump(dialog_record, open(cfg.eval_load_path + '/resultdict.json','w')) + + +if __name__ == "__main__": + reader = MultiWozReader(GPT2Tokenizer) + # for aspan in ["[general] [bye] [welcome] ","[train] [inform] trainid destination \ + # arrive leave [offerbook] [general] [reqmore] ",]: + # act = reader.aspan_to_constraint_dict(aspan.split()) + # print('!!!') + # print(act) + + for bspan in [ + "[taxi] destination golden house departure broughton house gallery arrive 19:30 [attraction]" + + " type museum name whipple museum of the history of science people 5 day monday", + "[taxi] destination golden house departure broughton house gallery arrive 19:30 [attraction]" + + " type museum name whipple museum of the history of science people 5 day monday ", + ]: + encoded = reader.vocab.sentence_encode(bspan.split()) + print(encoded) + cons = reader.bspan_to_constraint_dict(encoded, bspn_mode="bspn") + print(cons) + for bspan in [ + "[taxi] destination departure leave [hotel] name [attraction] name people day", + "[taxi] destination departure leave [hotel] name [attraction] name people day ", + ]: + encoded = reader.vocab.sentence_encode(bspan.split()) + print(encoded) + cons = reader.bspan_to_constraint_dict(encoded, bspn_mode="bsdx") + print(cons) diff --git a/src/crazyneuraluser/UBAR_code/utils.py b/src/crazyneuraluser/UBAR_code/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..74537e07a387e6bf0d9584073cc5557a1b044ccf --- /dev/null +++ b/src/crazyneuraluser/UBAR_code/utils.py @@ -0,0 +1,292 @@ +import json +import logging +from collections import OrderedDict + +import numpy as np + +from crazyneuraluser.UBAR_code import ontology + + +def py2np(list): + return np.array(list) + + +def write_dict(fn, dic): + with open(fn, "w") as f: + json.dump(dic, f, indent=2) + + +def f1_score(label_list, pred_list): + tp = len([t for t in pred_list if t in label_list]) + fp = max(0, len(pred_list) - tp) + fn = max(0, len(label_list) - tp) + precision = tp / (tp + fp + 1e-10) + recall = tp / (tp + fn + 1e-10) + f1 = 2 * precision * recall / (precision + recall + 1e-10) + return f1 + + +class Vocab(object): + def __init__(self, vocab_size=0): + self.vocab_size = vocab_size + self.vocab_size_oov = 0 # get after construction + self._idx2word = {} # word + oov + self._word2idx = {} # word + self._freq_dict = {} # word + oov + for w in [ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ]: + self._absolute_add_word(w) + + def _absolute_add_word(self, w): + idx = len(self._idx2word) + self._idx2word[idx] = w + self._word2idx[w] = idx + + def add_word(self, word): + if word not in self._freq_dict: + self._freq_dict[word] = 0 + self._freq_dict[word] += 1 + + def has_word(self, word): + return self._freq_dict.get(word) + + def _add_to_vocab(self, word): + if word not in self._word2idx: + idx = len(self._idx2word) + 
self._idx2word[idx] = word + self._word2idx[word] = idx + + def construct(self): + decoded = sorted(self._freq_dict.keys(), key=lambda x: -self._freq_dict[x]) + print("Vocabulary size including oov: %d" % (len(decoded) + len(self._idx2word))) + if len(decoded) + len(self._idx2word) < self.vocab_size: + logging.warning( + "actual label set smaller than that configured: {}/{}".format( + len(decoded) + len(self._idx2word), self.vocab_size + ) + ) + for word in ontology.all_domains + ["general"]: + word = "[" + word + "]" + self._add_to_vocab(word) + for word in ontology.all_acts: + word = "[" + word + "]" + self._add_to_vocab(word) + for word in ontology.all_slots: + self._add_to_vocab(word) + for word in decoded: + if word.startswith("[value_") and word.endswith("]"): + self._add_to_vocab(word) + for word in decoded: + self._add_to_vocab(word) + self.vocab_size_oov = len(self._idx2word) + + def load_vocab(self, vocab_path): + self._freq_dict = json.loads(open(vocab_path + ".freq.json", "r").read()) + self._word2idx = json.loads(open(vocab_path + ".word2idx.json", "r").read()) + self._idx2word = {} + for w, idx in self._word2idx.items(): + self._idx2word[idx] = w + self.vocab_size_oov = len(self._idx2word) + print('vocab file loaded from "' + vocab_path + '"') + print("Vocabulary size including oov: %d" % (self.vocab_size_oov)) + + def save_vocab(self, vocab_path): + _freq_dict = OrderedDict(sorted(self._freq_dict.items(), key=lambda kv: kv[1], reverse=True)) + + write_dict(vocab_path + ".word2idx.json", self._word2idx) + write_dict(vocab_path + ".freq.json", _freq_dict) + + def encode(self, word, include_oov=True): + if include_oov: + if self._word2idx.get(word, None) is None: + raise ValueError("Unknown word: %s. Vocabulary should include oovs here." % word) + return self._word2idx[word] + else: + word = "" if word not in self._word2idx else word + return self._word2idx[word] + + def sentence_encode(self, word_list): + return [self.encode(_) for _ in word_list] + + def oov_idx_map(self, idx): + return 2 if idx > self.vocab_size else idx + + def sentence_oov_map(self, index_list): + return [self.oov_idx_map(_) for _ in index_list] + + def decode(self, idx, indicate_oov=False): + if not self._idx2word.get(idx): + raise ValueError("Error idx: %d. Vocabulary should include oovs here." 
% idx) + if not indicate_oov or idx < self.vocab_size: + return self._idx2word[idx] + else: + return self._idx2word[idx] + "(o)" + + def sentence_decode(self, index_list, eos=None, indicate_oov=False): + decoded = [self.decode(_, indicate_oov) for _ in index_list] + if not eos or eos not in decoded: + return " ".join(decoded) + else: + idx = decoded.index(eos) + return " ".join(decoded[:idx]) + + def nl_decode(self, decoded, eos=None): + return [self.sentence_decode(_, eos) + "\n" for _ in decoded] + + +def padSeqs_gpt(sequences, pad_id, maxlen=None): + lengths = [] + for x in sequences: + lengths.append(len(x)) + + num_samples = len(sequences) + seq_mexlen = np.max(lengths) + + # maxlen = 1024 + if seq_mexlen > 1024: # gpt2.n_ctx + # print('maxlen exceeds 1024') + maxlen = 1024 + else: + maxlen = seq_mexlen + + # tokenizer.encode('<|endoftext|>') = ['50256'] + # All labels set to ``-100`` are ignored (masked), the loss is only + # computed for labels in ``[0, ..., config.vocab_size]`` (from modeling_gpt2.GPT2LMHeadModel) + + x = np.ones((num_samples, maxlen)) * pad_id + for idx, s in enumerate(sequences): + if not len(s): + print("empty list was found in padSeqs") + # trunc method = 'pre' + trunc = s[-maxlen:] + trunc = np.asarray(trunc) + + # pad method = 'post' + x[idx, : len(trunc)] = trunc + + return x, lengths + + +def padSeqs( + sequences, + maxlen=None, + truncated=False, + pad_method="post", + trunc_method="pre", + dtype="int32", + value=0.0, +): + if not hasattr(sequences, "__len__"): + raise ValueError("`sequences` must be iterable.") + lengths = [] + for x in sequences: + if not hasattr(x, "__len__"): + raise ValueError("`sequences` must be a list of iterables. " "Found non-iterable: " + str(x)) + lengths.append(len(x)) + + num_samples = len(sequences) + seq_maxlen = np.max(lengths) + + if maxlen is not None and truncated: + maxlen = min(seq_maxlen, maxlen) + else: + maxlen = seq_maxlen + # take the sample shape from the first non empty sequence + # checking for consistency in the main loop below. 
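A quick usage sketch of this padder with toy ids, assuming the module is importable as crazyneuraluser.UBAR_code.utils: with the defaults, truncation takes the tail of a long sequence ('pre'), keeping the most recent context, while padding is appended at the end ('post'):

from crazyneuraluser.UBAR_code.utils import padSeqs

batch = [[5, 6, 7, 8], [9]]
padded = padSeqs(batch, maxlen=3, truncated=True)
# pre-truncation keeps the last 3 ids of the long sequence, post-padding fills with 0:
# [[6 7 8]
#  [9 0 0]]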
+ sample_shape = tuple() + for s in sequences: + if len(s) > 0: + sample_shape = np.asarray(s).shape[1:] + break + + x = (np.ones((num_samples, maxlen) + sample_shape) * value).astype(dtype) + for idx, s in enumerate(sequences): + if not len(s): + print("empty list/array was found") + continue # empty list/array was found + if trunc_method == "pre": + trunc = s[-maxlen:] + elif trunc_method == "post": + trunc = s[:maxlen] + else: + raise ValueError('Truncating type "%s" not understood' % trunc_method) + + # check `trunc` has expected shape + trunc = np.asarray(trunc, dtype=dtype) + if trunc.shape[1:] != sample_shape: + raise ValueError( + "Shape of sample %s of sequence at position %s is different from expected shape %s" + % (trunc.shape[1:], idx, sample_shape) + ) + + if pad_method == "post": + x[idx, : len(trunc)] = trunc + elif pad_method == "pre": + x[idx, -len(trunc) :] = trunc + else: + raise ValueError('Padding type "%s" not understood' % pad_method) + return x + + +def get_glove_matrix(glove_path, vocab, initial_embedding_np): + """ + return a glove embedding matrix + :param self: + :param glove_file: + :param initial_embedding_np: + :return: np array of [V,E] + """ + ef = open(glove_path, "r", encoding="UTF-8") + cnt = 0 + vec_array = initial_embedding_np + old_avg = np.average(vec_array) + old_std = np.std(vec_array) + vec_array = vec_array.astype(np.float32) + new_avg, new_std = 0, 0 + + for line in ef.readlines(): + line = line.strip().split(" ") + word, vec = line[0], line[1:] + vec = np.array(vec, np.float32) + if not vocab.has_word(word): + continue + word_idx = vocab.encode(word) + if word_idx < vocab.vocab_size: + cnt += 1 + vec_array[word_idx] = vec + new_avg += np.average(vec) + new_std += np.std(vec) + new_avg /= cnt + new_std /= cnt + ef.close() + logging.info( + "%d known embedding. 
old mean: %f new mean %f, old std %f new std %f" + % (cnt, old_avg, new_avg, old_std, new_std) + ) + return vec_array + + +def position_encoding_init(self, n_position, d_pos_vec): + position_enc = np.array( + [ + [pos / np.power(10000, 2 * (j // 2) / d_pos_vec) for j in range(d_pos_vec)] + if pos != 0 + else np.zeros(d_pos_vec) + for pos in range(n_position) + ] + ) + + position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i + position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1 + return position_enc diff --git a/src/crazyneuraluser/user_model_code/analysis_multiwoz.py b/src/crazyneuraluser/user_model_code/analysis_multiwoz.py new file mode 100644 index 0000000000000000000000000000000000000000..33b7ccbf4def4767cad329a4d6917882b5c00616 --- /dev/null +++ b/src/crazyneuraluser/user_model_code/analysis_multiwoz.py @@ -0,0 +1,119 @@ +import json +import os + +DATA_SPLIT = ["train", "dev", "test"] + + +def _check_n_turns(data, data_act): + for split in DATA_SPLIT: + for dial_id, meta in data[split].items(): + n_in_meta = len(meta["turns"]) + + assert dial_id in data_act + n_in_act = len(data_act[dial_id]) + assert n_in_meta == n_in_act + + +def collect_data(data_path, remove_dial_switch=False): + # load act + act_file = os.path.join(data_path, "dialog_acts.json") + with open(act_file) as f: + data_act = json.load(f) + print("Load {} dialogues in act file".format(len(data_act))) + + # load data + data = {} + for split in DATA_SPLIT: + data[split] = iter_data_folder(data_path, split, remove_dial_switch, data_act) + + _check_n_turns(data, data_act) + return data, data_act + + +def remove_dial(dial_id, dial, dial_act): + # check services + services = dial["services"] + if "police" in services or "bus" in services or "hospital" in services: + return True + + # check act + domains = set() + for turn_id, turn_act in dial_act.items(): + dialogue_act = turn_act["dialog_act"] + for dact in dialogue_act: + assert "-" in dact + domain, act = dact.split("-") + domains.add(domain) + if "Police" in domains or "Bus" in domains or "Hospital" in domains: + return True + return False + + +def iter_data_folder(data_path, split, remove_dial_switch, data_act): + """Iterate data folder""" + split_dir = os.path.join(data_path, split) + data_split = {} + remove_dial_ids = [] + total_dial_ids = [] + for f in os.listdir(split_dir): + if not f.startswith("dialogues"): # skip schema.json + continue + file_path = os.path.join(data_path, split, f) + iter_file( + file_path, + data_split, + remove_dial_ids, + total_dial_ids, + remove_dial_switch, + data_act, + ) + print( + "Done collecting {} | total {} dialogues | load {} dialogues | remove {} dialogues".format( + split, len(total_dial_ids), len(data_split), len(remove_dial_ids) + ) + ) + return data_split + + +def iter_file( + file_path, data_split, remove_dial_ids, total_dial_ids, remove_dial_switch, data_act +): + with open(file_path) as f: + data_in = json.load(f) # list of dialouges in a json file + + for dial in data_in: + dial_id = dial["dialogue_id"] + total_dial_ids.append(dial_id) + dial_act = data_act[dial_id] + + if remove_dial_switch and remove_dial(dial_id, dial, dial_act): + remove_dial_ids.append(dial_id) + else: + data_split[dial_id] = dial + + +def show_dial(dial_id, data, data_act): + def simple_linearise_act(dialouge_act): + linear_act = "" + for domain_act, slot_value_list in dialouge_act.items(): + linear_act += domain_act + " " + for slot_value in slot_value_list: + slot, value = slot_value[0], slot_value[1] + linear_act += slot + 
" " + linear_act += value + " " + return linear_act + + split = None + for data_split in DATA_SPLIT: + if dial_id in data[data_split]: + split = data_split + break + + print("dial_id: {}".format(dial_id)) + for turn_id, turn in enumerate(data[split][dial_id]["turns"]): + dialouge_act = data_act[dial_id][str(turn_id)]["dialog_act"] + linear_act = simple_linearise_act(dialouge_act) + print("-----" * 15) + print("turn_id: {}, spk: {}".format(turn_id, turn["speaker"])) + print("act: |{}|".format(linear_act)) + print("utt: |{}|".format(turn["utterance"])) diff --git a/src/crazyneuraluser/user_model_code/analysis_sgd.py b/src/crazyneuraluser/user_model_code/analysis_sgd.py new file mode 100644 index 0000000000000000000000000000000000000000..bcdeaf1702a0cbd04f891f463d69470ac08d7bbc --- /dev/null +++ b/src/crazyneuraluser/user_model_code/analysis_sgd.py @@ -0,0 +1,483 @@ +import json +import os + +from utils_sgd import ( + bcolors, + compare_slot_values_in_state, + dict2str, + get_turn_act, + list2str, +) + +""" This file contains some utilities for analysis and parsing SGD """ + +DATA_SPLIT = ["train", "dev", "test"] + + +def collect_data(data_path, remove_dial_switch=False): + data = {} + for split in DATA_SPLIT: + data[split] = iter_data_folder(data_path, split, remove_dial_switch) + return data + + +def _remove_dial(dial_id, dial): + # remove_flag = False + # removes service `Homes_2` in test set as the slot `intent` is the same name as the user intent, + # which causes problem in goal preparation + if "Homes_2" in dial["services"]: + return True + return False + + +def iter_data_folder(data_path, split, remove_dial_switch): + """Iterate data split folder""" + split_dir = os.path.join(data_path, split) + data_split = {} + remove_dial_ids = [] + total_dial_ids = [] + for f in os.listdir(split_dir): + if not f.startswith("dialogues"): # skip schema.json + continue + file_path = os.path.join(data_path, split, f) + iter_file( + file_path, data_split, remove_dial_ids, total_dial_ids, remove_dial_switch + ) + print( + "Done collecting {} | total {} dialogues | load {} dialogues | remove {} dialogues".format( + split, len(total_dial_ids), len(data_split), len(remove_dial_ids) + ) + ) + return data_split + + +def iter_file( + file_path, data_split, remove_dial_ids, total_dial_ids, remove_dial_switch +): + """Iterate data file""" + with open(file_path) as f: + data_in = json.load(f) # list of dialouges in a json file + + for dial in data_in: + dial_id = dial["dialogue_id"] + total_dial_ids.append(dial_id) + + if remove_dial_switch and _remove_dial(dial_id, dial): + remove_dial_ids.append(dial_id) + else: + data_split[dial_id] = dial + + +def check_multiple_services_per_turn(data): + for split in DATA_SPLIT: + for dial_id in sorted(data[split].keys()): + dial = data[split][dial_id] + for turn_id, turn in enumerate(dial["turns"]): + frames = turn["frames"] + if len(frames) > 1: + print(split, dial_id, turn_id, turn["utterance"]) + + +def show_actions(actions): + for action_id, action in enumerate(actions): + act, slot, values = action["act"], action["slot"], action["values"] + print( + f"====> ACTION | Act {action_id}: {bcolors.RED}{act}{bcolors.ENDC}, \ + slot: {bcolors.YELLOW}{slot}{bcolors.ENDC}, values: {bcolors.GREEN}{values}{bcolors.ENDC}" + ) + + +def show_user_state(frame): + state = frame["state"] + active_intent = state["active_intent"] + req_slots = list2str(state["requested_slots"]) + slot2value = dict2str(state["slot_values"], colored=True) + print( + "====> STATE | intent: {}, req_slots: 
{}, slot2value: {}".format( + active_intent, req_slots, slot2value + ) + ) + + +def show_service_call(frame): + if "service_call" not in frame: + return + # system calls api + service_call, service_results = frame["service_call"], frame["service_results"] + print( + "====> API call | method: {}, args: {}, results: {}".format( + service_call["method"], + dict2str(service_call["parameters"]), + len(service_results), + ) + ) + + +def show_frame(spk, frame_id, frame): + service = frame["service"] + print("==> Frame_id: {}, service: {}".format(frame_id, service)) + + # actions (include all slots) + show_actions(frame["actions"]) + + # slots (only provide non-categorical slots with word span boundaries) + if spk == "USER": + show_user_state(frame) + else: # system + show_service_call(frame) + + +def show_turn(turn_id, turn): + if turn is None: + return + + frames = turn["frames"] + spk = turn["speaker"] + utt = turn["utterance"] + assert spk in ["USER", "SYSTEM"] + print(f"{spk}: {bcolors.UNDERLINE}{utt}{bcolors.ENDC}") + for frame_id, frame in enumerate(frames): + show_frame(spk, frame_id, frame) + print("------" * 15) + + +def show_dial_info(dial_id, dial): + print("\n") + print("******" * 15) + print("Dialogue={} | Service={}".format(dial_id, list2str(dial["services"]))) + print("******" * 15) + + +def show_dial(dial_id, dial): + show_dial_info(dial_id, dial) + for turn_id, turn in enumerate(dial["turns"]): + show_turn(turn_id, turn) + + +def show_data(data): + for split in DATA_SPLIT: + for dial_id in sorted(data[split].keys()): + dial = data[split][dial_id] + show_dial(dial_id, dial) + input("press...") + + +def identify_scenarios(data): + """ + According to dataset paper, a scenario is a sequence of intents, seeded at the start of a conversation + to the user agent + """ + # TODO: deal with NONE intent, check the # of intent seq conbinations + for split in DATA_SPLIT: + scenario2dialogues = {} + n_scenario_max, n_scenario_min = 0, 100 + for dial_id in sorted(data[split].keys()): + dial = data[split][dial_id] + scenario = [] + for turn in dial["turns"]: + if turn["speaker"] == "SYSTEM": + continue + # USER turn + # it's fine to consider only first frame (service) if the turn is at the bounrary between two services + frame = turn["frames"][0] + intent = frame["state"]["active_intent"] + if intent == "NONE": + continue + if len(scenario) == 0 or intent != scenario[-1]: + scenario.append(intent) + + # update count + if len(scenario) > n_scenario_max: + n_scenario_max = len(scenario) + if len(scenario) < n_scenario_min: + n_scenario_min = len(scenario) + + scenario = list2str(scenario) + if scenario not in scenario2dialogues: + scenario2dialogues[scenario] = [] + scenario2dialogues[scenario].append(dial_id) + + # done iter over split + print( + "Summary: split={}, unique_scenario={}, max_intent={}, min_intent={}".format( + split, len(scenario2dialogues), n_scenario_max, n_scenario_min + ) + ) + + +def _check_request_alts_type(prev_turn, sys_turn, curr_turn, curr_acts): + """ + check which of the following happens when request_alts + 1. randomly change goal (state changes) + 2. request_alts as system provides venue with missing slot-value (usr provides new info) + 3. 
simply dislike the provided venue, change venue without new slot-value (same info) + + Input: + prev_turn: previous user turn + curr_turn: current user turn + """ + + def _get_intent2state(turn): + intent2state = {} + for frame in turn["frames"]: + state = frame["state"] + intent = state["active_intent"] + intent2state[intent] = state + return intent2state + + assert "REQUEST_ALTS" in curr_acts + if len(curr_acts) == 1: # case 3 + # return "_dislike_" + if "OFFER" in get_turn_act(sys_turn): + return "_dislike_offer_" + else: + return "_dislike_info_" + elif ( + "INFORM" in curr_acts and len(set(curr_acts)) == 2 + ): # only inform and request_alts + assert len(curr_turn["frames"]) == 1 + curr_slot_values = curr_turn["frames"][0]["state"]["slot_values"] + curr_intent = curr_turn["frames"][0]["state"]["active_intent"] + + if len(prev_turn["frames"]) == 1: + prev_slot_values = prev_turn["frames"][0]["state"]["slot_values"] + else: # need to get the state with the same intent + intent2state = _get_intent2state(prev_turn) + prev_slot_values = intent2state[curr_intent]["slot_values"] + + state_diff = compare_slot_values_in_state(prev_slot_values, curr_slot_values) + if state_diff: # case 1 + return "_random_" + else: # case 2 + return "_miss_" + else: + return "_unknown_" + + +def stats_request_alts_type(data): + for split in DATA_SPLIT: + stats = { + "_random_": 0, + "_miss_": 0, + "_dislike_offer_": 0, + "_dislike_info_": 0, + "_unknown_": 0, + } + n_all_usr_turn, n_request_alts = 0, 0 + + for dial_id in sorted(data[split].keys()): + dial = data[split][dial_id] + for turn_id, turn in enumerate(dial["turns"]): + prev_turn = turn + if turn["speaker"] == "SYSTEM": + sys_turn = turn + continue + acts = get_turn_act(turn) + if "REQUEST_ALTS" in acts: + n_request_alts += 1 + type_result = _check_request_alts_type( + prev_turn, sys_turn, turn, acts + ) + stats[type_result] += 1 + if type_result == "_random_": + print("CASE {}".format(type_result)) + show_turn(0, prev_turn) + show_turn(0, sys_turn) + show_turn(0, turn) + input("press...") + n_all_usr_turn += 1 + prev_turn = turn + + print("REQUEST_ALTS type statistics") + for k, v in stats.items(): + print("{} => {}".format(k, v)) + print( + "request_alts turns: {}, all usr turns: {}, dialogues: {}".format( + n_request_alts, n_all_usr_turn, len(data[split]) + ) + ) + + +def show_utt_by_act(data): + target_act = "OFFER" + for split in DATA_SPLIT: + for dial_id in sorted(data[split].keys()): + dial = data[split][dial_id] + match_flag = False + for turn_id, turn in enumerate(dial["turns"]): + acts = get_turn_act(turn) + if target_act in acts: + match_flag = True + if match_flag: + show_dial(dial_id, dial) + input("press...") + + +def show_state_with_value_change(data): + for split in DATA_SPLIT: + for dial_id in sorted(data[split].keys()): + dial = data[split][dial_id] + intent2slot_values = {} + for turn_id, turn in enumerate(dial["turns"]): + utt, spk = turn["utterance"], turn["speaker"] + if spk != "USER": + prev_system_turn = turn + continue + for frame in turn["frames"]: + state = frame["state"] + active_intent = state["active_intent"] + slot_values = state["slot_values"] + if active_intent in intent2slot_values: + state_diff = compare_slot_values_in_state( + intent2slot_values[active_intent], slot_values + ) + if state_diff: + print( + "Dial: {}, state change: {}".format(dial_id, state_diff) + ) + print( + "==> Prev SYS: {}".format(prev_system_turn["utterance"]) + ) + for sys_frame in prev_system_turn["frames"]: + show_actions(sys_frame["actions"]) 
+ print("==> Curr USR: {}".format(utt)) + show_actions(frame["actions"]) + print( + "recorded state => intent: {}, slot2value: {}".format( + active_intent, + dict2str(intent2slot_values[active_intent]), + ) + ) + print( + "current state => intent: {}, slot2value: {}".format( + active_intent, dict2str(slot_values) + ) + ) + input("press...") + intent2slot_values[ + active_intent + ] = slot_values # overlap with new state, no matter values changed or not + + +def check_state_with_value_change(data, display=False): + for split in DATA_SPLIT: + n_diff = {"NOTIFY_FAILURE": 0, "NEGATE": 0, "REQUEST_ALTS": 0, "RANDOM": 0} + for dial_id in sorted(data[split].keys()): + dial = data[split][dial_id] + intent2slot_values = {} + diff_flag = False + for turn_id, turn in enumerate(dial["turns"]): + if diff_flag: + break + utt, spk = turn["utterance"], turn["speaker"] + if spk != "USER": + prev_system_turn = turn + continue + for frame in turn["frames"]: + state = frame["state"] + active_intent = state["active_intent"] + slot_values = state["slot_values"] + if active_intent in intent2slot_values: + state_diff = compare_slot_values_in_state( + intent2slot_values[active_intent], slot_values + ) + if state_diff: + usr_acts = get_turn_act(turn) + if "NOTIFY_FAILURE" in get_turn_act(prev_system_turn): + if display: + print("FAILURE", dial_id, utt) + n_diff["NOTIFY_FAILURE"] += 1 + elif "NEGATE" in usr_acts: + if display: + print("NEGATE", dial_id, utt) + n_diff["NEGATE"] += 1 + elif "REQUEST_ALTS" in usr_acts: + if display: + print("REQUEST_ALTS", dial_id, utt) + n_diff["REQUEST_ALTS"] += 1 + else: + if display: + print("RANDOM", dial_id, utt) + n_diff["RANDOM"] += 1 + if display: + input("press...") + # n_diff += 1 + diff_flag = True + intent2slot_values[ + active_intent + ] = slot_values # overlap with new state, no matter values changed or not + n = ( + n_diff["NOTIFY_FAILURE"] + + n_diff["NEGATE"] + + n_diff["REQUEST_ALTS"] + + n_diff["RANDOM"] + ) + print( + "{} => total dials: {}, change goal dials: {} (total: {})".format( + split, len(data[split]), dict2str(n_diff), n + ) + ) + + +def stats_after_system(data): + """ + check the possible user behavior right after system offers/notify_failure + """ + n = 0 + stats = { + "SELECT": 0, + "REQUEST_ALTS": 0, + "REQUEST": 0, + "AFFIRM": 0, + "unknown": 0, + } # if system offers + # stats = {"INFORM": 0, "AFFIRM": 0, "NEGATE": 0, "unknown": 0} # if system notify_failure + for split in DATA_SPLIT: + for dial_id in sorted(data[split].keys()): + dial = data[split][dial_id] + for turn_id, turn in enumerate(dial["turns"]): + if turn_id == 0: + prev_turn = turn + continue + if turn["speaker"] == "SYSTEM": + sys_turn = turn + continue + + if "OFFER" in get_turn_act(sys_turn): + # if "OFFER" in get_turn_act(sys_turn) and "NOTIFY_FAILURE" in get_turn_act(sys_turn): + # if "NOTIFY_FAILURE" in get_turn_act(sys_turn): + n += 1 + acts = get_turn_act(turn) + # OFFER + if "SELECT" in acts: + stats["SELECT"] += 1 + elif "REQUEST_ALTS" in acts: + stats["REQUEST_ALTS"] += 1 + elif "REQUEST" in acts: + stats["REQUEST"] += 1 + elif ( + "AFFIRM" in acts + ): # cases fall into here are SYS_ACT: ["OFFER", "NOTIFY_FAILURE"], and USR_ACT: ["AFFIRM"], + # e.g., accept new proposal + show_turn(0, prev_turn) + show_turn(0, sys_turn) + show_turn(0, turn) + input("press...") + stats["AFFIRM"] += 1 + else: + stats["unknown"] += 1 + + # NOTIFY_FAILURE + # if "INFORM" in acts: + # stats["INFORM"] += 1 + # elif "AFFIRM" in acts: + # stats["AFFIRM"] += 1 + # elif "NEGATE" in acts: + # 
stats["NEGATE"] += 1 + # else: + # stats["unknown"] += 1 + + prev_turn = turn + for k, v in stats.items(): + print("{} -> {}".format(k, v)) + print("Total offer turns: {}".format(n)) diff --git a/src/crazyneuraluser/user_model_code/argument.py b/src/crazyneuraluser/user_model_code/argument.py new file mode 100644 index 0000000000000000000000000000000000000000..9137b96aed7a2799aacd77c314625f1eb15b4371 --- /dev/null +++ b/src/crazyneuraluser/user_model_code/argument.py @@ -0,0 +1,153 @@ +import argparse + + +def str2bool(v): + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise argparse.ArgumentTypeError("Boolean value expected.") + + +def verify_args(args): + # datasets + assert isinstance(args.data_list, list) and len(args.data_list) > 0 + for data_name in args.data_list: + assert data_name in ["sgd", "multiwoz"] + + # mode + assert args.mode in ["training", "finetune", "testing", "interact"] + if args.mode == "finetune": + assert args.pre_checkpoint != "" + + +def get_args(): + parser = argparse.ArgumentParser(description="") + # logging + parser.add_argument("--wandb_train_run_name", type=str, default="Default name") + # data + parser.add_argument( + "--data_dir", + type=str, + default="proc_data", + help="Directory of processed datasets", + ) + parser.add_argument( + "--data_list", + type=str, + nargs="+", + default="", + help="Datasets involved, split by space, e.g., `sgd multiwoz`", + ) + + # design control + parser.add_argument( + "--use_ra_flag", + type=str2bool, + default=True, + help="Whether to use `request_alternatives` flag", + ) + + # training + parser.add_argument("--mode", type=str, required=True, help="") + parser.add_argument("--seed", type=int, default=1122) + parser.add_argument( + "--model_name", type=str, required=True, help="Unique name, e.g., job id" + ) + parser.add_argument("--model_name_or_path", type=str, default="gpt2") + parser.add_argument( + "--train_batch_size", type=int, default=4, help="Batch size of training per gpu" + ) + parser.add_argument( + "--eval_batch_size", + type=int, + default=1, + help="Batch size of evaluation per gpu", + ) # TODO: make decoding parallel + parser.add_argument("--gradient_accumulation_steps", type=int, default=4) + parser.add_argument("--learning_rate", type=float, default=6.25e-5) # tune + parser.add_argument("--adam_epsilon", type=float, default=1e-12) + parser.add_argument("--max_grad_norm", type=float, default=1.0) + parser.add_argument("--max_epoch", type=int, default=20) + parser.add_argument( + "--fp16", type=str2bool, default=False, help="Whether to use float16" + ) + parser.add_argument( + "--use_scheduler", + type=str2bool, + default=True, + help="Whether to use lr scheduler", + ) + parser.add_argument("--warmup_steps", type=int, default=0) + parser.add_argument( + "--checkpoint", + type=str, + default="", + required=True, + help="Path of your trained model", + ) + parser.add_argument( + "--pre_checkpoint", + type=str, + default="", + help="Path of the pretrained model used for finetuning", + ) + parser.add_argument( + "--train_size", + type=int, + default=-1, + help="How many examples used for training. -1 means all data", + ) + parser.add_argument( + "--eval_size", + type=int, + default=-1, + help="How many examples used for evaluation. 
-1 means all data", + ) + parser.add_argument( + "--eval_interval", + type=int, + default=1000, + help="During training, how frequent to evaluate the model in terms of training examples", + ) + parser.add_argument( + "--no_improve_max", + type=int, + default=100, + help="The max tolerance for model not improving", + ) + parser.add_argument("--eps", type=float, default=1e-12) + parser.add_argument( + "--disable_display", type=str2bool, default=False, help="display progress bar" + ) + + # decoding + # parser.add_argument('--step', type=int, default=-1) # load model trained at which specific step + parser.add_argument( + "--dec_max_len", type=int, default=2000 + ) # we use early stop to stop generation when hits + parser.add_argument("--num_beams", type=int, default=1) + parser.add_argument("--temperature", type=float, default=1.0) + # parser.add_argument('--top_k', type=int, default=0) + # parser.add_argument('--top_p', type=int, default=0) + parser.add_argument("--decode_file", type=str, default="") + parser.add_argument( + "--eye_browse_output", + type=str2bool, + default=False, + help="Whether to eye browse decoded results", + ) + + # ddp + parser.add_argument( + "--local_rank", + type=int, + default=-1, + help="Local rank for distributed training (-1: not distributed)", + ) + + args = parser.parse_args() + verify_args(args) + print(args) + return args diff --git a/src/crazyneuraluser/user_model_code/dataset.py b/src/crazyneuraluser/user_model_code/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..b9e51f785685afc33eee782344108b7891a25452 --- /dev/null +++ b/src/crazyneuraluser/user_model_code/dataset.py @@ -0,0 +1,297 @@ +import json +import os + +import torch +from tqdm import tqdm + +from crazyneuraluser.user_model_code.utils_sgd import ( + add_str, + get_special_tokens, + wrap_element, +) + + +class SGD_Dataset(torch.utils.data.Dataset): + def __init__(self, args, tokenizer, data_split, generation, data_size): + assert data_split in ["train", "dev", "test", "demo"] + self.args = args + self.data_size = data_size + self.tokenizer = tokenizer + self.data_split = data_split + self.generation = generation + self.n_trimmed = 0 + + self.SPECIAL_TOKENS = get_special_tokens() + self._get_special_token_ids() + + # create examples + self.examples = [] + for data_name in args.data_list: + examples = self._create_examples(data_name, data_split) + self.examples += examples + print("Total ({}) -> {} examples".format(data_split, len(self.examples))) + + def _get_special_token_ids(self): + self.bos_id = self.tokenizer.convert_tokens_to_ids( + self.SPECIAL_TOKENS["bos_token"] + ) + self.eos_id = self.tokenizer.convert_tokens_to_ids( + self.SPECIAL_TOKENS["eos_token"] + ) + self.pad_id = self.tokenizer.convert_tokens_to_ids( + self.SPECIAL_TOKENS["pad_token"] + ) + self.sep_id = self.tokenizer.convert_tokens_to_ids( + self.SPECIAL_TOKENS["sep_token"] + ) + # print('SPECIAL TOKEN MAPPING:') + # print('bos:{} | eos:{} | pad:{} | sep:{}'.format(self.bos_id, self.eos_id, self.pad_id, self.sep_id)) + + self.add_special_token_ids = {} + for token in self.SPECIAL_TOKENS["additional_special_tokens"]: + self.add_special_token_ids[token] = self.tokenizer.convert_tokens_to_ids( + token + ) + + self.true_token, self.false_token = "_True_", "_False_" + assert self.true_token in self.SPECIAL_TOKENS["additional_special_tokens"] + assert self.false_token in self.SPECIAL_TOKENS["additional_special_tokens"] + """ + if using BPE (default method, simply call tokenizer(natural sentence)), no 
need unk_token + if using convert_tokens_to_ids, check which is correct way to handle oov: + a) simply use as unk_token (default setup) or + b) add unk_token into special tokens + """ + + def _create_examples(self, data_name, data_split): + data_file = os.path.join( + self.args.data_dir, data_name, "{}.json".format(data_split) + ) + with open(data_file) as f: + data = json.load(f) + + examples = [] + for dial_id in tqdm(sorted(data.keys())): + if self.data_size != -1 and len(examples) >= self.data_size: + break + dial_meta = data[dial_id] + context = "" + for i in range(100): + example_id = "{}-{}".format(dial_id, i) + self.example_id = example_id + if example_id not in dial_meta: + break + + # testing # + # # SGD + # if data_split == "test" and dial_id not in ["10_00056", "10_00075"]: # seen, movie domain + # if data_split == "test" and dial_id not in ["16_00040"]: # seen + # if data_split == "test" and dial_id not in ["8_00066", "16_00095", "8_00065"]: # unseen + # if data_split == "test" and dial_id not in ["9_00121", "9_00122"]: + # # req_alts cases w/i, w/o inform + # continue + # # mwoz + # if data_split == "test" and dial_id not in ["MUL0071.json"]: + # # test predictions in no offer & no book + # continue + + # turn info + goal = dial_meta[example_id]["goal"] + # service = dial_meta[example_id]["service"] + # intent = dial_meta[example_id]["intent"] + + # utterances + usr_utt = dial_meta[example_id]["utterances"]["usr"] + sys_utt = dial_meta[example_id]["utterances"]["sys"] + + # actions + usr_act = dial_meta[example_id]["actions"]["usr"] + sys_act = dial_meta[example_id]["actions"]["sys"] + + # binary flags + snt = dial_meta[example_id]["start_new_task"] + gc = dial_meta[example_id]["goal_change"] + ra = dial_meta[example_id]["req_alts"] + + # get input ids + ( + input_seq, + input_ids, + label_ids, + valid_example, + ) = self._prepare_input_ids( + goal, context, usr_utt, usr_act, sys_utt, sys_act, snt, gc, ra + ) + + if valid_example: + assert len(input_ids) < 1024 + dial_meta[example_id]["context"] = context + examples.append( + { + "input_ids": input_ids, # list of ids + "label_ids": label_ids, # list of ids + "metadata": dial_meta[example_id], + "example_id": self.example_id, + "data_name": data_name, + } + ) + + # collect context + sys_utt_wrap = wrap_element("SYS", sys_utt) + usr_utt_wrap = wrap_element("USR", usr_utt) + context = add_str(context, sys_utt_wrap) + context = add_str(context, usr_utt_wrap) + + print( + "Data Stat: {} ({}) -> {} examples ({} examples are trimmed)".format( + data_name, self.data_split, len(examples), self.n_trimmed + ) + ) + return examples + + def _prepare_input_ids( + self, goal, context, usr_utt, usr_act, sys_utt, sys_act, snt, gc, ra + ): + """ + prepare input sequence ids to GPT2 + template: + """ + goal_wrap = wrap_element("GOAL", goal) + context_wrap = wrap_element("CTX", context) + usr_utt_wrap = wrap_element("USR_UTT", usr_utt) + usr_act_wrap = wrap_element("USR_ACT", usr_act) + sys_utt_wrap = wrap_element("SYS_UTT", sys_utt) + sys_act_wrap = wrap_element("SYS_ACT", sys_act) + + snt = self.true_token if snt else self.false_token # `Start New Task` flag + snt_wrap = wrap_element("SNT", snt) + gc = self.true_token if gc else self.false_token # `Goal Change` flag + gc_wrap = wrap_element("GC", gc) + ra = self.true_token if ra else self.false_token # `Request Alternatives` flag + ra_wrap = wrap_element("RA", ra) + if self.args.use_ra_flag: + flags_wrap = snt_wrap + " " + ra_wrap + " " + gc_wrap + else: + flags_wrap = snt_wrap + " " + gc_wrap 
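A rough illustration of the linearised training sequence assembled just below; the tag syntax here is a stand-in (the real markup comes from wrap_element in utils_sgd) and the utterance, act and goal strings are invented:

def toy_wrap(tag, text):
    return "<{0}/> {1} </{0}>".format(tag, text)

toy_sequence = " ".join([
    toy_wrap("CTX", "... dialogue history ..."),
    toy_wrap("SYS_UTT", "there are several hotels in the north ."),
    toy_wrap("SYS_ACT", "[hotel] [inform] choice area"),
    toy_wrap("SNT", "_False_"),
    toy_wrap("RA", "_False_"),
    toy_wrap("GC", "_False_"),
    toy_wrap("GOAL", "... current turn goal ..."),
])
# at training time the user act, the user utterance and an eos token are appended;
# at generation time the sequence stops after the goal and the model continues from there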
+ + if not self.generation: # supervised + input_seq = ( + context_wrap + + " " + + sys_utt_wrap + + " " + + sys_act_wrap + + " " + + flags_wrap + + " " + + goal_wrap + + " " + + usr_act_wrap + + " " + + usr_utt_wrap + + " " + + self.SPECIAL_TOKENS["eos_token"] + ) + input_ids = self.tokenizer(input_seq)["input_ids"] # convert to ids + label_ids = self._get_labels(input_ids) + else: # generation + input_seq = ( + context_wrap + + " " + + sys_utt_wrap + + " " + + sys_act_wrap + + " " + + flags_wrap + + " " + + goal_wrap + + " " + + "" + ) # + " " + usr_act_wrap + " " + usr_utt_wrap + input_ids = self.tokenizer(input_seq)["input_ids"] # convert to ids + label_ids = None + + valid_example = True + if len(input_ids) > 1023: + print("{}: {}".format(self.n_trimmed, self.example_id)) + self.n_trimmed += 1 + valid_example = False + + return input_seq, input_ids, label_ids, valid_example + + def _get_labels(self, input_ids): + for special_token in ["", "", ""]: + special_token_id = self.add_special_token_ids[special_token] + assert input_ids.count(special_token_id) == 1 + + label_ids = [-100] * len(input_ids) + + # sys act signal interval + start_position = input_ids.index(self.add_special_token_ids[""]) + end_position = input_ids.index(self.add_special_token_ids[""]) + 1 + label_ids[start_position:end_position] = input_ids[start_position:end_position] + + # usr act and utt singal interval + start_position = input_ids.index(self.add_special_token_ids[""]) + assert self.eos_id == input_ids[-1] + label_ids[start_position:] = input_ids[start_position:] + assert len(label_ids) == len(input_ids) + return label_ids + + def _pad(self, sentences, pad_id): + max_len = max((map(len, sentences))) + attention_mask = [] + sentences_pad = [] + for sent in sentences: + pad_len = max_len - len(sent) + sentences_pad.append(sent + [pad_id] * pad_len) + attention_mask.append([1] * len(sent) + [0] * pad_len) + return sentences_pad, attention_mask + + def __len__(self): # required + return len(self.examples) + + def __getitem__(self, index): # required + """ + index will be ramdomly sampled by the fed sampler, we dont need to worry about index + """ + return self.examples[index] + + def collate_fn(self, batch): # optional but useful + """ + when collate_fn is given to the torch dataloader, we can do further actions to the batch, e.g., + tensor can be formed here a batch is formed as a list where each element is a defined data returned + by __getitem__, andy + """ + input_ids = [example["input_ids"] for example in batch] + input_ids, attention_mask = self._pad(input_ids, self.pad_id) + input_ids, attention_mask = torch.tensor(input_ids).long().to( + self.args.device + ), torch.tensor(attention_mask).long().to(self.args.device) + + if not self.generation: + label_ids = [example["label_ids"] for example in batch] + label_ids, _ = self._pad(label_ids, -100) + label_ids = torch.tensor(label_ids).long().to(self.args.device) + else: + label_ids = None + token_type_ids = None + + # store info for scoring + metadata = [ex["metadata"] for ex in batch] + example_id = [ex["example_id"] for ex in batch] + data_name = [ex["data_name"] for ex in batch] + + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + "label_ids": label_ids, + "metadata": metadata, + "example_id": example_id, + "data_name": data_name, + } + + +if __name__ == "__main__": + pass diff --git a/src/crazyneuraluser/user_model_code/utils_generation.py b/src/crazyneuraluser/user_model_code/utils_generation.py new 
file mode 100644 index 0000000000000000000000000000000000000000..1d509d06931620a2f218bc1f29c7f43f209530f0 --- /dev/null +++ b/src/crazyneuraluser/user_model_code/utils_generation.py @@ -0,0 +1,210 @@ +import torch + +from crazyneuraluser.user_model_code.utils_sgd import add_str, bcolors, wrap_element + + +def find_segment(gen, tag): + assert isinstance(gen, str) + gen = gen.split() + try: + start = gen.index("<{}/>".format(tag)) + 1 + end = gen.index("".format(tag)) + segment = " ".join(gen[start:end]) + except Exception: + print("Missing {} tag in generated sequence".format(tag)) + segment = None + return segment + + +def segment_gen(gen, dial_id): + def _color(_segment): + if tag == "CTX": + _segment = _segment.replace(" ", f"{bcolors.ENDC}") + _segment = _segment.replace(" ", f"{bcolors.ENDC}") + _segment = _segment.replace(" ", f"USR: {bcolors.OKCYAN}") + _segment = _segment.replace(" ", f"SYS: {bcolors.OKBLUE}") + if tag == "SYS_UTT": + _segment = f"{bcolors.OKBLUE}" + _segment + f"{bcolors.ENDC}" + if tag == "USR_UTT": + _segment = f"{bcolors.OKCYAN}" + _segment + f"{bcolors.ENDC}" + if tag in ["SYS_ACT", "USR_ACT", "GOAL"]: + _segment = _segment.replace(" ", f"{bcolors.RED}") + _segment = _segment.replace(" ", f"{bcolors.ENDC}") + _segment = _segment.replace(" ", f"{bcolors.YELLOW}") + _segment = _segment.replace(" ", f"{bcolors.ENDC}") + _segment = _segment.replace(" ", f"{bcolors.GREEN}") + _segment = _segment.replace(" ", f"{bcolors.ENDC}") + if tag == "GOAL": + _segment = _segment.replace( + "", f"{bcolors.UNDERLINE}" + ) + _segment = _segment.replace("", f"{bcolors.ENDC}") + _segment = _segment.replace("", f"{bcolors.UNDERLINE}") + _segment = _segment.replace("", f"{bcolors.ENDC}") + # if tag in ["SNT", "GC"]: + # segment = segment.replace("<{}/> ".format(tag), "<{}/> *".format(tag)) + # segment = segment.replace(" ".format(tag), "* <{}/>".format(tag)) + return _segment + + assert isinstance(gen, str) + print("*** Dial_id: {} ***".format(dial_id)) + for tag in [ + "CTX", + "SYS_UTT", + "SYS_ACT", + "GOAL", + "SNT", + "RA", + "GC", + "USR_ACT", + "USR_UTT", + ]: + segment = find_segment(gen, tag) + if segment is not None: + print('{} -> "{}"'.format(tag, _color(segment))) + else: + print("Fail to find the segment...") + print("GEN:", gen) + print("---" * 30) + input("press any key to continue...") + + +def save_gen(gen, dial_id, container): + output = {"raw_generation": gen} + parsed_generation = {} + + assert isinstance(gen, str) + for tag in [ + "CTX", + "SYS_UTT", + "SYS_ACT", + "GOAL", + "SNT", + "RA", + "GC", + "USR_ACT", + "USR_UTT", + ]: + segment = find_segment(gen, tag) + if segment is not None: + parsed_generation[tag] = segment + else: + print("Fail to parse generation on example {}".format(dial_id)) + parsed_generation[tag] = None + + output["parsed_generation"] = parsed_generation + container[dial_id] = output + + +# def decode(args, batch, model, tokenizer): +# input_ids = batch['input_ids'] +# batch_size, ctx_len = input_ids.size() +# assert batch_size == 1 +# bos_id, eos_id, pad_id, sep_id = tokenizer.convert_tokens_to_ids(['', '', '', '']) +# +# # output size: (B, T) +# output = model.generate(input_ids, max_length=(ctx_len+args.dec_max_len), do_sample=False, +# temperature=args.temperature, use_cache=True, num_beams=args.num_beams, bos_token_id=bos_id, +# eos_token_id=eos_id, pad_token_id=pad_id, early_stopping=True) +# +# gen = tokenizer.decode(output[0]) # include context fed into model +# segment_gen(gen, batch["example_id"][0]) +# return [gen] + + +def 
prepare_input_ids( + args: object, tokenizer: object, data: object, start_token: object +) -> object: + assert start_token in ["", ""] + input_seq = "" + for key in [ + "CTX", + "SYS_UTT", + "SYS_ACT", + "SNT", + "RA", + "GC", + "GOAL", + ]: # fixed order, consistent between training and inference + if key not in data: + continue + wrap = wrap_element(key, data[key]) + input_seq = add_str(input_seq, wrap) + + input_seq = add_str(input_seq, start_token) + + input_ids = tokenizer(input_seq)["input_ids"] # convert to ids + input_ids = torch.tensor([input_ids]).long().to(args.device) + return input_ids + + +def decode_e2e( + args, batch, model, tokenizer, user_goal=None, prev_usr_act=None, collector=None +): + """decode with predicted sys act, goal can be random or from the corpus""" + assert len(batch["metadata"]) == 1 + context = batch["metadata"][0]["context"] + sys_utt = batch["metadata"][0]["utterances"]["sys"] + bos_id, _, pad_id, sep_id = tokenizer.convert_tokens_to_ids( + ["", "", "", ""] + ) + + # first forward pass + data = {"CTX": context, "SYS_UTT": sys_utt} + start_token, end_token = "", "" + input_ids = prepare_input_ids(args, tokenizer, data, start_token) + eos_id = tokenizer.convert_tokens_to_ids(end_token) + output = model.generate( + input_ids, + max_length=args.dec_max_len, + do_sample=False, + temperature=args.temperature, + use_cache=True, + num_beams=args.num_beams, + bos_token_id=bos_id, + eos_token_id=eos_id, + pad_token_id=pad_id, + early_stopping=True, + ) + gen = tokenizer.decode(output[0]) # include context fed into model + + # parse the first pass prediction + for key in ["SYS_ACT", "SNT", "GC", "RA"]: + value = find_segment(gen, key) + data[key] = value + # print("***** First run generation *****") + # print("SYS_ACT -> {}".format(data["SYS_ACT"])) + # print("FLAGS -> SNT: {}, GC: {}, RA: {} *****".format(data["SNT"], data["GC"], data["RA"])) + # print("********************************") + + # prepare goal + if user_goal is None: # use ground truth goal from corpus + data["GOAL"] = batch["metadata"][0]["goal"] + else: + goal = user_goal.prepare_turn_goal( + prev_usr_act, data["SYS_ACT"], data["SNT"], data["GC"], data["RA"] + ) + data["GOAL"] = goal + + # second forward pass + start_token, end_token = "", "" + input_ids = prepare_input_ids(args, tokenizer, data, start_token) + eos_id = tokenizer.convert_tokens_to_ids(end_token) + output = model.generate( + input_ids, + max_length=args.dec_max_len, + do_sample=False, + temperature=args.temperature, + use_cache=True, + num_beams=args.num_beams, + bos_token_id=bos_id, + eos_token_id=eos_id, + pad_token_id=pad_id, + early_stopping=True, + ) + gen = tokenizer.decode(output[0]) # include context fed into model + if args.eye_browse_output: + segment_gen(gen, batch["example_id"][0]) + else: + save_gen(gen, batch["example_id"][0], collector) + return [gen] diff --git a/src/crazyneuraluser/user_model_code/utils_multiwoz.py b/src/crazyneuraluser/user_model_code/utils_multiwoz.py new file mode 100644 index 0000000000000000000000000000000000000000..b476ff76db91ab7d7bbdbb6a61432b1b9aa16a96 --- /dev/null +++ b/src/crazyneuraluser/user_model_code/utils_multiwoz.py @@ -0,0 +1,204 @@ +import json +import re + + +def get_original_act_set(): + # NOTE: + # act `Book` and `NoBook` belong to `Booking` domain by ontology, + # they contain information about either `restaurant` or `hotel` domain + # full act vocab: https://github.com/ConvLab/ConvLab/blob/master/data/multiwoz/ \ + # annotation/Multiwoz%20data%20analysis.md#dialog-act + acts 
= set() + acts.add("Inform") + acts.add("Request") + acts.add( + "NoOffer" + ) # equivalent to the concept of `no matching`, `cannot find` in database + acts.add("Recommend") + acts.add("Select") + acts.add( + "OfferBook" + ) # only for `train` domain, ask if book is needed, equivalent to `Booking-Inform` + # with [[none, none]] args in restaurant/hotel domain + acts.add( + "OfferBooked" + ) # only for `train` domain, inform booking is complete, with corresponding info (such as ref number) + acts.add("Book") # inform booking is successful, equivalent to `OfferBooked` above + acts.add( + "NoBook" + ) # inform booking fails, might because of no availability, usually come together act `request` + acts.add("bye") + acts.add("greet") + acts.add("reqmore") + acts.add("welcome") + acts.add("thank") + return acts + + +def get_act_natural_language(act): + if act in ["bye", "greet", "reqmore", "welcome", "thank"]: + return act + + assert act[0].isupper() + tokens = re.findall("[A-Z][^A-Z]*", act) # e.g., `FindEvents` -> `Find Events` + tokens = list(map(str.lower, tokens)) # lower case, -> `find events` + act_nl = " ".join(tokens) + return act_nl + + +def convert_act_into_sgd(act, SPECIAL_TOKENS): + """ + convert multiwoz acts (w/o domain info) into sgd acts ensure that acts with same concept use one name + e.g., Book (OfferBooked) -> NOTIFY_SUCCESS, NoBook -> NOTIFY_FAILURE + """ + if act == "NoOffer": + act = "NOTIFY_FAILURE" + + elif act == "Recommend": + act = "OFFER" + + # technically, `OfferBook` is equivalent to (`act=OFFER_INTENT, slot=intent, value=ReserveRestaurant`) + # on system side in sgd since (1) the conversion is not trivial (completely different representations) + # and (2) multiwoz has no slot called `intent` + # one cannot simply convert `OfferBook` to `OFFER_INTENT` + # we thus keep the act as is + # note that there is no slot `intent` and value conveying intents in multiwoz + elif act == "OfferBook": + act = "Offer_Book" + + elif act == "OfferBooked": + act = "NOTIFY_SUCCESS" + + elif act == "Book": # same as `OfferBooked` + act = "NOTIFY_SUCCESS" + + elif act == "NoBook": + act = "NOTIFY_FAILURE" + + elif act == "bye": + act = "GOODBYE" + + elif act == "reqmore": + act = "REQ_MORE" + + elif act == "thank": + act = "THANK_YOU" + # elif act == "greet": + # elif act == "welcome": + act = act.upper() # align with sgd acts, e.g., `Inform` -> `INFORM` + + # check if valid + assert "_{}_".format(act) in SPECIAL_TOKENS["additional_special_tokens"] + return act + + +def load_schema(schema_file): + def _update(key, value, mapping): + if key in mapping: + assert ( + value == mapping[key] + ) # ensure service meta is the same between data splits + else: + mapping[key] = value + + def _restructure_service_meta(service_meta, attribute): + """convert slot/intent metadata list into dict(slot/intent=metadata)""" + assert attribute in ["slots", "intents"] + mapping = {} + for value in service_meta[attribute]: + key = value["name"] + if attribute == "slots": # domain-slot in multiwoz + assert "-" in key + _, key = key.split("-") # domain, slot + key = normalise_slot(key) + else: # intent + key = normalise_intent(key) + mapping[key] = value + service_meta[attribute] = mapping + + with open(schema_file) as f: + data = json.load(f) + + SERVICE2META = {} + SLOTS, INTENTS = set(), set() + for service_meta in data: + service = service_meta["service_name"] + _restructure_service_meta(service_meta, "slots") + _restructure_service_meta(service_meta, "intents") + _update(service, service_meta, 
SERVICE2META) + + # collect domain-independent slots + for slot in service_meta["slots"]: + SLOTS.add(slot) + + for intent in service_meta["intents"]: + INTENTS.add(intent) + + print("Load schema, intents: {}, slots: {}".format(len(INTENTS), len(SLOTS))) + return SERVICE2META, INTENTS, SLOTS + + +def normalise_intent(intent): + """convert intent into natural language, e.g., find_hotel -> find hotel""" + if intent == "police": + intent = "find_police" + if intent == "book_taxi": + intent = "find_taxi" + assert "_" in intent + return " ".join(intent.split("_")) + + +def normalise_slot(slot): + if slot == "pricerange": + return "price range" + + elif slot == "bookday": + return "book day" + + elif slot == "bookpeople": + return "book people" + + elif slot == "booktime": + return "book time" + + elif slot == "bookstay": + return "book stay" + + elif slot == "ref": + return "reference" + + elif slot == "arriveby": + return "arrive by" + + elif slot == "leaveat": + return "leave at" + + elif slot == "trainid": + return "train id" + + elif slot == "openhours": + return "open hours" + + elif slot == "entrancefee": + return "entrance fee" + + elif slot in ["none", "?"]: + return "Empty" + + else: + return slot + + +def normalise_value(value): + # deal with binary and empty values + if value == "yes": + return "True" + + elif value == "no": + return "False" + + elif value in ["none", "?"]: + return "Empty" + + else: + return value diff --git a/src/crazyneuraluser/user_model_code/utils_sgd.py b/src/crazyneuraluser/user_model_code/utils_sgd.py new file mode 100644 index 0000000000000000000000000000000000000000..a3d3d2a40ed5593d6961dd880c025cb268de50bb --- /dev/null +++ b/src/crazyneuraluser/user_model_code/utils_sgd.py @@ -0,0 +1,296 @@ +import json +import re + +""" This file contains utility functions for SGD """ + + +class bcolors: + HEADER = "\033[95m" + OKBLUE = "\033[94m" + OKCYAN = "\033[96m" + GREEN = "\033[92m" + YELLOW = "\033[93m" + RED = "\033[91m" + ENDC = "\033[0m" + BOLD = "\033[1m" + UNDERLINE = "\033[4m" + + +def wrap_element(content_type, content): + """wrap elements such as slot, value, e.g., slot """ + assert "/" not in content_type + return "<{}/> {} ".format(content_type, content, content_type) + + +def add_str(str1, str2): + return str1 + " " + str2 + + +def list2str(x): + x = sorted(x) + x = ",".join(x) + return "[" + x + "]" + + +def dict2str(x, colored=False): + output = [] + for key, value in x.items(): + if isinstance(value, list): + value = list2str(value) + if colored: + output.append( + f"{bcolors.YELLOW}{key}{bcolors.ENDC}={bcolors.GREEN}{value}{bcolors.ENDC}" + ) + else: + output.append("{}={}".format(key, value)) + return list2str(output) + + +def dict2list(x): + output = [] + for key, value in x.items(): + assert isinstance(key, str) + + if isinstance(value, str): + output.append("{}={}".format(key, value)) + + elif isinstance( + value, list + ): # only 1.8 turns, usually on system side, have multiple values for a slot + # if len(value) > 2: print("************************HIT************************") + for x in value: + assert isinstance(x, str) + output.append("{}={}".format(key, x)) + return sorted(output) + + +def compare_slot_values_in_state(slot_values1, slot_values2): + """return True if value in any intersection slot is different between two states""" + for slot in ( + slot_values1.keys() & slot_values2.keys() + ): # check slots in intersection to see if their value changed + values1 = slot_values1[slot] + values2 = slot_values2[slot] + if ( + 
len(set(values1) & set(values2)) == 0 + ): # none of values matched between two value lists + return True + return False + + +def get_turn_intent(turn): + """return turn's intent and service""" + frames = turn["frames"] + if len(frames) == 1: + return frames[0]["state"]["active_intent"], frames[0]["service"] + else: + assert len(frames) == 2 + for frame in frames: + for action in frame["actions"]: + if action["act"] == "INFORM_INTENT": + return frame["state"]["active_intent"], frame["service"] + + +def get_turn_act(turn): + """return an `act` list of a turn""" + acts = [] + for frame in turn["frames"]: + for actions in frame["actions"]: + acts.append(actions["act"]) + return acts + + +def get_categorical_slot_span_info(slots): + """ + Inputs: list of dict in `slots` field annotation + """ + slot2info = {} + for slot in slots: + slot_name = slot["slot"] + slot2info[slot_name] = slot + return slot2info + + +def show_turn_meta(turn_meta): + print( + f"intent: {bcolors.RED}{turn_meta.usr_intent}{bcolors.ENDC}, start_new_task: \ + {bcolors.OKBLUE}{turn_meta.start_new_task}{bcolors.ENDC}, goal_change: \ + {bcolors.OKCYAN}{turn_meta.goal_change}{bcolors.ENDC}" + ) + show_constraints(turn_meta.usr_constraints) + print("#####" * 10) + show_linear_data(turn_meta.linear_goal, tag="Goal") + show_linear_data(turn_meta.linear_act["sys"], tag="SYS act") + show_linear_data(turn_meta.linear_act["usr"], tag="USR act") + print("#####" * 10) + print("") + + +def show_constraints(usr_constraints): + info, req = usr_constraints["informable"], usr_constraints["requestable"] + info = dict2str(info, colored=True) + print("informable: {}, requestable: ".format(info), end="") + for x in req: + print(f"{bcolors.YELLOW}{x}{bcolors.ENDC}", end=" ") + print("") + + +def show_linear_data(data, tag): + print("{}: |{}|".format(tag, data)) + + +def load_schema(data_path): + """load schema and return (1) dict {service: service content}, (2) set of intents, and (3) set of slots""" + + def _update(key, value, mapping): + if key in mapping: + assert ( + value == mapping[key] + ) # ensure service meta is the same between data splits + else: + mapping[key] = value + + def _restructure_service_meta(service_meta, attribute): + """convert slot/intent list into dict(name=meta)""" + assert attribute in ["slots", "intents"] + mapping = {} + for value in service_meta[attribute]: + key = value["name"] + mapping[key] = value + service_meta[attribute] = mapping + + SERVICE2META = {} + SLOTS, INTENTS = set(), set() + for split in ["train", "dev", "test"]: + with open("{}/{}/schema.json".format(data_path, split)) as f: + data = json.load(f) + + for service_meta in data: + service = service_meta["service_name"] + _restructure_service_meta(service_meta, "slots") + _restructure_service_meta(service_meta, "intents") + _update(service, service_meta, SERVICE2META) + + for slot in service_meta["slots"]: + SLOTS.add(slot) + for intent in service_meta["intents"]: + INTENTS.add(intent) + # NOTE: the slot/intent existing in different services have different meta data, e.g., FindBus, event_name + print("Load schema, intents: {}, slots: {}".format(len(INTENTS), len(SLOTS))) + return SERVICE2META, INTENTS, SLOTS + + +def get_special_tokens(): + """get pre-defined special tokens""" + SPECIAL_TOKENS = { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "sep_token": "", + "additional_special_tokens": [], + } + + # ctx + SPECIAL_TOKENS["additional_special_tokens"] += ["", "", ""] + SPECIAL_TOKENS["additional_special_tokens"] += ["", "", ""] + + # 
current turn utterance + SPECIAL_TOKENS["additional_special_tokens"] += ["", ""] + SPECIAL_TOKENS["additional_special_tokens"] += ["", ""] + + # current turn action + SPECIAL_TOKENS["additional_special_tokens"] += ["", ""] + SPECIAL_TOKENS["additional_special_tokens"] += ["", ""] + + # elements segment + SPECIAL_TOKENS["additional_special_tokens"] += ["", "", ""] + SPECIAL_TOKENS["additional_special_tokens"] += ["", "", ""] + + # goal (`task` is `intent` in SGD) + SPECIAL_TOKENS["additional_special_tokens"] += [ + "", + "", + "", + "", + "", + "", + ] + SPECIAL_TOKENS["additional_special_tokens"] += [ + "", + "", + "", + "", + "", + "", + ] + + # sgd act + SPECIAL_TOKENS["additional_special_tokens"] += [ + "_INFORM_", + "_REQUEST_", + "_CONFIRM_", + "_OFFER_", + "_NOTIFY_SUCCESS_", + "_NOTIFY_FAILURE_", + "_INFORM_COUNT_", + "_OFFER_INTENT_", + "_REQ_MORE_", + "_GOODBYE_", + "_INFORM_INTENT_", + "_NEGATE_INTENT_", + "_AFFIRM_INTENT_", + "_AFFIRM_", + "_NEGATE_", + "_SELECT_", + "_REQUEST_ALTS_", + "_THANK_YOU_", + ] + + # multiwoz act, distinct from sgd + # SPECIAL_TOKENS["additional_special_tokens"] += ["_RECOMMEND_", "_OFFER_BOOK_", "_GREET_", "_WELCOME_"] + SPECIAL_TOKENS["additional_special_tokens"] += [ + "_OFFER_BOOK_", + "_GREET_", + "_WELCOME_", + ] + + # intent + SPECIAL_TOKENS["additional_special_tokens"] += [""] + SPECIAL_TOKENS["additional_special_tokens"] += [""] + + # special values: True, False, Empty + SPECIAL_TOKENS["additional_special_tokens"] += ["_True_", "_False_", "_Empty_"] + # NOTE: slots, values and a special value "dontcare" are all presented in natural language + + # special flags + # SNT: Start New Task + # GC: Goal Change + # RA: Request Alternative + SPECIAL_TOKENS["additional_special_tokens"] += ["", "", ""] + SPECIAL_TOKENS["additional_special_tokens"] += ["", "", ""] + + print( + "Load special tokens: {}".format( + len(SPECIAL_TOKENS["additional_special_tokens"]) + 4 + ) + ) + return SPECIAL_TOKENS + + +def split_intent(intent): + """ + convert intent special token into a natural language (nl) form + e.g., `FindEvents` -> `find events` + """ + assert intent[0].isupper() + tokens = re.findall("[A-Z][^A-Z]*", intent) # e.g., `FindEvents` -> `Find Events` + tokens = list(map(str.lower, tokens)) # lower case, -> `find events` + intent_nl = " ".join(tokens) + return intent_nl + + +def conv_special_token(token, SPECIAL_TOKENS): + assert token[0] != "_" and token[-1] != "_" + token = "_{}_".format(token) + assert token in SPECIAL_TOKENS["additional_special_tokens"] + return token
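
A minimal sketch of the label-masking scheme implemented by `_get_labels` in the dataset code above: every position is set to -100 (ignored by the cross-entropy loss) except the system-act span and the tail starting at the user-act delimiter, i.e. the only parts the model is trained to generate. The delimiter ids below are toy values, not the project's real special tokens.

# Hedged sketch of the _get_labels masking idea; delimiter ids are toy values.
def mask_labels(input_ids, act_open_id, act_close_id, usr_open_id):
    """Return labels that are -100 everywhere except the system-act span
    and the tail that begins at the user-act delimiter."""
    labels = [-100] * len(input_ids)

    # Keep the system-act span (both delimiters included).
    start = input_ids.index(act_open_id)
    end = input_ids.index(act_close_id) + 1
    labels[start:end] = input_ids[start:end]

    # Keep everything from the user-act delimiter to the end of the sequence.
    start = input_ids.index(usr_open_id)
    labels[start:] = input_ids[start:]
    return labels

# Toy example: 7/8 delimit the system act, 9 opens the user act.
assert mask_labels([1, 2, 7, 3, 8, 4, 9, 5, 6], 7, 8, 9) == [
    -100, -100, 7, 3, 8, -100, 9, 5, 6
]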
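
`_pad` and `collate_fn` above implement dynamic per-batch padding: each batch is padded to its own longest sequence, an attention mask marks the real tokens, and only then is everything turned into tensors on the target device. A framework-free sketch of that padding step:

# Hedged sketch of the per-batch padding in _pad/collate_fn; pad_id is whatever
# id the tokenizer assigns to its padding token.
def pad_batch(sequences, pad_id):
    max_len = max(len(seq) for seq in sequences)
    padded, attention_mask = [], []
    for seq in sequences:
        n_pad = max_len - len(seq)
        padded.append(seq + [pad_id] * n_pad)
        attention_mask.append([1] * len(seq) + [0] * n_pad)
    return padded, attention_mask

# pad_batch([[5, 6, 7], [5]], pad_id=0)
# -> ([[5, 6, 7], [5, 0, 0]], [[1, 1, 1], [1, 0, 0]])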
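
`find_segment` in utils_generation.py recovers one tagged segment (CTX, SYS_ACT, GOAL, USR_UTT, ...) from the decoded string. Only the `<TAG/>` opening marker is visible in the hunk above; the `</TAG>`-style closing marker used below is an assumption made purely for illustration.

# Hedged sketch of find_segment(): extract the tokens between an opening and a
# closing marker from a whitespace-tokenised generation. The closing-marker
# format is assumed, not taken from the diff above.
def find_segment_sketch(gen: str, tag: str):
    tokens = gen.split()
    try:
        start = tokens.index("<{}/>".format(tag)) + 1
        end = tokens.index("</{}>".format(tag))
        return " ".join(tokens[start:end])
    except ValueError:
        return None

# find_segment_sketch("<SYS_UTT/> how can i help ? </SYS_UTT>", "SYS_UTT")
# -> "how can i help ?"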
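
`decode_e2e` above calls the model twice per turn: the first pass predicts the system act and the SNT/GC/RA flags from the context and system utterance, the goal (from the corpus or from the goal generator) is then attached, and the second pass predicts the user act and utterance. The sketch below condenses that control flow; `generate_until` is a hypothetical stand-in for the `prepare_input_ids` + `model.generate` + `tokenizer.decode` plumbing, whose exact special tokens are not shown here.

# Hedged sketch of the two-pass decoding in decode_e2e().
from crazyneuraluser.user_model_code.utils_generation import find_segment


def generate_until(model, tokenizer, data, end_tag):
    """Hypothetical helper: wrap `data` into the tagged input sequence, run
    model.generate() with the end tag's id as eos_token_id, and decode."""
    raise NotImplementedError  # placeholder for the generation plumbing


def two_pass_decode(model, tokenizer, context, sys_utt, corpus_goal,
                    user_goal=None, prev_usr_act=None):
    # Pass 1: predict the system act and the SNT / GC / RA flags.
    data = {"CTX": context, "SYS_UTT": sys_utt}
    first_gen = generate_until(model, tokenizer, data, end_tag="SYS_ACT")
    for key in ("SYS_ACT", "SNT", "GC", "RA"):
        data[key] = find_segment(first_gen, key)

    # Attach a goal: ground truth from the corpus, or one built on the fly.
    if user_goal is None:
        data["GOAL"] = corpus_goal
    else:
        data["GOAL"] = user_goal.prepare_turn_goal(
            prev_usr_act, data["SYS_ACT"], data["SNT"], data["GC"], data["RA"]
        )

    # Pass 2: predict the user act and utterance conditioned on all of the above.
    return generate_until(model, tokenizer, data, end_tag="USR_UTT")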
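
`convert_act_into_sgd` above canonicalises MultiWOZ dialogue acts onto the SGD act names so that acts with the same meaning share one label; anything without a special case is simply upper-cased. The lookup below restates the branches visible above, for illustration only.

# Restatement of the MultiWOZ -> SGD act mapping above; unmapped acts are
# upper-cased unchanged (e.g. "Inform" -> "INFORM", "greet" -> "GREET").
MWOZ_TO_SGD_ACT = {
    "NoOffer": "NOTIFY_FAILURE",
    "Recommend": "OFFER",
    "OfferBook": "OFFER_BOOK",      # kept as its own act rather than OFFER_INTENT
    "OfferBooked": "NOTIFY_SUCCESS",
    "Book": "NOTIFY_SUCCESS",       # same meaning as OfferBooked
    "NoBook": "NOTIFY_FAILURE",
    "bye": "GOODBYE",
    "reqmore": "REQ_MORE",
    "thank": "THANK_YOU",
}


def to_sgd_act(act: str) -> str:
    return MWOZ_TO_SGD_ACT.get(act, act.upper())

# to_sgd_act("Book") -> "NOTIFY_SUCCESS"; to_sgd_act("Request") -> "REQUEST"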
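
The dictionary returned by `get_special_tokens` above follows the structure Hugging Face tokenizers expect (`bos_token`, `eos_token`, `pad_token`, `sep_token`, `additional_special_tokens`). How it is consumed is not part of this diff; the registration step below is therefore an assumption, shown with the stock "gpt2" checkpoint purely for illustration.

# Hedged sketch: registering the special tokens with a tokenizer and model.
# This consumption step is assumed, not shown in the diff above.
from transformers import GPT2LMHeadModel, GPT2Tokenizer

from crazyneuraluser.user_model_code.utils_sgd import get_special_tokens

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

special_tokens = get_special_tokens()
tokenizer.add_special_tokens(special_tokens)   # registers bos/eos/pad/sep + extras
model.resize_token_embeddings(len(tokenizer))  # grow embeddings to the new vocab size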
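
`get_act_natural_language` and `split_intent` above rely on the same regex trick to turn CamelCase names into lower-cased words; a quick worked example:

import re

# Each regex match is one capitalised chunk: "FindEvents" -> ["Find", "Events"].
tokens = re.findall("[A-Z][^A-Z]*", "FindEvents")
assert tokens == ["Find", "Events"]
assert " ".join(t.lower() for t in tokens) == "find events"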