import json

from datasets import Dataset, DatasetDict
from transformers import (
    BertForTokenClassification,
    BertTokenizerFast,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)


# Training preprocessing that handles multi-token slot values (e.g. amounts
# split into several wordpieces).
def preprocess_data(json_path, tokenizer):
    with open(json_path, "r") as f:
        data = json.load(f)["data"]

    tokenized_data = {"input_ids": [], "attention_mask": [], "labels": []}
    slot_label_map = {"O": 0}
    label_id = 1

    for intent_data in data:
        for utterance in intent_data["utterances"]:
            text = utterance["text"]
            encoding = tokenizer(
                text,
                truncation=True,
                padding="max_length",
                max_length=128,
                return_offsets_mapping=True,
            )
            tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])

            # Start with "O" everywhere, then mark a B-/I- span wherever the
            # tokenized slot value matches a run of utterance tokens.
            slot_labels = ["O"] * len(tokens)
            for slot, value in utterance["slots"].items():
                if value == "not specified":  # skip unspecified slots
                    continue
                slot_tokens = tokenizer.tokenize(value)
                for i in range(len(tokens) - len(slot_tokens) + 1):
                    if tokens[i : i + len(slot_tokens)] == slot_tokens:
                        slot_labels[i] = f"B-{slot}"
                        for j in range(1, len(slot_tokens)):
                            slot_labels[i + j] = f"I-{slot}"

            # Map slot labels to IDs, growing the map as new labels appear.
            for label in slot_labels:
                if label not in slot_label_map:
                    slot_label_map[label] = label_id
                    label_id += 1

            # Mask special and padding tokens with -100 so the loss ignores
            # them; fast tokenizers report a (0, 0) offset at those positions.
            offsets = encoding["offset_mapping"]
            label_ids = [
                -100 if offsets[idx] == (0, 0) else slot_label_map[label]
                for idx, label in enumerate(slot_labels)
            ]

            tokenized_data["input_ids"].append(encoding["input_ids"])
            tokenized_data["attention_mask"].append(encoding["attention_mask"])
            tokenized_data["labels"].append(label_ids)

    print("Slot Label Map:", slot_label_map)
    dataset = Dataset.from_dict(tokenized_data)
    # NOTE: train and validation share the same examples here; substitute a
    # real split before trusting the evaluation numbers.
    return DatasetDict({"train": dataset, "validation": dataset}), slot_label_map


tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
json_path = "nlu_dataset.json"
dataset, slot_label_map = preprocess_data(json_path, tokenizer)

model = BertForTokenClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=len(slot_label_map),
)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=100,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
)
trainer.train()
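
# --- Post-training: save artifacts and run a quick inference check ---
# A minimal sketch, not part of the original script. The "./slot_model" path
# and the sample utterance are illustrative assumptions, and the decoding is
# a plain per-token argmax with no BIO consistency repair.
import torch

trainer.save_model("./slot_model")          # saves model weights and config
tokenizer.save_pretrained("./slot_model")
with open("./slot_model/slot_label_map.json", "w") as f:
    json.dump(slot_label_map, f)            # needed to decode IDs later

id2label = {v: k for k, v in slot_label_map.items()}

model.eval()
sample_text = "Send 500 dollars to Alice"   # hypothetical utterance
inputs = tokenizer(sample_text, return_tensors="pt", truncation=True, max_length=128)
inputs = {k: v.to(model.device) for k, v in inputs.items()}
with torch.no_grad():
    logits = model(**inputs).logits          # (1, seq_len, num_labels)
pred_ids = logits.argmax(dim=-1)[0].tolist()

# Print each real token with its predicted slot label, skipping specials.
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
for token, pred in zip(tokens, pred_ids):
    if token in (tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token):
        continue
    print(f"{token}\t{id2label.get(pred, 'O')}")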