|
from transformers import BertTokenizerFast, BertConfig |
|
from typing import Dict, List, Union, Tuple |
|
|
|
|
|
def num_unique_labels(dataset: Dict[str, Union[str, List[str]]]) -> Tuple[int, int]: |
|
""" |
|
Calculate the number of NER labels and INTENT labels in the dataset. |
|
|
|
Args: |
|
dataset (dict): A dictionary containing 'text', 'entities' and 'intent' keys. |
|
|
|
Returns: |
|
Tuple: Number of unique NER and INTENT lables. |
|
""" |
|
one_dimensional_ner = [tag for subset in dataset['entities'] for tag in subset] |
|
return len(set(one_dimensional_ner)), len(set(dataset['intent'])) |
|
|
|
def ner_labels_to_ids() -> Dict[str, int]: |
|
""" |
|
Map NER labels to corresponding numeric IDs. |
|
|
|
Returns: |
|
Dict[str, int]: A dictionary where keys are NER labels, and values are their corresponding IDs. |
|
""" |
|
labels_to_ids_ner = { |
|
'O': 0, |
|
'B-DATE': 1, |
|
'I-DATE': 2, |
|
'B-TIME': 3, |
|
'I-TIME': 4, |
|
'B-TASK': 5, |
|
'I-TASK': 6, |
|
'B-DUR': 7, |
|
'I-DUR': 8 |
|
} |
|
return labels_to_ids_ner |
|
|
|
def ner_ids_to_labels(ner_labels_to_ids) -> Dict[int, str]: |
|
""" |
|
Map numeric IDs to corresponding NER labels. |
|
|
|
Returns: |
|
Dict[int, str]: A dictionary where keys are numeric IDs, and values are their corresponding NER labels. |
|
""" |
|
ner_ids_to_labels = {v: k for k, v in ner_labels_to_ids.items()} |
|
return ner_ids_to_labels |
|
|
|
def intent_labels_to_ids() -> Dict[str, int]: |
|
""" |
|
Map intent labels to corresponding numeric values. |
|
|
|
Returns: |
|
Dict[str, int]: A dictionary where keys are intent labels, and values are their corresponding numeric IDs. |
|
""" |
|
intent_labels_to_ids = { |
|
"'Schedule Appointment'": 0, |
|
"'Schedule Meeting'": 1, |
|
"'Set Alarm'": 2, |
|
"'Set Reminder'": 3, |
|
"'Set Timer'": 4 |
|
} |
|
return intent_labels_to_ids |
|
|
|
def intent_ids_to_labels(intent_labels_to_ids) -> Dict[int, str]: |
|
""" |
|
Map numeric values to corresponding intent labels. |
|
|
|
Returns: |
|
Dict[int, str]: A dictionary where keys are numeric IDs, and values are their corresponding intent labels. |
|
""" |
|
intent_ids_to_labels = {v: k for k, v in intent_labels_to_ids.items()} |
|
return intent_ids_to_labels |
|
|
|
def tokenizer() -> BertTokenizerFast: |
|
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased') |
|
return tokenizer |
|
|
|
def bert_config() -> BertConfig: |
|
config = BertConfig.from_pretrained('bert-base-uncased') |
|
return config |
|
|
|
def structure_data(dataset): |
|
structured_data = {'text': [], 'entities': [], 'intent': []} |
|
for sample in dataset: |
|
structured_data['text'].append(sample['text']) |
|
structured_data['entities'].append(sample['entities'].split()) |
|
structured_data['intent'].append(sample['intent']) |
|
return structured_data |