# Provenance (HuggingFace file-viewer metadata, pasted in on upload):
#   uploader: svjack — "Upload . with huggingface_hub"
#   commit:   b5dbcf3
#   raw · history · blame · 1.61 kB
import os
def vocab_process(data_dir):
    """Build intent and slot label vocabulary files from the training split.

    Reads from ``data_dir/train``:
      * ``label``   — one intent label per line.
      * ``seq.out`` — whitespace-separated slot tags per line (BIO-style).

    Writes into ``data_dir``:
      * ``intent_label.txt`` — "UNK" followed by the sorted unique intents.
      * ``slot_label.txt``   — "PAD", "UNK", then the slot tags sorted by
        (tag body, prefix) so B-/I- variants of the same slot are adjacent.

    Args:
        data_dir: Dataset root directory containing a ``train`` subfolder.
    """
    slot_label_vocab = 'slot_label.txt'
    intent_label_vocab = 'intent_label.txt'
    train_dir = os.path.join(data_dir, 'train')

    # ---- intent labels ----
    intent_vocab = set()
    with open(os.path.join(train_dir, 'label'), 'r', encoding='utf-8') as f_r:
        for line in f_r:
            intent_vocab.add(line.strip())
    # A trailing blank line would otherwise become an empty intent label,
    # and a literal "UNK" in the data would be written twice — duplicate
    # lines break label-to-index mapping downstream.
    intent_vocab.discard("")
    intent_vocab.discard("UNK")
    _write_vocab(os.path.join(data_dir, intent_label_vocab),
                 ["UNK"], sorted(intent_vocab))

    # ---- slot labels ----
    slot_vocab = set()
    with open(os.path.join(train_dir, 'seq.out'), 'r', encoding='utf-8') as f_r:
        for line in f_r:
            slot_vocab.update(line.strip().split())
    for special in ("PAD", "UNK"):
        slot_vocab.discard(special)
    # Sort by (tag body, BIO prefix): e.g. B-city and I-city stay adjacent.
    slot_labels = sorted(slot_vocab, key=lambda x: (x[2:], x[:2]))
    _write_vocab(os.path.join(data_dir, slot_label_vocab),
                 ["PAD", "UNK"], slot_labels)


def _write_vocab(path, special_tokens, labels):
    """Write special tokens followed by labels to *path*, one per line."""
    with open(path, 'w', encoding='utf-8') as f_w:
        for token in special_tokens:
            f_w.write(token + '\n')
        for label in labels:
            f_w.write(label + '\n')
if __name__ == "__main__":
    # Build label vocabularies for both benchmark corpora.
    for corpus in ("atis", "snips"):
        vocab_process(corpus)