|
"""
Extend the given vocabulary using dataset-specific words.

1. First create a vocabulary for the specific dataset.
2. Find all words not in our vocabulary, but in the dataset vocabulary.
3. Take top X (default=1000) of these words and add them to the vocabulary.
4. Save this combined vocabulary and embedding matrix, which can now be used.
"""
|
|
|
from __future__ import print_function, unicode_literals |
|
import example_helper |
|
import json |
|
from torchmoji.create_vocab import extend_vocab, VocabBuilder |
|
from torchmoji.word_generator import WordGenerator |
|
|
|
# Toy dataset: 'newword' appears twice, '#zzzzaaazzz' only once.
new_words = ['#zzzzaaazzz', 'newword', 'newword']

# Tally word frequencies over the toy dataset.
word_gen = WordGenerator(new_words)
vb = VocabBuilder(word_gen)
vb.count_all_words()

# Load the existing vocabulary; the path is relative to the current
# working directory, so run this script from its own folder.
with open('../model/vocabulary.json') as vocab_file:
    vocab = json.load(vocab_file)

# Report the vocabulary size and the dataset word counts before extending.
print(len(vocab))
print(vb.word_counts)

# Extend the loaded vocabulary in place, allowing at most one new token.
# NOTE(review): presumably the most frequent unseen word ('newword') is the
# one that gets added -- confirm against extend_vocab.
extend_vocab(vocab, vb, max_tokens=1)

# Look up 'newword' in the extended vocabulary, then report the new size.
print(vocab['newword'])
print(len(vocab))
|
|