File size: 1,065 Bytes
80d096b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
import json
import numpy as np
def save_json_dict(dict_info, file_name):
    """Serialize ``dict_info`` to JSON and write it to ``file_name``.

    Args:
        dict_info: A JSON-serializable object (typically a dict).
        file_name: Path of the file to create or overwrite.

    Raises:
        TypeError: If ``dict_info`` contains values ``json.dumps`` cannot
            serialize.
        OSError: If ``file_name`` cannot be opened for writing.
    """
    # Serialize before opening the file so that a serialization failure does
    # not truncate an existing file.
    serialized = json.dumps(dict_info)
    # The context manager guarantees the handle is flushed and closed, instead
    # of relying on garbage collection of an anonymous file object.
    with open(file_name, "w") as out_file:
        out_file.write(serialized)
def read_file(path):
    """Build a vocabulary and an embedding matrix from a vector text file.

    Each non-blank line of ``path`` is split on whitespace: the first field
    is the token and the remaining fields are parsed as floats forming that
    token's vector. Tokens receive consecutive integer IDs in order of first
    appearance, followed by the special entries ``"UNK"`` and
    ``"***PADDING***"``.

    Side effects:
        * writes the token -> ID mapping to ``word_to_id.json``;
        * prints the vocabulary size;
        * saves the embedding matrix to ``freq_embeds.npy``.

    Rows with no vector in the file (normally the two special entries) keep
    their random initialisation, drawn uniformly from
    ``[-sqrt(0.06), sqrt(0.06))``.

    Args:
        path: Path to the whitespace-separated vector file.

    Raises:
        ValueError: If a vector component is not a valid float, or if the
            vectors in the file do not all have the same length.
        OSError: If ``path`` cannot be read or an output file cannot be
            written.
    """
    all_freq_embed = {}
    word_to_id = {}

    # Close the input deterministically instead of leaking the handle.
    with open(path) as vector_file:
        for line in vector_file:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no token.
                continue
            word = fields[0]
            # Assign IDs densely so every ID is a valid row index even when a
            # token is repeated in the file.
            if word not in word_to_id:
                word_to_id[word] = len(word_to_id)
            all_freq_embed[word] = np.array(
                [float(value) for value in fields[1:]]
            )

    # Append the special entries unless the file already defines them, which
    # keeps the ID range contiguous.
    for special in ("UNK", "***PADDING***"):
        if special not in word_to_id:
            word_to_id[special] = len(word_to_id)

    save_json_dict(word_to_id, "word_to_id.json")
    print(len(word_to_id))

    # Take the embedding width from the first parsed vector; fall back to the
    # historical width of 102 when the file contained no vectors.
    embed_dim = next((len(vec) for vec in all_freq_embed.values()), 102)
    bound = np.sqrt(0.06)
    freq_embeds = np.random.uniform(
        -bound, bound, (len(word_to_id), embed_dim)
    )

    for word, row in word_to_id.items():
        if word in all_freq_embed:
            freq_embeds[row] = all_freq_embed[word]
        elif word.lower() in all_freq_embed:
            # Fall back to the lower-cased form (relevant for the special
            # entries when the file only has a lower-case variant).
            freq_embeds[row] = all_freq_embed[word.lower()]

    np.save('freq_embeds.npy', freq_embeds)
if __name__ == '__main__':
    # Build the vocabulary and embedding matrix from the vector file in the
    # current working directory.
    path = "Freq_Vector.txt"
    read_file(path)

    # Reload the vocabulary that read_file just wrote as a quick sanity check
    # of the saved mapping; the context manager closes the handle promptly.
    with open("word_to_id.json", "r") as vocab_file:
        word_to_id = json.load(vocab_file)
    print(len(word_to_id))
|