File size: 1,065 Bytes
80d096b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import json

import numpy as np


def save_json_dict(dict_info, file_name):
	"""Serialize ``dict_info`` to JSON and write it to ``file_name``.

	Args:
		dict_info: Any object ``json.dumps`` can serialize (typically a dict).
		file_name: Path of the file to create or overwrite.

	Raises:
		TypeError: If ``dict_info`` contains values ``json.dumps`` cannot encode.
		OSError: If ``file_name`` cannot be opened for writing.
	"""
	# Serialize before opening so a serialization error does not leave a
	# truncated, empty file behind.
	s = json.dumps(dict_info)
	# The context manager guarantees the data is flushed and the handle closed,
	# instead of relying on garbage collection of an anonymous file object.
	with open(file_name, "w") as f:
		f.write(s)


def read_file(path, embed_dim=102, vocab_path="word_to_id.json", embeds_path="freq_embeds.npy"):
	"""Build a vocabulary and an embedding matrix from a whitespace-separated vector file.

	Each non-blank line of ``path`` is parsed as ``word v1 v2 ...``: the first
	token is the word and the remaining tokens are converted to floats. Words
	receive consecutive integer ids in order of first appearance, followed by
	the special tokens ``"UNK"`` and ``"***PADDING***"``. The word-to-id
	mapping is written as JSON to ``vocab_path`` and the embedding matrix is
	written with ``np.save`` to ``embeds_path``.

	Rows of the matrix are initialised uniformly at random in
	``[-sqrt(0.06), sqrt(0.06))`` and then overwritten with the vector read
	from the file for every word that has one (an exact match is preferred,
	otherwise the lower-cased form of the word is tried).

	Args:
		path: Path of the input vector file.
		embed_dim: Number of columns of the embedding matrix. Vectors in the
			file are expected to have this length; NumPy raises ``ValueError``
			on assignment if a vector cannot be broadcast into a row.
		vocab_path: Output path of the JSON word-to-id mapping.
		embeds_path: Output path of the saved matrix (``np.save`` appends
			``.npy`` if the name does not already end with it).

	Raises:
		ValueError: If a value on a line cannot be parsed as a float, or a
			vector does not fit a matrix row.
		OSError: If the input or output files cannot be opened.
	"""
	all_freq_embed = {}
	word_to_id = {}
	# Close the input deterministically instead of leaking the handle.
	with open(path) as f:
		for line in f:
			s = line.split()
			if not s:
				# Blank lines (e.g. a trailing newline) carry no word; skip
				# them instead of failing on s[0].
				continue
			word = s[0]
			# Ids stay dense: a repeated word keeps its first id, so every id
			# is guaranteed to be a valid row of the matrix built below.
			word_to_id.setdefault(word, len(word_to_id))
			# For repeated words the last vector in the file wins.
			all_freq_embed[word] = np.array([float(i) for i in s[1:]])

	# Special tokens go after the file's words; if the file already contains
	# one of them, its existing id is kept so no id gap is created.
	for special in ("UNK", "***PADDING***"):
		word_to_id.setdefault(special, len(word_to_id))

	save_json_dict(word_to_id, vocab_path)

	print(len(word_to_id))
	bound = np.sqrt(0.06)
	freq_embeds = np.random.uniform(-bound, bound, (len(word_to_id), embed_dim))
	for w, idx in word_to_id.items():
		vec = all_freq_embed.get(w)
		if vec is None:
			# Fall back to the lower-cased spelling (lets e.g. "UNK" pick up
			# a vector stored under "unk").
			vec = all_freq_embed.get(w.lower())
		if vec is not None:
			freq_embeds[idx] = vec

	np.save(embeds_path, freq_embeds)

if __name__ == '__main__':
	# Build the vocabulary and embedding matrix from the frequency-vector file.
	path = "Freq_Vector.txt"
	read_file(path)
	# Sanity check: reload the mapping read_file just wrote and report its
	# size. The context manager closes the handle right after parsing.
	with open("word_to_id.json", "r") as f:
		word_to_id = json.load(f)
	print(len(word_to_id))