StackOverflowNER / utils_fine_tune /create_freq_vector_info.py

philippeitis

Add complete utils_fine_tune

80d096b almost 3 years ago

1.07 kB

	import json

	import numpy as np


	def save_json_dict(dict_info, file_name):
	    """Serialize ``dict_info`` as JSON and write it to ``file_name``.

	    Args:
	        dict_info: JSON-serializable object (here, the word -> id mapping).
	        file_name: Path of the file to create; existing content is
	            overwritten because the file is opened in ``"w"`` mode.

	    Raises:
	        TypeError: If ``dict_info`` contains values ``json`` cannot encode.
	        OSError: If ``file_name`` cannot be opened for writing.
	    """
	    # Serialize before opening the file so a serialization failure does not
	    # leave a truncated file behind.
	    serialized = json.dumps(dict_info)
	    # The context manager guarantees the handle is flushed and closed even
	    # if the write raises.
	    with open(file_name, "w") as out_file:
	        out_file.write(serialized)


	def read_file(
	        path,
	        embed_dim=102,
	        word_to_id_path="word_to_id.json",
	        embeds_path="freq_embeds.npy",
	):
	    """Build a vocabulary and an embedding matrix from a word-vector text file.

	    Each non-blank line of ``path`` is split on whitespace: the first field
	    is the word and the remaining fields are parsed as floats forming its
	    vector. Words receive consecutive ids in first-seen order, after which
	    the special tokens ``"UNK"`` and ``"*PADDING*"`` are appended (unless the
	    file already defined them).

	    Side effects:
	        * writes the word -> id mapping as JSON to ``word_to_id_path``;
	        * prints the vocabulary size;
	        * saves the embedding matrix with ``np.save`` to ``embeds_path``.

	    Args:
	        path: Path of the text file holding one ``word v1 v2 ...`` per line.
	        embed_dim: Number of columns of the embedding matrix. Defaults to
	            102, the value previously hard-coded here.
	        word_to_id_path: Output path of the JSON vocabulary.
	        embeds_path: Output path of the ``.npy`` embedding matrix.

	    Raises:
	        ValueError: If a vector component cannot be parsed as a float, or if
	            a vector cannot be assigned to a row of width ``embed_dim``.
	    """
	    all_freq_embed = {}
	    word_to_id = {}
	    # Context manager so the input handle is closed deterministically.
	    with open(path) as vector_file:
	        for line in vector_file:
	            fields = line.split()
	            if not fields:
	                # Blank lines (e.g. a trailing newline) carry no word.
	                continue
	            word = fields[0]
	            # Only allocate an id the first time a word is seen; otherwise a
	            # repeated word would leave a gap and push later ids past the
	            # number of rows of the matrix built below.
	            word_to_id.setdefault(word, len(word_to_id))
	            # For a repeated word the last vector read wins.
	            all_freq_embed[word] = np.array(
	                [float(value) for value in fields[1:]]
	            )

	    # Special tokens go after the file's words; setdefault keeps ids dense
	    # even when the file itself already contains one of them.
	    for special_token in ("UNK", "*PADDING*"):
	        word_to_id.setdefault(special_token, len(word_to_id))

	    save_json_dict(word_to_id, word_to_id_path)

	    print(len(word_to_id))
	    # Rows without a vector in the file keep this random initialization.
	    bound = np.sqrt(0.06)
	    freq_embeds = np.random.uniform(
	        -bound, bound, (len(word_to_id), embed_dim)
	    )
	    for word, word_id in word_to_id.items():
	        if word in all_freq_embed:
	            freq_embeds[word_id] = all_freq_embed[word]
	        elif word.lower() in all_freq_embed:
	            # Fall back to the lowercase spelling (e.g. "UNK" -> "unk").
	            freq_embeds[word_id] = all_freq_embed[word.lower()]

	    np.save(embeds_path, freq_embeds)

	if __name__ == '__main__':
	    # Build word_to_id.json and freq_embeds.npy from the frequency vectors,
	    # then reload the vocabulary as a sanity check and print its size.
	    path = "Freq_Vector.txt"
	    read_file(path)
	    # Context manager so the JSON file handle is closed after loading.
	    with open("word_to_id.json", "r") as vocab_file:
	        word_to_id = json.load(vocab_file)
	    print(len(word_to_id))