import numpy as np

# Path to the raw pre-trained SGNS word-embedding text file (300-dim vectors).
path = "/root/autodl-tmp/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5"


def read_vectors(path, topn=0):
    """Read word vectors from a text embedding file.

    The file's first line is a header ``"<vocab_size> <dim>"``; every
    subsequent line is ``"<word> <v1> <v2> ... <v_dim>"`` separated by
    single spaces.

    Args:
        path: Path to the embedding text file (UTF-8; undecodable bytes
            are ignored).
        topn: If non-zero, read at most this many word vectors
            (e.g. ``topn=10000`` reads the top 10000 lines).

    Returns:
        A tuple ``(vectors, words)``: an ``(n, dim)`` float ndarray and an
        ``(n,)`` ndarray of the corresponding words, in file order.
    """
    lines_num = 0
    vectors = []
    iw = []  # "index -> word" list, parallel to `vectors`
    with open(path, encoding='utf-8', errors='ignore') as f:
        first_line = True
        for line in f:
            if first_line:
                # Header line: "<vocab_size> <dim>".  `dim` is parsed for
                # reference but not otherwise used below.
                first_line = False
                dim = int(line.rstrip().split()[1])
                continue
            lines_num += 1
            tokens = line.rstrip().split(' ')
            vectors.append([float(x) for x in tokens[1:]])
            iw.append(tokens[0])
            if topn != 0 and lines_num >= topn:
                break
    return np.array(vectors), np.array(iw)


def main():
    """Load the cached embedding matrix, drop duplicate words, re-save.

    Duplicate words keep their FIRST occurrence; all later occurrences
    (and their matrix rows) are deleted, so `word_list` and
    `embedding_matrix` stay row-aligned.
    """
    vectors_path = "/root/autodl-tmp/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5"
    # One-time preprocessing: build the .npy caches from the raw text file.
    # embedding_matrix, word_list = read_vectors(vectors_path)
    # np.save("ZHglove.wordlist.npy", word_list)
    # np.save("ZHglove.300d.mat.npy", embedding_matrix)
    embedding_matrix = np.load("ZHglove.300d.mat.npy")
    word_list = np.load("ZHglove.wordlist.npy")
    print(embedding_matrix.shape)
    print(word_list.shape)

    word2id = {}
    if embedding_matrix is not None:
        duplicate_ids = []  # row indices of 2nd+ occurrences, to be deleted
        for i, word in enumerate(word_list):
            if word in word2id:
                duplicate_ids.append(i)
            else:
                # Assign an id only on first sight.  (The original code
                # re-assigned len(word2id) for duplicates too, which
                # clobbered the first occurrence's id with a wrong value.)
                word2id[word] = len(word2id)
        embedding_matrix = np.delete(embedding_matrix, duplicate_ids, 0)
        print(embedding_matrix.shape)
        word_list = np.delete(word_list, duplicate_ids, 0)
        np.save("ZHglove.wordlist.npy", word_list)
        np.save("ZHglove.300d.mat.npy", embedding_matrix)
        print(word_list.shape)


if __name__ == "__main__":
    main()