xin committed on
Commit
74555b0
1 Parent(s): df3f782
Word2vec/data_problem_corpus/problem_corpus_sample_cleaned.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:194deaf2b057e3eb519ffe122c6b7f79544d6b2a1de339555e410b029174b0b6
3
+ size 234347529
Word2vec/run.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # @File : test_sentence_similarity.py
4
+ # @Author: nixin
5
+ # @Date : 2019-03-06
6
+
7
+ import numpy as np
8
+ from scipy import spatial
9
+ from gensim.models import word2vec
10
+ import pandas as pd
11
+
12
+
13
+
14
# Load the previously trained word2vec model from disk.
# NOTE(review): `model.wv.index2word` is the gensim 3.x attribute; gensim 4+
# renamed it to `index_to_key` — confirm the pinned gensim version.
model = word2vec.Word2Vec.load('/Users/nixin/PycharmProjects/PatentSolver_demonstrator/Word2vec/trained_word2vec.model')
# Vocabulary as a set for O(1) membership tests in avg_feature_vector.
index2word_set = set(model.wv.index2word)
17
+
18
def avg_feature_vector(sentence, model, num_features, index2word_set):
    """Return the mean embedding vector of the in-vocabulary words of *sentence*.

    Words are obtained by whitespace split; tokens absent from
    *index2word_set* are ignored. If no token is in the vocabulary, the
    zero vector of length *num_features* is returned unchanged.
    """
    vec = np.zeros(num_features, dtype='float32')
    matched = 0
    for token in sentence.split():
        # Skip out-of-vocabulary tokens rather than raising a KeyError.
        if token not in index2word_set:
            continue
        matched += 1
        vec = np.add(vec, model[token])
    # Average only when at least one word contributed; otherwise keep zeros.
    return np.divide(vec, matched) if matched > 0 else vec
29
+
30
# ---- Script: rank corpus problems by similarity to a target problem ----

# Read the problem corpus; keep a small sample for this demonstration run.
problem_corpus = pd.read_csv('/Users/nixin/PycharmProjects/PatentSolver_demonstrator/Word2vec/data_problem_corpus/problem_corpus_sample_cleaned.csv')
problem_corpus = problem_corpus.head(100)

target_problem = 'strategic cleavage of such a target rna will destroy its ability to direct synthesis of an encoded protein'
target_domain = 'A'

# Remove problems from the same domain as the target.
# Fix: use the target_domain variable instead of the hard-coded 'A' so
# changing the target keeps the filter consistent.
problem_corpus = problem_corpus[problem_corpus.Domain != target_domain]

# Restrict to the chosen publication-year range (inclusive bounds).
# .copy() avoids pandas' SettingWithCopyWarning on the column assignments below.
problem_corpus = problem_corpus[problem_corpus['publication_year'].between(2015, 2017)].copy()

# Fix: the target sentence's vector is loop-invariant — compute it once
# instead of once per corpus row.
target_afv = avg_feature_vector(target_problem, model=model, num_features=100, index2word_set=index2word_set)

similarity_values = []
for each_problem in problem_corpus['First part Contradiction']:
    problem_afv = avg_feature_vector(each_problem, model=model, num_features=100, index2word_set=index2word_set)
    # Cosine similarity rounded to 2 decimals. Fix: keep the value numeric —
    # the original stored format(..., '.2f') strings and thresholded with a
    # string comparison, which mis-orders negatives and values >= 10.
    similarity_values.append(round(1 - spatial.distance.cosine(target_afv, problem_afv), 2))

problem_corpus['similarity_value'] = similarity_values
problem_corpus['target_problem'] = target_problem

print(problem_corpus)

# Keep only problems above the similarity threshold (numeric comparison).
problem_corpus_final = problem_corpus[problem_corpus.similarity_value >= 0.8]

problem_corpus_final.to_csv('/Users/nixin/PycharmProjects/PatentSolver_demonstrator/Word2vec/simialrity_result/test.csv', index=False)
print(problem_corpus_final)
62
+
63
+
64
+
65
+
66
+
67
+
Word2vec/simialrity_result/test.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f100a1f9f61956bb4e97d177bc48b581c1ab4a925215c43d1cf9f8e590070774
3
+ size 2601
Word2vec/trained_word2vec.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3546e4a57f7c76e9272566c43311dcebe354a3a968ea70b3f3a3b6d55c8f5977
3
+ size 147031792