Spaces:
Build error
Build error
model
Browse files
Word2vec/data_problem_corpus/problem_corpus_sample_cleaned.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:194deaf2b057e3eb519ffe122c6b7f79544d6b2a1de339555e410b029174b0b6
|
3 |
+
size 234347529
|
Word2vec/run.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
# @File : test_sentence_similarity.py
|
4 |
+
# @Author: nixin
|
5 |
+
# @Date : 2019-03-06
|
6 |
+
|
7 |
+
import numpy as np
|
8 |
+
from scipy import spatial
|
9 |
+
from gensim.models import word2vec
|
10 |
+
import pandas as pd
|
11 |
+
|
12 |
+
|
13 |
+
|
14 |
+
# load the trained word vector model from an absolute, machine-specific path
# NOTE(review): gensim < 4.0 API — `model.wv.index2word` was renamed to
# `model.wv.index_to_key` in gensim 4.x; confirm the installed gensim version.
model = word2vec.Word2Vec.load('/Users/nixin/PycharmProjects/PatentSolver_demonstrator/Word2vec/trained_word2vec.model')
# vocabulary membership set, used to skip out-of-vocabulary words cheaply
index2word_set = set(model.wv.index2word)
|
17 |
+
|
18 |
+
def avg_feature_vector(sentence, model, num_features, index2word_set):
    """Return the mean embedding vector of *sentence*.

    Tokens come from a plain whitespace split; tokens not present in
    ``index2word_set`` are ignored.  If no token is in the vocabulary,
    the zero vector of length ``num_features`` (float32) is returned.
    """
    total = np.zeros((num_features,), dtype='float32')
    # keep only the tokens the model actually knows
    in_vocab = [token for token in sentence.split() if token in index2word_set]
    for token in in_vocab:
        total = np.add(total, model[token])
    if len(in_vocab) > 0:
        total = np.divide(total, len(in_vocab))
    return total
|
29 |
+
|
30 |
+
# read problem file (first 100 rows only, to keep the demo fast)
problem_corpus = pd.read_csv('/Users/nixin/PycharmProjects/PatentSolver_demonstrator/Word2vec/data_problem_corpus/problem_corpus_sample_cleaned.csv')
problem_corpus = problem_corpus.head(100)

target_problem = 'strategic cleavage of such a target rna will destroy its ability to direct synthesis of an encoded protein'
target_domain = 'A'

# remove the same domain's problems — use the variable instead of repeating
# the literal 'A', so changing target_domain keeps the filter consistent
problem_corpus = problem_corpus[problem_corpus.Domain != target_domain]

# choose the time range
problem_corpus = problem_corpus[problem_corpus['publication_year'].between(2015, 2017)]

# the target vector is loop-invariant — compute it once, not per row
s1_afv = avg_feature_vector(target_problem, model=model, num_features=100, index2word_set=index2word_set)

# cosine similarity of the target problem against every remaining problem
value = []
for each_problem in problem_corpus['First part Contradiction']:
    s2_afv = avg_feature_vector(each_problem, model=model, num_features=100, index2word_set=index2word_set)
    # keep the similarity as a float (rounded to 2 decimals) so the threshold
    # below is numeric; the original compared '.2f'-formatted *strings*, which
    # misorders negative cosine similarities (e.g. '-0.12' >= '0.8' is False
    # only by accident of the leading '-' character)
    sim_value = round(1 - spatial.distance.cosine(s1_afv, s2_afv), 2)
    value.append(sim_value)

# assign the two result columns explicitly rather than via tuple unpacking
problem_corpus['similarity_value'] = value
problem_corpus['target_problem'] = target_problem

print(problem_corpus)

# set similarity threshold (numeric comparison on the float column)
problem_corpus_final = problem_corpus[problem_corpus.similarity_value >= 0.8]

# NOTE(review): 'simialrity_result' is misspelled but matches the directory
# name that actually exists in the repository; renaming would break the path.
problem_corpus_final.to_csv('/Users/nixin/PycharmProjects/PatentSolver_demonstrator/Word2vec/simialrity_result/test.csv', index=False)
print(problem_corpus_final)
|
62 |
+
|
63 |
+
|
64 |
+
|
65 |
+
|
66 |
+
|
67 |
+
|
Word2vec/simialrity_result/test.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f100a1f9f61956bb4e97d177bc48b581c1ab4a925215c43d1cf9f8e590070774
|
3 |
+
size 2601
|
Word2vec/trained_word2vec.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3546e4a57f7c76e9272566c43311dcebe354a3a968ea70b3f3a3b6d55c8f5977
|
3 |
+
size 147031792
|