ddiddu committed on
Commit 2276e74 · 1 Parent(s): e3a8e5e

update app.py

Files changed (1)
  1. app.py +223 -4
app.py CHANGED
@@ -1,7 +1,226 @@
  import gradio as gr
 
- def greet(name):
-     return "Hello " + name + "!!"
 
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
- iface.launch()
+ # -*- coding: utf-8 -*-
+ """preprocess.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1cs4nwWCMLuQOS1f9V6xbAG1kKtasrPs2
+
+ # Preprocessing
+ """
+
+ # from google.colab import drive
+ # drive.mount('/content/drive')
+
+ import numpy as np
+ import pandas as pd
+ import json
+ from itertools import islice  # for slicing and dicing JSON records
+ import os
+
+ # def get_data(json_filename = 'arxiv-metadata-oai-snapshot.json', data_root = '/content/drive/MyDrive/카이스트/23봄/CS372/project'):
+ #     with open(data_root + '/' + 'dataset' + '/' + json_filename, "rb") as f:
+ #         for line in f:
+ #             yield line
+ def get_data(json_filename='arxiv-metadata-oai-snapshot.json'):
+     script_directory = os.path.dirname(os.path.abspath(__file__))
+     json_path = os.path.join(script_directory, json_filename)
+
+     with open(json_path, "rb") as f:
+         for line in f:
+             yield line
+
+ data_gen = get_data()
+
+ def get_records(data_gen, chunksize=500):
+     return [json.loads(record) for record in islice(data_gen, chunksize)]
+
+ records_per_chunk = 250000
+ data_records = get_records(data_gen, records_per_chunk)
+
+ def split_records(data_records, num_profiles=100, random_state=42):
+     np.random.seed(random_state)
+     np.random.shuffle(data_records)
+     train_records, test_records = data_records[:num_profiles], data_records[num_profiles:]
+     return train_records, test_records
+
+ # Splitting the fetched records into train and test records
+ train_records, test_records = split_records(data_records, num_profiles=500)
+
+ # Utility method to generate a dataframe from a list of dictionaries
+ def get_dataframe(list_of_dicts, columns=None):
+     data = pd.DataFrame(list_of_dicts)
+     if columns:
+         data.columns = columns
+     return data
+
+ # Generating dataframes for the train and test records
+ train_df = get_dataframe(train_records)
+ test_df = get_dataframe(test_records)
+
+ # Utility method to keep only the features we are interested in
+ def filter_features(data, features):
+     return data[features]
+
+ # Filtering the train and test dataframes down to the selected features
+ features = ['title', 'categories', 'abstract', 'update_date']
+ train_df = filter_features(train_df, features)
+ test_df = filter_features(test_df, features)
+
+ # Define the corpus to pull from
+ train_corpus = train_df['abstract'].head(10000)
+ test_corpus = test_df['abstract'].head(10000)
+
+ train_df.head()
+
+ test_df.head()
+
+ # train_df.to_csv('/content/drive/MyDrive/카이스트/23봄/CS372/project/dataset/train_df.csv', index = False)
+ # test_df.to_csv('/content/drive/MyDrive/카이스트/23봄/CS372/project/dataset/test_df.csv', index = False)
+
+ """## Removing unnecessary words"""
+
+ import nltk
+ nltk.download('book')
+ from nltk.book import *
+ from nltk.tokenize import sent_tokenize, word_tokenize
+
+ def tokenize_POS(paragraph):
+     words = word_tokenize(paragraph)
+     tagged_words = nltk.pos_tag(words)
+
+     # Remove unimportant word types (conjunctions, determiners, prepositions, pronouns, etc.)
+     excluded_tags = ['CC', 'DT', 'IN', 'TO', 'PRP', 'PRP$', 'MD', 'WP', 'WP$', 'WRB']
+     filtered_words = [word for word, pos in tagged_words if pos not in excluded_tags]
+
+     return ' '.join(filtered_words)
+
+ # Define the corpus to pull from
+ train_corpus = train_df['abstract'].head(10000)
+ test_corpus = test_df['abstract'].head(10000)
+
+ train_corpus[0]
+
+ train_corpus = pd.Series([tokenize_POS(abstract) for abstract in train_corpus])
+
+ train_corpus[0]
+
+ train_df_doc = train_df
+
+ train_df_doc['abstract'] = train_df_doc['abstract'].apply(tokenize_POS)
+
+ """# TF-IDF"""
+
+ import time  # for getting the runtime of cells
+ from sklearn.feature_extraction.text import TfidfVectorizer  # for building word representations
+ from sklearn.metrics.pairwise import cosine_similarity  # for getting similarity metrics
+
+ def get_recommendations_TFIDF(abstract):
+     abstract = tokenize_POS(abstract)
+     corpus = pd.concat([train_corpus, pd.Series(abstract)], ignore_index=True)
+
+     # Initialize an instance of the tf-idf Vectorizer
+     tfidf_vectorizer = TfidfVectorizer()
+     # Generate the tf-idf vectors for the corpus
+     tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
+     # Compute the cosine similarity matrix
+     cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
+
+     # Get the pairwise similarity scores for the query abstract (the last row)
+     sim_scores = list(enumerate(cosine_sim[-1]))
+     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
+     paper_indices = sim_scores[2][0]
+
+     title = train_df['title'].iloc[paper_indices]
+     categories = train_df['categories'].iloc[paper_indices]
+     abstract = train_df['abstract'].iloc[paper_indices]
+     similarity = "{:.2f}%".format(sim_scores[2][1] * 100)  # Format similarity as a string with two decimal places and a percentage sign
+     return title, categories, abstract, similarity
+
+ get_recommendations_TFIDF('''
+ In this paper we consider permutations of sequences of partitions, obtaining\na result which parallels von Neumann's theorem on permutations of dense\nsequences and uniformly distributed sequences of points.\n
+ ''')
+
+ """# Doc2Vec"""
+
+ import time
+ from gensim.models.doc2vec import Doc2Vec, TaggedDocument
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ def train_doc2vec_model(corpus, vector_size=100, window=5, min_count=1, epochs=100):
+     # create tagged document objects
+     tagged_data = [TaggedDocument(words=doc.split(), tags=[str(i)]) for i, doc in enumerate(corpus)]  # words=word_tokenize(doc.lower())
+
+     # initialize the doc2vec model
+     model = Doc2Vec(vector_size=vector_size, window=window, min_count=min_count, epochs=epochs)
+
+     # build the vocabulary
+     model.build_vocab(tagged_data)
+
+     # train the doc2vec model
+     model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
+     return model
+
+ doc2vec_model = train_doc2vec_model(train_corpus)
+
+ def get_recommendations_Doc2Vec(abstract):
+     # remove unnecessary words
+     abstract = tokenize_POS(abstract)
+     # use the pre-trained doc2vec model
+     model = doc2vec_model
+
+     # infer the vector for the given abstract
+     abstract_vector = model.infer_vector(abstract.split())
+
+     # get the two most similar abstracts
+     most_similar = model.dv.most_similar([abstract_vector], topn=2)
+
+     # Check if the first result is the input abstract itself
+     if train_df_doc['abstract'].iloc[int(most_similar[0][0])].split() == abstract.split():
+         # print('True')
+         paper_indices = int(most_similar[1][0])
+         similarity = "{:.2f}%".format(most_similar[1][1] * 100)  # Format similarity as a string with two decimal places and a percentage sign
+     else:
+         # print('False')
+         paper_indices = int(most_similar[0][0])
+         similarity = "{:.2f}%".format(most_similar[0][1] * 100)  # Format similarity as a string with two decimal places and a percentage sign
+
+     # Retrieve the details of the most similar abstract
+     title = train_df['title'].iloc[paper_indices]
+     categories = train_df['categories'].iloc[paper_indices]
+     abstract = train_df['abstract'].iloc[paper_indices]
+     return title, categories, abstract, similarity
+
+ """# Deploy"""
+
+ # !pip install gradio
+
  import gradio as gr
 
+ def greet(paper):
+     title_tfidf, categories_tfidf, abstract_tfidf, similarity_tfidf = get_recommendations_TFIDF(paper)
+     title_doc2vec, categories_doc2vec, abstract_doc2vec, similarity_doc2vec = get_recommendations_Doc2Vec(paper)
+     return title_tfidf, categories_tfidf, abstract_tfidf, title_doc2vec, categories_doc2vec, abstract_doc2vec
+
+ title = '''SimSearch:\n
+ A Similarity Search Tool for Research Paper Abstracts using NLP'''
+
+ demo = gr.Interface(
+     title = title,
+     fn = greet,
+     inputs=gr.Textbox(placeholder='Abstract Here'),
+     outputs=[gr.outputs.Textbox(label='TFIDF Title'),
+              gr.outputs.Textbox(label='TFIDF Categories'),
+              gr.outputs.Textbox(label='TFIDF Abstract'),
+              # gr.outputs.Textbox(label='Similarity')
+              gr.outputs.Textbox(label='Doc2Vec Title'),
+              gr.outputs.Textbox(label='Doc2Vec Categories'),
+              gr.outputs.Textbox(label='Doc2Vec Abstract'),
+              ],
+     examples = [
+         # ['''In this paper we consider permutations of sequences of partitions, obtaining\na result which parallels von Neumann's theorem on permutations of dense\nsequences and uniformly distributed sequences of points.\n'''],
+         ['''A fully differential calculation in perturbative quantum chromodynamics is presented for the production of massive photon pairs at hadron colliders. All next-to-leading order perturbative contributions from quark-antiquark, gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as all-orders resummation of initial-state gluon radiation valid at next-to-next-to-leading logarithmic accuracy. The region of phase space is specified in which the calculation is most reliable. Good agreement is demonstrated with data from the Fermilab Tevatron, and predictions are made for more detailed tests with CDF and DO data. Predictions are shown for distributions of diphoton pairs produced at the energy of the Large Hadron Collider (LHC). Distributions of the diphoton pairs from the decay of a Higgs boson are contrasted with those produced from QCD processes at the LHC, showing that enhanced sensitivity to the signal can be obtained with judicious selection of events.'''],
+         ['''Most physical experiments are usually described repeated measurements random variables . experimental data registered on-line computers form time series outcomes . frequencies different outcomes are compared probabilities provided algorithms quantum theory ( QT ) . spite statistical predictions QT claim was made theory provided most complete description data underlying physical phenomena . claim be easily rejected fine structures , averaged out standard statistical descriptive analysis , were found time series experimental data . search structures one has use more subtle statistical tools which were developed study time series produced various stochastic processes . talk review tools . example show standard descriptive statistical analysis data is unable reveal fine structure simulated sample AR ( 2 ) stochastic process . emphasize once again violation Bell inequalities gives information completeness non locality QT . appropriate way test completeness quantum theory is search fine structures time series experimental data means purity tests studying autocorrelation partial autocorrelation functions .''']]
+ )
+
+ demo.launch(share=True, debug=True)