ddiddu committed on
Commit 2276e74 · 1 Parent(s): e3a8e5e

update app.py

Files changed (1)
  1. app.py +223 -4
app.py CHANGED
@@ -1,7 +1,226 @@
  import gradio as gr
 
- def greet(name):
-     return "Hello " + name + "!!"
 
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
- iface.launch()
+ # -*- coding: utf-8 -*-
+ """preprocess.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1cs4nwWCMLuQOS1f9V6xbAG1kKtasrPs2
+
+ # Preprocessing
+ """
+
+ # from google.colab import drive
+ # drive.mount('/content/drive')
+
+ import numpy as np
+ import pandas as pd
+ import json
+ from itertools import islice  # for slicing and dicing JSON records
+ import os
+
+ # def get_data(json_filename = 'arxiv-metadata-oai-snapshot.json', data_root = '/content/drive/MyDrive/카이스트/23봄/CS372/project'):
+ #     with open(data_root + '/' + 'dataset' + '/' + json_filename, "rb") as f:
+ #         for line in f:
+ #             yield line
+ def get_data(json_filename='arxiv-metadata-oai-snapshot.json'):
+     script_directory = os.path.dirname(os.path.abspath(__file__))
+     json_path = os.path.join(script_directory, json_filename)
+
+     with open(json_path, "rb") as f:
+         for line in f:
+             yield line
+
+ data_gen = get_data()
+
+ def get_records(data_gen, chunksize=500):
+     return [json.loads(record) for record in islice(data_gen, chunksize)]
+
+ records_per_chunk = 250000
+ data_records = get_records(data_gen, records_per_chunk)
+
+ def split_records(data_records, num_profiles=100, random_state=42):
+     np.random.seed(random_state)
+     np.random.shuffle(data_records)
+     train_records, test_records = data_records[:num_profiles], data_records[num_profiles:]
+     return train_records, test_records
+
+ # Splitting the fetched records into train and test records
+ train_records, test_records = split_records(data_records, num_profiles=500)
+
+ # Utility method to generate a dataframe from a list of dictionaries
+ def get_dataframe(list_of_dicts, columns=None):
+     data = pd.DataFrame(list_of_dicts)
+     if columns:
+         data.columns = columns
+     return data
+
+ # Generating dataframes for the train and test records
+ train_df = get_dataframe(train_records)
+ test_df = get_dataframe(test_records)
+
+ # Utility method to keep only the features we are interested in
+ def filter_features(data, features):
+     return data[features]
+
+ # Filtering the train and test dataframes down to the selected features
+ features = ['title', 'categories', 'abstract', 'update_date']
+ train_df = filter_features(train_df, features)
+ test_df = filter_features(test_df, features)
+
+ # Define the corpus to pull from
+ train_corpus = train_df['abstract'].head(10000)
+ test_corpus = test_df['abstract'].head(10000)
+
+ train_df.head()
+
+ test_df.head()
+
+ # train_df.to_csv('/content/drive/MyDrive/카이스트/23봄/CS372/project/dataset/train_df.csv', index = False)
+ # test_df.to_csv('/content/drive/MyDrive/카이스트/23봄/CS372/project/dataset/test_df.csv', index = False)
+
+ """## Removing unnecessary words"""
+
+ import nltk
+ nltk.download('book')
+ from nltk.book import *
+ from nltk.tokenize import sent_tokenize, word_tokenize
+
+ def tokenize_POS(paragraph):
+     words = word_tokenize(paragraph)
+     tagged_words = nltk.pos_tag(words)
+
+     # Remove unimportant word types (conjunctions, determiners, prepositions, pronouns, etc.)
+     excluded_tags = ['CC', 'DT', 'IN', 'TO', 'PRP', 'PRP$', 'MD', 'WP', 'WP$', 'WRB']
+     filtered_words = [word for word, pos in tagged_words if pos not in excluded_tags]
+
+     return ' '.join(filtered_words)
+
+ # Define the corpus to pull from
+ train_corpus = train_df['abstract'].head(10000)
+ test_corpus = test_df['abstract'].head(10000)
+
+ train_corpus[0]
+
+ train_corpus = pd.Series([tokenize_POS(abstract) for abstract in train_corpus])
+
+ train_corpus[0]
+
+ train_df_doc = train_df
+
+ train_df_doc['abstract'] = train_df_doc['abstract'].apply(tokenize_POS)
+
+ """# TF-IDF"""
+
+ import time  # for getting the runtime of cells
+ from sklearn.feature_extraction.text import TfidfVectorizer  # for building word representations
+ from sklearn.metrics.pairwise import cosine_similarity  # for getting similarity metrics
+
+ def get_recommendations_TFIDF(abstract):
+     abstract = tokenize_POS(abstract)
+     corpus = pd.concat([train_corpus, pd.Series(abstract)], ignore_index=True)
+
+     # Initialize an instance of the tf-idf Vectorizer
+     tfidf_vectorizer = TfidfVectorizer()
+     # Generate the tf-idf vectors for the corpus
+     tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
+     # Compute the cosine similarity matrix
+     cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
+
+     # Get the pairwise similarity scores for the query abstract (the last row)
+     sim_scores = list(enumerate(cosine_sim[-1]))
+     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
+     paper_indices = sim_scores[2][0]
+
+     title = train_df['title'].iloc[paper_indices]
+     categories = train_df['categories'].iloc[paper_indices]
+     abstract = train_df['abstract'].iloc[paper_indices]
+     similarity = "{:.2f}%".format(sim_scores[2][1] * 100)  # Format similarity as a string with two decimal places and a percentage sign
+     return title, categories, abstract, similarity
+
+ get_recommendations_TFIDF('''
+ In this paper we consider permutations of sequences of partitions, obtaining\na result which parallels von Neumann's theorem on permutations of dense\nsequences and uniformly distributed sequences of points.\n
+ ''')
+
+ """# Doc2Vec"""
+
+ import time
+ from gensim.models.doc2vec import Doc2Vec, TaggedDocument
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ def train_doc2vec_model(corpus, vector_size=100, window=5, min_count=1, epochs=100):
+     # create tagged document objects
+     tagged_data = [TaggedDocument(words=doc.split(), tags=[str(i)]) for i, doc in enumerate(corpus)]  # words=word_tokenize(doc.lower())
+
+     # initialize the doc2vec model
+     model = Doc2Vec(vector_size=vector_size, window=window, min_count=min_count, epochs=epochs)
+
+     # build the vocabulary
+     model.build_vocab(tagged_data)
+
+     # train the doc2vec model
+     model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
+     return model
+
+ doc2vec_model = train_doc2vec_model(train_corpus)
+
+ def get_recommendations_Doc2Vec(abstract):
+     # remove unnecessary words
+     abstract = tokenize_POS(abstract)
+     # use the pre-trained doc2vec model
+     model = doc2vec_model
+
+     # infer the vector for the given abstract
+     abstract_vector = model.infer_vector(abstract.split())
+
+     # get the two most similar abstracts
+     most_similar = model.dv.most_similar([abstract_vector], topn=2)
+
+     # Check if the first result is the input abstract itself
+     if train_df_doc['abstract'].iloc[int(most_similar[0][0])].split() == abstract.split():
+         # print('True')
+         paper_indices = int(most_similar[1][0])
+         similarity = "{:.2f}%".format(most_similar[1][1] * 100)  # Format similarity as a string with two decimal places and a percentage sign
+     else:
+         # print('False')
+         paper_indices = int(most_similar[0][0])
+         similarity = "{:.2f}%".format(most_similar[0][1] * 100)  # Format similarity as a string with two decimal places and a percentage sign
+
+     # Retrieve the details of the most similar abstract
+     title = train_df['title'].iloc[paper_indices]
+     categories = train_df['categories'].iloc[paper_indices]
+     abstract = train_df['abstract'].iloc[paper_indices]
+     return title, categories, abstract, similarity
+
+ """# Deploy"""
+
+ # !pip install gradio
+
  import gradio as gr
 
+ def greet(paper):
+     title_tfidf, categories_tfidf, abstract_tfidf, similarity_tfidf = get_recommendations_TFIDF(paper)
+     title_doc2vec, categories_doc2vec, abstract_doc2vec, similarity_doc2vec = get_recommendations_Doc2Vec(paper)
+     return title_tfidf, categories_tfidf, abstract_tfidf, title_doc2vec, categories_doc2vec, abstract_doc2vec
+
+ title = '''SimSearch:\n
+ A Similarity Search Tool for Research Paper Abstracts using NLP'''
+
+ demo = gr.Interface(
+     title = title,
+     fn = greet,
+     inputs=gr.Textbox(placeholder='Abstract Here'),
+     outputs=[gr.outputs.Textbox(label='TFIDF Title'),
+              gr.outputs.Textbox(label='TFIDF Categories'),
+              gr.outputs.Textbox(label='TFIDF Abstract'),
+              # gr.outputs.Textbox(label='Similarity')
+              gr.outputs.Textbox(label='Doc2Vec Title'),
+              gr.outputs.Textbox(label='Doc2Vec Categories'),
+              gr.outputs.Textbox(label='Doc2Vec Abstract'),
+              ],
+     examples = [
+         # ['''In this paper we consider permutations of sequences of partitions, obtaining\na result which parallels von Neumann's theorem on permutations of dense\nsequences and uniformly distributed sequences of points.\n'''],
+         ['''A fully differential calculation in perturbative quantum chromodynamics is presented for the production of massive photon pairs at hadron colliders. All next-to-leading order perturbative contributions from quark-antiquark, gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as all-orders resummation of initial-state gluon radiation valid at next-to-next-to-leading logarithmic accuracy. The region of phase space is specified in which the calculation is most reliable. Good agreement is demonstrated with data from the Fermilab Tevatron, and predictions are made for more detailed tests with CDF and DO data. Predictions are shown for distributions of diphoton pairs produced at the energy of the Large Hadron Collider (LHC). Distributions of the diphoton pairs from the decay of a Higgs boson are contrasted with those produced from QCD processes at the LHC, showing that enhanced sensitivity to the signal can be obtained with judicious selection of events.'''],
+         ['''Most physical experiments are usually described repeated measurements random variables . experimental data registered on-line computers form time series outcomes . frequencies different outcomes are compared probabilities provided algorithms quantum theory ( QT ) . spite statistical predictions QT claim was made theory provided most complete description data underlying physical phenomena . claim be easily rejected fine structures , averaged out standard statistical descriptive analysis , were found time series experimental data . search structures one has use more subtle statistical tools which were developed study time series produced various stochastic processes . talk review tools . example show standard descriptive statistical analysis data is unable reveal fine structure simulated sample AR ( 2 ) stochastic process . emphasize once again violation Bell inequalities gives information completeness non locality QT . appropriate way test completeness quantum theory is search fine structures time series experimental data means purity tests studying autocorrelation partial autocorrelation functions .''']]
+ )
+
+ demo.launch(share=True, debug=True)