jskim committed on
Commit
6eff5e7
1 Parent(s): 82d22b3

init files

Files changed (4)
  1. app.py +196 -0
  2. input_format.py +114 -0
  3. requirements.txt +83 -0
  4. score.py +149 -0
app.py ADDED
@@ -0,0 +1,196 @@
+ import gradio as gr
+ import os
+ from transformers import AutoTokenizer, AutoModel
+ from sentence_transformers import SentenceTransformer
+ import pickle
+ from nltk.tokenize import sent_tokenize
+
+ from input_format import *
+ from score import *
+
+ # load document scoring model
+ pretrained_model = 'allenai/specter'
+ tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
+ doc_model = AutoModel.from_pretrained(pretrained_model)
+
+ # load sentence model
+ sent_model = SentenceTransformer('sentence-transformers/gtr-t5-base')
+
+ def get_similar_paper(
+     abstract_text_input,
+     pdf_file_input,
+     author_id_input,
+     num_papers_show=10
+ ):
+     input_sentences = sent_tokenize(abstract_text_input)
+
+     pickle.dump(input_sentences, open('tmp_input_sents.pkl', 'wb'))
+
+     # TODO handle pdf file input
+     if pdf_file_input is not None:
+         name = None
+         papers = []
+         raise ValueError('Use submission abstract instead.')
+     else:
+         # get author papers from id
+         name, papers = get_text_from_author_id(author_id_input)
+
+     # compute document-level affinity scores for the papers
+     titles, abstracts, doc_scores = compute_overall_score(
+         doc_model,
+         tokenizer,
+         abstract_text_input,
+         papers,
+         batch=30
+     )
+
+     tmp = {
+         'titles': titles,
+         'abstracts': abstracts,
+         'doc_scores': doc_scores
+     }
+     pickle.dump(tmp, open('tmp_paperinfo.pkl', 'wb'))
+
+     # select top K papers to show
+     titles = titles[:num_papers_show]
+     abstracts = abstracts[:num_papers_show]
+     doc_scores = doc_scores[:num_papers_show]
+
+     return titles[0], abstracts[0], doc_scores[0], gr.update(choices=input_sentences, interactive=True), gr.update(visible=True)
+
+ def get_highlights(
+     abstract_text_input,
+     pdf_file_input,
+     abstract,
+     K=2
+ ):
+     # compute sentence-level and phrase-level affinity scores for each paper
+     sent_ids, sent_scores, info = get_highlight_info(
+         sent_model,
+         abstract_text_input,
+         abstract,
+         K=K
+     )
+
+     input_sentences = sent_tokenize(abstract_text_input)
+     num_sents = len(input_sentences)
+
+     word_scores = dict()
+     # different highlights for each input sentence
+     for i in range(num_sents):
+         word_scores[str(i)] = {
+             "original": abstract,
+             "interpretation": list(zip(info['all_words'], info[i]['scores']))
+         }
+
+     tmp = {
+         'source_sentences': input_sentences,
+         'highlight': word_scores
+     }
+     pickle.dump(tmp, open('highlight_info.pkl', 'wb'))
+
+     # update the visibility of radio choices
+     return gr.update(visible=True)
+
+ def update_name(author_id_input):
+     # update the author name based on the ID input
+     name, _ = get_text_from_author_id(author_id_input)
+     return gr.update(value=name)
+
+ def change_output_highlight(source_sent_choice):
+     # change the output highlight based on the sentence selected from the submission
+     if os.path.exists('highlight_info.pkl'):
+         tmp = pickle.load(open('highlight_info.pkl', 'rb'))
+         source_sents = tmp['source_sentences']
+         highlights = tmp['highlight']
+         for i, s in enumerate(source_sents):
+             if source_sent_choice == s:
+                 return highlights[str(i)]
+     else:
+         return
+
+ with gr.Blocks() as demo:
+
+     ### INPUT
+     with gr.Row() as input_row:
+         with gr.Column():
+             abstract_text_input = gr.Textbox(label='Submission Abstract')
+         with gr.Column():
+             pdf_file_input = gr.File(label='OR upload a submission PDF File')
+         with gr.Column():
+             with gr.Row():
+                 author_id_input = gr.Textbox(label='Reviewer ID (Semantic Scholar)')
+             with gr.Row():
+                 name = gr.Textbox(label='Confirm Reviewer Name', interactive=False)
+             author_id_input.change(fn=update_name, inputs=author_id_input, outputs=name)
+     with gr.Row():
+         compute_btn = gr.Button('Search Similar Papers from the Reviewer')
+
+     # with gr.Row(visible=False) as reviewer_name_info:
+     #     name = gr.Textbox(label='Reviewer Author Name')
+     # with gr.Row():
+     #     with gr.Tabs():
+     #         for tt in range(num_papers_show):
+     #             with gr.TabItem('Paper %d'%(tt+1)):
+
+     # TODO handle multiple papers
+
+     ### PAPER INFORMATION
+     with gr.Row():
+         with gr.Column(scale=3):
+             paper_title = gr.Textbox(label='Title', interactive=False)
+         with gr.Column(scale=1):
+             affinity = gr.Number(label='Affinity', interactive=False, value=0)
+     with gr.Row():
+         paper_abstract = gr.Textbox(label='Abstract', interactive=False)
+
+     with gr.Row(visible=False) as explain_button_row:
+         explain_btn = gr.Button('Show Relevant Parts from Selected Paper')
+
+     ### RELEVANT PARTS (HIGHLIGHTS)
+     with gr.Row():
+         with gr.Column(scale=2):  # text from submission
+             source_sentences = gr.Radio(
+                 choices=[],
+                 visible=False,
+                 label='Sentences from Submission Abstract',
+             )
+         with gr.Column(scale=3):  # highlighted text from paper
+             highlight = gr.components.Interpretation(paper_abstract)
+
+     compute_btn.click(
+         fn=get_similar_paper,
+         inputs=[
+             abstract_text_input,
+             pdf_file_input,
+             author_id_input
+         ],
+         outputs=[
+             paper_title,
+             paper_abstract,
+             affinity,
+             source_sentences,
+             explain_button_row
+         ]
+     )
+
+     explain_btn.click(
+         fn=get_highlights,
+         inputs=[
+             abstract_text_input,
+             pdf_file_input,
+             paper_abstract
+         ],
+         outputs=source_sentences
+     )
+
+     source_sentences.change(
+         fn=change_output_highlight,
+         inputs=source_sentences,
+         outputs=highlight
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
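
For a quick sanity check of the pipeline outside the UI, a minimal sketch (hypothetical, not part of the commit; assumes the models above have loaded, network access to Semantic Scholar, and '1737249', the sample author ID noted in input_format.py; the two trailing gr.update values are dropped):

    # hedged smoke test: call the scoring entry point directly
    query = 'We study affinity scoring between submissions and reviewer papers.'
    title, abstract, score, _, _ = get_similar_paper(query, None, '1737249')
    print(title, score)  # top-ranked paper title and its affinity score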
input_format.py ADDED
@@ -0,0 +1,114 @@
+ import numpy as np
+ from pypdf import PdfReader
+ from urllib.parse import urlparse
+ import requests
+ from semanticscholar import SemanticScholar
+
+ ### Input Formatting Module
+
+ ## Input formatting for the given paper
+ # Extracting text from a pdf or a link
+
+ def get_text_from_pdf(file_path):
+     """
+     Convert a pdf to a list of per-page texts
+     """
+     reader = PdfReader(file_path)
+     text = []
+     for p in reader.pages:
+         t = p.extract_text()
+         text.append(t)
+     return text
+
+ def get_text_from_url(url, file_path='paper.pdf'):
+     """
+     Get the text of a paper from a url
+     """
+     # TODO check for other valid urls (e.g. semantic scholar)
+
+     ## Check for different URL cases
+     url_parts = urlparse(url)
+     # arxiv
+     if 'arxiv' in url_parts.netloc:
+         if 'abs' in url_parts.path:
+             # abstract page, change the url to the pdf link
+             paper_id = url_parts.path.split('/')[-1]
+             url = 'https://www.arxiv.org/pdf/%s.pdf'%(paper_id)
+         elif 'pdf' in url_parts.path:
+             # already a pdf link, pass
+             pass
+         else:
+             raise ValueError('invalid arxiv url')
+     else:
+         raise ValueError('invalid url: only arxiv links are supported')
+
+     # download the file
+     download_pdf(url, file_path)
+
+     # get the text from the pdf file
+     text = get_text_from_pdf(file_path)
+     return text
+
+ def download_pdf(url, file_name):
+     """
+     Download the pdf file from the given url and save it as file_name
+     """
+     # send GET request
+     response = requests.get(url)
+
+     # save the PDF
+     if response.status_code == 200:
+         with open(file_name, "wb") as f:
+             f.write(response.content)
+     elif response.status_code == 404:
+         raise ValueError('cannot download the file')
+     else:
+         print('unexpected status code: %d' % response.status_code)
+
+ ## Input formatting for the given author (reviewer)
+ # Extracting text from a link
+
+ def get_text_from_author_id(author_id, max_count=100):
+     if author_id is None:
+         raise ValueError('Input a valid author ID')
+     author_id = str(author_id)
+     # author_id = '1737249'  # sample author ID
+     url = "https://api.semanticscholar.org/graph/v1/author/%s?fields=url,name,paperCount,papers,papers.title,papers.abstract"%author_id
+     r = requests.get(url)
+     if r.status_code == 404:
+         raise ValueError('Input a valid author ID')
+     data = r.json()
+     papers = data['papers'][:max_count]
+     name = data['name']
+
+     return name, papers
+
+ ## TODO Preprocess Extracted Texts from PDFs
+ # Get a portion of the text for the actual task
+
+ def get_title(text):
+     pass
+
+ def get_abstract(text):
+     pass
+
+ def get_introduction(text):
+     pass
+
+ def get_conclusion(text):
+     pass
+
+
+ if __name__ == '__main__':
+     def run_sample():
+         url = 'https://arxiv.org/abs/2105.06506'
+         text = get_text_from_url(url)
+         assert(text[0].split('\n')[0] == 'Sanity Simulations for Saliency Methods')
+
+         text2 = get_text_from_url('https://arxiv.org/pdf/2105.06506.pdf')
+         assert(text2[0].split('\n')[0] == 'Sanity Simulations for Saliency Methods')
+
+         # text = get_text_from_url('https://arxiv.org/paetseths.pdf')
+
+     # test the code
+     run_sample()
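
For reference, a hedged sketch of the author-side entry point above (the ID is the sample commented out inside get_text_from_author_id; a live Semantic Scholar API call is assumed):

    name, papers = get_text_from_author_id('1737249', max_count=100)
    print(name, len(papers))   # author name and up to max_count paper records
    print(papers[0]['title'])  # each record carries 'title' and 'abstract' fields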
requirements.txt ADDED
@@ -0,0 +1,83 @@
+ gradio==3.19.1
+ huggingface-hub==0.8.1
+ nltk==3.7
+ numpy==1.21.6
+ py-pdf-parser==0.10.2
+ py-rouge==1.1
+ pypdf==3.3.0
+ pyrogue==0.0.2
+ requests==2.28.1
+ rouge-score==0.1.2
+ scikit-learn==1.0.2
+ scipy==1.7.3
+ scs==2.1.4
+ seaborn==0.11.2
+ segtok==1.5.11
+ semanticscholar==0.3.2
+ sentence-transformers==2.2.0
+ sentencepiece==0.1.96
+ sentry-sdk==1.9.0
+ setproctitle==1.3.0
+ shap==0.40.0
+ shapely==2.0.0
+ shortuuid==1.0.9
+ six @ file:///tmp/build/80754af9/six_1623709665295/work
+ sklearn==0.0
+ slicer==0.0.7
+ smart-open==5.2.1
+ smmap==5.0.0
+ sniffio==1.2.0
+ spacy==3.0.8
+ spacy-legacy==3.0.9
+ spacy-loggers==1.0.3
+ sqlitedict==2.0.0
+ srsly==2.4.4
+ starlette==0.22.0
+ statsmodels==0.13.2
+ tabulate==0.8.9
+ tea==0.1.4
+ tea-client==0.0.7
+ tea-console==0.0.6
+ tenacity==8.1.0
+ tensorboardX==2.5.1
+ termcolor==1.1.0
+ terminado==0.9.4
+ testpath @ file:///tmp/build/80754af9/testpath_1624638946665/work
+ text-unidecode==1.3
+ thinc==8.0.17
+ threadpoolctl==2.2.0
+ tifffile==2021.11.2
+ tld==0.10
+ tokenizers==0.10.3
+ tomli==2.0.1
+ toolz==0.12.0
+ torch==1.9.0
+ torchaudio==0.10.2
+ torchdata==0.3.0
+ torchtext==0.12.0
+ torchvision==0.8.2
+ tornado @ file:///tmp/build/80754af9/tornado_1606942283357/work
+ tqdm==4.62.2
+ traitlets==5.3.0
+ transformers==4.3.3
+ transformers-interpret==0.5.2
+ typer==0.3.2
+ typing-extensions @ file:///tmp/build/80754af9/typing_extensions_1624965014186/work
+ tzlocal==2.1
+ uc-micro-py==1.0.1
+ urllib3==1.26.6
+ uvicorn==0.20.0
+ Wand==0.6.10
+ wandb==0.12.21
+ wasabi==0.10.1
+ wcwidth @ file:///tmp/build/80754af9/wcwidth_1593447189090/work
+ webencodings==0.5.1
+ websockets==10.4
+ Werkzeug==2.2.2
+ widgetsnbextension==3.5.1
+ Wikipedia-API==0.5.4
+ word2number==1.1
+ wrapt==1.12.1
+ xxhash==2.0.2
+ yarl==1.7.2
+ zipp @ file:///tmp/build/80754af9/zipp_1625570634446/work
score.py ADDED
@@ -0,0 +1,149 @@
+ from sentence_transformers import util
+ from nltk.tokenize import sent_tokenize
+ import torch
+ import numpy as np
+
+ def compute_sentencewise_scores(model, query_sents, candidate_sents):
+     # pairwise cosine similarity between query and candidate sentences
+     q_v, c_v = get_embedding(model, query_sents, candidate_sents)
+     return util.cos_sim(q_v, c_v)
+
+ def get_embedding(model, query_sents, candidate_sents):
+     q_v = model.encode(query_sents)
+     c_v = model.encode(candidate_sents)
+     return q_v, c_v
+
+ def get_top_k(score_mat, K=3):
+     """
+     Pick top K sentences to show
+     """
+     idx = torch.argsort(-score_mat)
+     picked_sent = idx[:,:K]
+     picked_scores = torch.vstack(
+         [score_mat[i,picked_sent[i]] for i in range(picked_sent.shape[0])]
+     )
+     return picked_sent, picked_scores
+
+ def get_words(sent):
+     words = []          # words of each sentence
+     sent_start_id = []  # keep track of the word index where each new sentence starts
+     counter = 0
+     for x in sent:
+         w = x.split()
+         words.append(w)
+         counter += len(w)
+         sent_start_id.append(counter)
+     all_words = [item for sublist in words for item in sublist]
+     sent_start_id.pop()
+     sent_start_id = [0] + sent_start_id
+     assert(len(sent_start_id) == len(sent))
+     return words, all_words, sent_start_id
+
+ def mark_words(words, all_words, sent_start_id, sent_ids, sent_scores):
+     num_query_sent = sent_ids.shape[0]
+     num_words = len(all_words)
+
+     output = dict()
+     output['all_words'] = all_words
+     output['words_by_sentence'] = words
+
+     # for each query sentence, mark the highlight information
+     for i in range(num_query_sent):
+         is_selected_sent = np.zeros(num_words)
+         is_selected_phrase = np.zeros(num_words)
+         word_scores = np.zeros(num_words) + 1e-4
+
+         # get sentence selection information
+         for sid, sscore in zip(sent_ids[i], sent_scores[i]):
+             if sid + 1 < len(sent_start_id):
+                 sent_range = (sent_start_id[sid], sent_start_id[sid+1])
+                 is_selected_sent[sent_range[0]:sent_range[1]] = 1
+                 word_scores[sent_range[0]:sent_range[1]] = sscore
+             else:
+                 # last sentence runs to the end of the text
+                 is_selected_sent[sent_start_id[sid]:] = 1
+                 word_scores[sent_start_id[sid]:] = sscore
+
+         # TODO get phrase selection information
+         output[i] = {
+             'is_selected_sent': is_selected_sent,
+             'is_selected_phrase': is_selected_phrase,
+             'scores': word_scores
+         }
+
+     return output
+
+ def get_highlight_info(model, text1, text2, K=3):
+     sent1 = sent_tokenize(text1)  # query
+     sent2 = sent_tokenize(text2)  # candidate
+     score_mat = compute_sentencewise_scores(model, sent1, sent2)
+
+     sent_ids, sent_scores = get_top_k(score_mat, K=K)
+     words2, all_words2, sent_start_id2 = get_words(sent2)
+     info = mark_words(words2, all_words2, sent_start_id2, sent_ids, sent_scores)
+
+     return sent_ids, sent_scores, info
+
+ ## Document-level operations
+
+ def predict_docscore(doc_model, tokenizer, query, titles, abstracts, batch=20):
+     # concatenate title and abstract
+     title_abs = []
+     for t, a in zip(titles, abstracts):
+         if t is not None and a is not None:
+             title_abs.append(t + ' [SEP] ' + a)
+
+     num_docs = len(title_abs)
+     no_iter = int(np.ceil(num_docs / batch))
+
+     scores = []
+     with torch.no_grad():
+         # batched inference
+         for i in range(no_iter):
+             inputs = tokenizer(
+                 [query] + title_abs[i*batch:(i+1)*batch],
+                 padding=True,
+                 truncation=True,
+                 return_tensors="pt",
+                 max_length=512
+             )
+             inputs = inputs.to(doc_model.device)
+             result = doc_model(**inputs)
+
+             # take the first ([CLS]) token of each sequence as its embedding
+             embeddings = result.last_hidden_state[:, 0, :].detach().cpu().numpy()
+
+             # compute cosine similarity between the query and each document
+             q_emb = embeddings[0,:]
+             p_emb = embeddings[1:,:]
+             nn = np.linalg.norm(q_emb) * np.linalg.norm(p_emb, axis=1)
+             scores += list(np.dot(p_emb, q_emb) / nn)
+
+     assert(len(scores) == num_docs)
+
+     return scores
+
+ def compute_overall_score(doc_model, tokenizer, query, papers, batch=5):
+     titles = []
+     abstracts = []
+     for p in papers:
+         # keep only papers with both fields so scores stay aligned with titles/abstracts
+         if p['title'] is not None and p['abstract'] is not None:
+             titles.append(p['title'])
+             abstracts.append(p['abstract'])
+     scores = predict_docscore(doc_model, tokenizer, query, titles, abstracts, batch=batch)
+
+     # sort by descending affinity score
+     idx_sorted = np.argsort(scores)[::-1]
+     titles_sorted = [titles[x] for x in idx_sorted]
+     abstracts_sorted = [abstracts[x] for x in idx_sorted]
+     scores_sorted = [scores[x] for x in idx_sorted]
+
+     return titles_sorted, abstracts_sorted, scores_sorted
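
A minimal sketch of driving the sentence-level scoring in score.py directly (the model name matches the one loaded in app.py; sent_tokenize additionally needs the NLTK 'punkt' data via nltk.download('punkt'); the texts are placeholders):

    from sentence_transformers import SentenceTransformer
    sent_model = SentenceTransformer('sentence-transformers/gtr-t5-base')

    query = 'We propose a new saliency method. It is validated on synthetic data.'
    candidate = 'This paper studies saliency maps. Experiments use synthetic benchmarks.'

    sent_ids, sent_scores, info = get_highlight_info(sent_model, query, candidate, K=1)
    # info[i]['scores'] holds one score per word of the candidate text,
    # aligned with info['all_words'], for query sentence i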