paper-matching / input_format.py
jskim's picture
pdf input removed. now retrieving urls for reviewer papers.
81ca652
raw
history blame
2.67 kB
import numpy as np
from pypdf import PdfReader
from urllib.parse import urlparse
import requests
from semanticscholar import SemanticScholar
### Input Formatting Module
## Input formatting for the given paper
# Extracting text from a pdf or a link
def get_text_from_pdf(file_path):
    """
    Read a pdf and return its extracted text, page by page.

    Returns a list with one entry per page, in page order; each entry is
    whatever ``extract_text()`` yields for that page.
    """
    pdf = PdfReader(file_path)
    return [page.extract_text() for page in pdf.pages]
def get_text_from_url(url, file_path='paper.pdf'):
    """
    Get text of the paper from a url

    Only arXiv links are accepted. An abstract-page link (``/abs/<id>``) is
    rewritten to the matching pdf link; a link whose path contains ``pdf`` is
    downloaded as given.

    Args:
        url: arXiv abstract or pdf url of the paper.
        file_path: local path the downloaded pdf is written to.

    Returns:
        The value returned by ``get_text_from_pdf`` for the downloaded file.

    Raises:
        ValueError: if the url is not an arXiv abstract/pdf link, or if an
            abstract link carries no paper id.
    """
    ## Check for different URL cases
    url_parts = urlparse(url)
    if 'arxiv' not in url_parts.netloc:
        raise ValueError('invalid url')

    # Drop empty segments so doubled or trailing slashes do not matter
    segments = [s for s in url_parts.path.split('/') if s]
    if 'abs' in segments:
        # abstract page, change the url to pdf link.
        # Keep everything after 'abs': old-style ids such as
        # 'hep-th/9901001' contain a slash themselves.
        paper_id = '/'.join(segments[segments.index('abs') + 1:])
        if not paper_id:
            # e.g. '.../abs/' -- nothing to download
            raise ValueError('invalid url')
        url = 'https://www.arxiv.org/pdf/%s.pdf' % paper_id
    elif 'pdf' not in url_parts.path:
        raise ValueError('invalid url')
    # otherwise: already a pdf link, use it unchanged

    # download the file
    download_pdf(url, file_path)
    # get the text from the pdf file
    text = get_text_from_pdf(file_path)
    return text
def download_pdf(url, file_name):
    """
    Download the pdf file from given url and save it as file_name

    Args:
        url: address of the pdf to fetch.
        file_name: local path the response body is written to.

    Raises:
        ValueError: if the server answers with any status other than 200, so
            that callers never go on to read a missing or stale file.
    """
    # Send GET request; bound the wait so an unresponsive server cannot
    # hang the caller indefinitely
    response = requests.get(url, timeout=30)

    if response.status_code == 404:
        raise ValueError('cannot download the file')
    if response.status_code != 200:
        # Previously the status was only printed and the function returned
        # as if the download had succeeded.
        raise ValueError(
            'cannot download the file (status code %d)' % response.status_code
        )

    # Save the PDF
    with open(file_name, "wb") as f:
        f.write(response.content)
## Input formatting for the given author (reviewer)
# Extracting text from a link
def get_text_from_author_id(author_id, max_count=100):
    """
    Fetch an author's name and papers from the Semantic Scholar Graph API.

    Args:
        author_id: Semantic Scholar author id, or an author-page url of the
            form ``.../author/<name>/<id>`` or ``.../author/<id>``.
        max_count: maximum number of papers to return.

    Returns:
        A ``(name, papers)`` tuple: the ``name`` field of the API response and
        the first ``max_count`` entries of its ``papers`` field.

    Raises:
        ValueError: if ``author_id`` is None, if a url carries no author id,
            or if the API request does not succeed.
    """
    if author_id is None:
        raise ValueError('Input valid author ID')
    aid = str(author_id)
    if 'http' in aid:  # handle semantic scholar url input
        # Work on the path only so a query string or fragment never leaks
        # into the id, and ignore empty segments from trailing slashes.
        segments = [s for s in urlparse(aid).path.split('/') if s]
        if 'author' not in segments:
            raise ValueError('Input valid author ID')
        tail = segments[segments.index('author') + 1:]
        if not tail:
            raise ValueError('Input valid author ID')
        # '/author/<name>/<id>' has the id second; '/author/<id>' has it first
        aid = tail[1] if len(tail) > 1 else tail[0]
    url = "https://api.semanticscholar.org/graph/v1/author/%s?fields=url,name,paperCount,papers,papers.title,papers.abstract,papers.url" % aid
    # Bound the wait so an unresponsive API cannot hang the caller
    r = requests.get(url, timeout=30)
    if r.status_code == 404:
        raise ValueError('Author link not found.')
    if r.status_code != 200:
        # Error responses do not carry 'papers'/'name'; fail with a clear
        # message instead of a KeyError further down.
        raise ValueError('Author request failed with status code %d' % r.status_code)
    data = r.json()
    papers = data['papers'][:max_count]
    name = data['name']
    return name, papers
## TODO Preprocess Extracted Texts from PDFs
# Get a portion of the text for the actual task
def get_title(text):
    """Placeholder for title extraction; not implemented yet, returns None."""
    return None
def get_abstract(text):
    """Placeholder for abstract extraction; not implemented yet, returns None."""
    return None
def get_introduction(text):
    """Placeholder for introduction extraction; not implemented yet, returns None."""
    return None
def get_conclusion(text):
    """Placeholder for conclusion extraction; not implemented yet, returns None."""
    return None