# Spaces: Runtime error
# (page status banner captured together with the source; not part of the module)
import numpy as np | |
from pypdf import PdfReader | |
from urllib.parse import urlparse | |
import requests | |
from semanticscholar import SemanticScholar | |
### Input Formatting Module
## Input formatting for the given paper
# Extracting text from a PDF file or from a link to a paper
def get_text_from_pdf(file_path):
    """
    Read a PDF and return its text as a list with one entry per page.

    The PDF at `file_path` is opened with `PdfReader`; each entry of the
    returned list is the result of `extract_text()` for the corresponding
    page, in page order.
    """
    pdf = PdfReader(file_path)
    return [page.extract_text() for page in pdf.pages]
def get_text_from_url(url, file_path='paper.pdf'):
    """
    Download an arXiv paper and return its text, one string per page.

    Args:
        url: Link to an arXiv abstract page (``.../abs/<id>``) or to an
            arXiv PDF (any path containing ``pdf``).
        file_path: Where the downloaded PDF is saved before it is parsed.

    Returns:
        The list produced by `get_text_from_pdf` for the downloaded file.

    Raises:
        ValueError: If the host is not an arXiv host, the path is neither an
            abstract page nor a PDF link, or the abstract link has no paper id.
    """
    url_parts = urlparse(url)

    # Only arXiv links are supported.
    if 'arxiv' not in url_parts.netloc:
        raise ValueError('invalid url')

    # Match 'abs' as a whole path segment so that a path which merely
    # contains the letters "abs" is not mistaken for an abstract page.
    segments = url_parts.path.split('/')
    if 'abs' in segments:
        # Keep everything after 'abs': old-style identifiers such as
        # 'hep-th/9901001' span two segments, and a trailing slash must not
        # produce an empty id.
        paper_id = '/'.join(segments[segments.index('abs') + 1:]).strip('/')
        if not paper_id:
            raise ValueError('invalid url')
        # Abstract page: rewrite the link so it points at the PDF.
        url = 'https://www.arxiv.org/pdf/%s.pdf' % paper_id
    elif 'pdf' not in url_parts.path:
        raise ValueError('invalid url')
    # Otherwise the link already points at a PDF and is used unchanged.

    download_pdf(url, file_path)
    return get_text_from_pdf(file_path)
def download_pdf(url, file_name, timeout=30):
    """
    Download the file at `url` and save it as `file_name`.

    Args:
        url: Address to fetch with an HTTP GET request.
        file_name: Path the response body is written to (binary mode).
        timeout: Seconds to wait for the server before `requests` gives up,
            so an unresponsive host cannot block the caller indefinitely.

    Raises:
        ValueError: If the server answers with any status other than 200.
            Nothing is written in that case, so callers never go on to parse
            a missing or stale file.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code == 404:
        raise ValueError('cannot download the file')
    if response.status_code != 200:
        # Previously the status was only printed and the function returned
        # as if the download had succeeded.
        raise ValueError(
            'cannot download the file (HTTP status %d)' % response.status_code
        )

    with open(file_name, "wb") as f:
        f.write(response.content)
## Input formatting for the given author (reviewer)
# Extracting the author's name and papers from a Semantic Scholar author ID or link
def get_text_from_author_id(author_id, max_count=100, timeout=30):
    """
    Fetch an author's name and papers from the Semantic Scholar Graph API.

    Args:
        author_id: A Semantic Scholar author id, or an author page link
            whose path has the form ``.../author/<name>/<id>``.
        max_count: Maximum number of entries kept from the returned
            ``papers`` list.
        timeout: Seconds to wait for the API before `requests` gives up.

    Returns:
        A ``(name, papers)`` tuple taken from the ``name`` and ``papers``
        fields of the API response. The request asks for each paper's
        title, abstract and url.

    Raises:
        ValueError: If `author_id` is None, the link does not contain an
            author id, the author is not found (404), or the API answers
            with any other non-200 status.
    """
    if author_id is None:
        raise ValueError('Input valid author ID')

    aid = str(author_id)
    if 'http' in aid:  # handle semantic scholar url input
        # Work on the path only, so a query string or fragment never ends up
        # inside the id; the id is the second segment after 'author'.
        segments = urlparse(aid).path.split('/')
        try:
            aid = segments[segments.index('author') + 2]
        except (ValueError, IndexError):
            raise ValueError('Input valid author ID') from None
        if not aid:
            raise ValueError('Input valid author ID')

    url = "https://api.semanticscholar.org/graph/v1/author/%s?fields=url,name,paperCount,papers,papers.title,papers.abstract,papers.url"%aid
    r = requests.get(url, timeout=timeout)
    if r.status_code == 404:
        raise ValueError('Author link not found.')
    if r.status_code != 200:
        # Error payloads carry no 'papers'/'name' keys; fail clearly here
        # instead of with a KeyError below.
        raise ValueError(
            'Semantic Scholar request failed (HTTP status %d)' % r.status_code
        )

    data = r.json()
    papers = data['papers'][:max_count]
    name = data['name']
    return name, papers
## TODO: Preprocess the texts extracted from PDFs
# Get the portion of the text needed for the actual task
def get_title(text):
    """
    Placeholder for extracting the title from `text`.

    Not implemented yet: the argument is ignored and None is returned.
    """
    return None
def get_abstract(text):
    """
    Placeholder for extracting the abstract from `text`.

    Not implemented yet: the argument is ignored and None is returned.
    """
    return None
def get_introduction(text):
    """
    Placeholder for extracting the introduction from `text`.

    Not implemented yet: the argument is ignored and None is returned.
    """
    return None
def get_conclusion(text):
    """
    Placeholder for extracting the conclusion from `text`.

    Not implemented yet: the argument is ignored and None is returned.
    """
    return None