import numpy as np
from pypdf import PdfReader
from urllib.parse import urlparse
import requests
from semanticscholar import SemanticScholar


### Input Formatting Module

## Input formatting for the given paper
# Extracting text from a pdf or a link
def get_text_from_pdf(file_path):
    """ Convert a pdf to a list of per-page text strings """
    reader = PdfReader(file_path)
    text = []
    for p in reader.pages:
        t = p.extract_text()
        text.append(t)
    return text


def get_text_from_url(url, file_path='paper.pdf'):
    """ Get the text of a paper from a url """
    # TODO check for other valid urls (e.g. semantic scholar)
    ## Check for different URL cases
    url_parts = urlparse(url)
    # arxiv
    if 'arxiv' in url_parts.netloc:
        if 'abs' in url_parts.path:
            # abstract page, change the url to the pdf link
            paper_id = url_parts.path.split('/')[-1]
            url = 'https://www.arxiv.org/pdf/%s.pdf' % paper_id
        elif 'pdf' in url_parts.path:
            # already a pdf link, use it as-is
            pass
        else:
            raise ValueError('invalid url')
    else:
        raise ValueError('invalid url')
    # download the file
    download_pdf(url, file_path)
    # get the text from the pdf file
    text = get_text_from_pdf(file_path)
    return text


def download_pdf(url, file_name):
    """ Download the pdf file from the given url and save it as file_name """
    # Send GET request
    response = requests.get(url)
    # Save the PDF
    if response.status_code == 200:
        with open(file_name, "wb") as f:
            f.write(response.content)
    elif response.status_code == 404:
        raise ValueError('cannot download the file')
    else:
        raise ValueError('cannot download the file (status code %d)' % response.status_code)


## Input formatting for the given author (reviewer)
# Extracting the author's name and papers from the Semantic Scholar API
def get_text_from_author_id(author_id, max_count=100):
    """ Get the author's name and up to max_count of their papers (title and abstract) """
    if author_id is None:
        raise ValueError('Input valid author ID')
    author_id = str(author_id)
    # author_id = '1737249'
    url = "https://api.semanticscholar.org/graph/v1/author/%s?fields=url,name,paperCount,papers,papers.title,papers.abstract" % author_id
    r = requests.get(url)
    if r.status_code == 404:
        raise ValueError('Input valid author ID')
    data = r.json()
    papers = data['papers'][:max_count]
    name = data['name']
    return name, papers


## TODO Preprocess Extracted Texts from PDFs
# Get a specific portion of the extracted text for the actual task
def get_title(text):
    pass


def get_abstract(text):
    pass


def get_introduction(text):
    pass


def get_conclusion(text):
    pass


if __name__ == '__main__':
    def run_sample():
        url = 'https://arxiv.org/abs/2105.06506'
        text = get_text_from_url(url)
        assert text[0].split('\n')[0] == 'Sanity Simulations for Saliency Methods'
        text2 = get_text_from_url('https://arxiv.org/pdf/2105.06506.pdf')
        assert text2[0].split('\n')[0] == 'Sanity Simulations for Saliency Methods'
        # text = get_text_from_url('https://arxiv.org/paetseths.pdf')

    # test the code
    run_sample()
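
# A minimal, hedged sketch for the TODO get_title above. It assumes the title is
# the first non-empty line of page 1, which is what the assertions in run_sample
# check for the sample paper. It is an illustrative heuristic only, not a robust
# layout-aware parser, and get_title_sketch is a hypothetical helper name rather
# than part of the module's intended interface.
def get_title_sketch(text):
    """ Heuristic: return the first non-empty line of the first page as the title """
    for line in text[0].split('\n'):
        if line.strip():
            return line.strip()
    return None

# Example usage (assuming text came from get_text_from_pdf or get_text_from_url):
#   title = get_title_sketch(text)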