import numpy as np
from pypdf import PdfReader
from urllib.parse import urlparse
import requests
from semanticscholar import SemanticScholar

### Input Formatting Module

## Input formatting for the given paper
# Extracting text from a pdf or a link


def get_text_from_pdf(file_path):
    """Extract text from a PDF, one string per page.

    Parameters
    ----------
    file_path : str
        Path to a local PDF file.

    Returns
    -------
    list[str]
        Extracted text of each page, in page order.
    """
    reader = PdfReader(file_path)
    return [page.extract_text() for page in reader.pages]


def get_text_from_url(url, file_path='paper.pdf'):
    """Download the paper at *url* and return its text page by page.

    Only arXiv URLs are supported: an abstract page (``.../abs/<id>``)
    is rewritten to its PDF link, and a direct PDF link is used as-is.

    Parameters
    ----------
    url : str
        arXiv abstract or PDF URL.
    file_path : str, optional
        Where to save the downloaded PDF (default ``'paper.pdf'``).

    Returns
    -------
    list[str]
        Extracted text of each page.

    Raises
    ------
    ValueError
        If the URL is not a recognized arXiv link, or the download
        returns HTTP 404.
    """
    ## Check for different URL cases
    url_parts = urlparse(url)
    # Only arXiv links are recognized for now.
    if 'arxiv' not in url_parts.netloc:
        raise ValueError('invalid url')
    if 'abs' in url_parts.path:
        # Abstract page: rewrite the url to the corresponding PDF link.
        paper_id = url_parts.path.split('/')[-1]
        url = 'https://www.arxiv.org/pdf/%s.pdf' % (paper_id)
    elif 'pdf' not in url_parts.path:
        # Neither an abstract page nor a direct PDF link.
        raise ValueError('invalid url')

    # download the file, then extract its text
    download_pdf(url, file_path)
    text = get_text_from_pdf(file_path)
    return text


def download_pdf(url, file_name):
    """Download the PDF at *url* and save it as *file_name*.

    Parameters
    ----------
    url : str
        Direct link to a PDF file.
    file_name : str
        Local path to write the downloaded bytes to.

    Raises
    ------
    ValueError
        If the server responds with HTTP 404.
    requests.HTTPError
        For any other 4xx/5xx response.
    """
    # Send GET request; timeout so a dead server cannot hang us forever.
    response = requests.get(url, timeout=60)

    # Save the PDF
    if response.status_code == 200:
        with open(file_name, "wb") as f:
            f.write(response.content)
    elif response.status_code == 404:
        raise ValueError('cannot download the file')
    else:
        # BUG FIX: previously other error statuses were only printed and
        # control returned normally, so the caller went on to open a
        # missing/stale file. Keep the status print for continuity, but
        # fail loudly on any other 4xx/5xx.
        print(response.status_code)
        response.raise_for_status()


## Input formatting for the given author (reviewer)
# Extracting text from a link

def get_text_from_author_id(author_id, max_count=100):
    """Fetch an author's name and paper list from Semantic Scholar.

    Parameters
    ----------
    author_id : str or int
        Semantic Scholar numeric author ID, or a full author-page URL
        (``https://www.semanticscholar.org/author/<name>/<id>``).
    max_count : int, optional
        Maximum number of papers to return (default 100).

    Returns
    -------
    tuple[str, list[dict]]
        The author's name and up to *max_count* paper records
        (title / abstract / url fields).

    Raises
    ------
    ValueError
        If *author_id* is ``None`` or the author page is not found.
    """
    if author_id is None:
        raise ValueError('Input valid author ID')
    aid = str(author_id)
    if 'http' in aid:
        # handle semantic scholar url input: the numeric ID is the second
        # path segment after 'author' (.../author/<name>/<id>)
        aid = aid.split('/')
        aid = aid[aid.index('author') + 2]
    url = "https://api.semanticscholar.org/graph/v1/author/%s?fields=url,name,paperCount,papers,papers.title,papers.abstract,papers.url" % aid
    # timeout so a dead server cannot hang us forever
    r = requests.get(url, timeout=60)
    if r.status_code == 404:
        raise ValueError('Author link not found.')
    data = r.json()
    papers = data['papers'][:max_count]
    name = data['name']
    return name, papers


## TODO Preprocess Extracted Texts from PDFs
# Get a portion of the text for the actual task

def get_title(text):
    """TODO: extract the paper title from *text*."""
    pass


def get_abstract(text):
    """TODO: extract the abstract section from *text*."""
    pass


def get_introduction(text):
    """TODO: extract the introduction section from *text*."""
    pass


def get_conclusion(text):
    """TODO: extract the conclusion section from *text*."""
    pass