File size: 2,666 Bytes
6eff5e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2fad322
 
 
 
81ca652
6eff5e7
 
2fad322
6eff5e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
963bf46
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import numpy as np
from pypdf import PdfReader
from urllib.parse import urlparse
import requests
from semanticscholar import SemanticScholar

### Input Formatting Module

## Input formatting for the given paper
# Extracting text from a pdf or a link

def get_text_from_pdf(file_path):
    """
    Extract the text of a PDF file, one string per page.

    Parameters
    ----------
    file_path : str
        Path to the PDF file on disk.

    Returns
    -------
    list of str
        The extracted text of each page, in page order. Entries may be
        empty strings for pages with no extractable text.
    """
    reader = PdfReader(file_path)
    # One extract_text() call per page; a comprehension replaces the
    # manual append loop.
    return [page.extract_text() for page in reader.pages]

def get_text_from_url(url, file_path='paper.pdf'):
    """
    Download the paper behind an arXiv URL and return its text.

    Parameters
    ----------
    url : str
        An arXiv abstract page (``.../abs/<id>``) or direct PDF link
        (``.../pdf/<id>...``). Other hosts are rejected.
    file_path : str
        Local path where the downloaded PDF is saved.

    Returns
    -------
    list of str
        One extracted-text string per PDF page.

    Raises
    ------
    ValueError
        If the URL is not a recognized arXiv abstract/PDF link.
    """
    ## Check for different URL cases
    url_parts = urlparse(url)
    # Only arXiv is supported. Match the host exactly (arxiv.org or any
    # subdomain such as www./export.) instead of the old substring test,
    # which also accepted unrelated hosts like 'myarxiv.com'.
    netloc = url_parts.netloc.lower()
    if netloc == 'arxiv.org' or netloc.endswith('.arxiv.org'):
        # strip('/') tolerates trailing slashes, which previously yielded
        # an empty paper id from split('/')[-1].
        path = url_parts.path.strip('/')
        if path.startswith('abs/'):
            # abstract page, change the url to pdf link
            paper_id = path.split('/')[-1]
            url = 'https://www.arxiv.org/pdf/%s.pdf'%(paper_id)
        elif path.startswith('pdf/'):
            # pdf file, pass
            pass
        else:
            raise ValueError('invalid url')
    else:
        raise ValueError('invalid url')

    # download the file
    download_pdf(url, file_path)

    # get the text from the pdf file
    text = get_text_from_pdf(file_path)
    return text
    
def download_pdf(url, file_name):
    """
    Download the PDF file from the given url and save it as file_name.

    Parameters
    ----------
    url : str
        Direct link to the PDF.
    file_name : str
        Local path to write the downloaded bytes to.

    Raises
    ------
    ValueError
        If the server does not return HTTP 200. Callers rely on the file
        existing afterwards, so any failed download must raise.
    """
    # Send GET request; bound the wait so a dead server cannot hang us.
    response = requests.get(url, timeout=30)

    # Save the PDF only on success.
    if response.status_code == 200:
        with open(file_name, "wb") as f:
            f.write(response.content)
    else:
        # Previously only 404 raised; other error codes were merely
        # printed and the function returned with no file written, causing
        # a confusing failure later when the PDF was opened.
        raise ValueError('cannot download the file')
        
## Input formatting for the given author (reviewer)
# Extracting text from a link

def get_text_from_author_id(author_id, max_count=100):
    """
    Fetch an author's name and papers from the Semantic Scholar Graph API.

    Parameters
    ----------
    author_id : int or str
        A numeric Semantic Scholar author id, or a full Semantic Scholar
        author URL (e.g. ``https://www.semanticscholar.org/author/<name>/<id>``).
    max_count : int
        Maximum number of paper records to return.

    Returns
    -------
    (str, list)
        The author's name and up to ``max_count`` paper records (each
        carrying title, abstract and url fields per the requested API
        ``fields``).

    Raises
    ------
    ValueError
        If ``author_id`` is None or the author link is not found (404).
    """
    if author_id is None:
        raise ValueError('Input valid author ID')
    aid = str(author_id)
    if 'http' in aid: # handle semantic scholar url input
        # The numeric id is the last path segment of the author URL.
        # This works with or without the name segment and with trailing
        # slashes; the old index('author')+2 lookup raised IndexError on
        # URLs of the form .../author/<id>.
        aid = aid.rstrip('/').split('/')[-1]
    url = "https://api.semanticscholar.org/graph/v1/author/%s?fields=url,name,paperCount,papers,papers.title,papers.abstract,papers.url"%aid
    r = requests.get(url)
    if r.status_code == 404:
        raise ValueError('Author link not found.')
    data = r.json()
    papers = data['papers'][:max_count]
    name = data['name']

    return name, papers

## TODO Preprocess Extracted Texts from PDFs
# Get a portion of the text for actual task

def get_title(text):
    """Placeholder: extract the paper title from the extracted text. Not implemented yet (see TODO above)."""
    pass

def get_abstract(text):
    """Placeholder: extract the abstract from the extracted text. Not implemented yet."""
    pass

def get_introduction(text):
    """Placeholder: extract the introduction section from the extracted text. Not implemented yet."""
    pass

def get_conclusion(text):
    """Placeholder: extract the conclusion section from the extracted text. Not implemented yet."""
    pass