# File size: 2,459 Bytes
# 6be34e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import tarfile
import os
import requests
import datetime
import pandas as pd
import shutil
from bs4 import  BeautifulSoup
from tqdm import tqdm
import base64

def ToBase64(file):
    """Read a file in binary mode and return its contents Base64-encoded.

    :param file: path of the file to encode
    :return: the Base64 encoding of the file's raw bytes, as ``bytes``
    """
    with open(file, 'rb') as stream:
        return base64.b64encode(stream.read())

def archive_dir(dir_name,output_filename,format="zip"):
    """Pack the contents of a directory into a single archive file.

    :param dir_name: directory whose contents become the root of the archive
    :param output_filename: path of the archive to create, WITHOUT extension
    :param format: archive format name understood by ``shutil.make_archive``
        (for example "zip", "tar" or "gztar")
    :return: ``output_filename`` followed by the extension of the archive
        that was actually written (".zip" for the default format)
    """
    archive_path = shutil.make_archive(output_filename, format, dir_name)
    # make_archive may hand back an absolute path; take only the extension it
    # appended so the result keeps the caller's own path form and still
    # matches the chosen format instead of always claiming ".zip".
    stem = os.path.basename(output_filename)
    extension = os.path.basename(archive_path)[len(stem):]
    return output_filename + extension
    
def make_dir_if_not_exist(folder):
    """Create ``folder`` (including missing parents) unless the path exists.

    :param folder: directory path to create
    """
    if os.path.exists(folder):
        return
    os.makedirs(folder)

def untar(fname, dirs):
    """
    Extract a tar archive (e.g. a tar.gz file) into a directory.

    Any failure is reported by printing the exception and returning False
    instead of raising.

    :param fname: path of the archive file
    :param dirs: directory the extracted contents are stored in
    :return: bool, True when extraction succeeded, False otherwise
    """

    try:
        # The context manager closes the archive even when extraction fails.
        with tarfile.open(fname) as archive:
            if hasattr(tarfile, "data_filter"):
                # Archives come from the network: refuse members that would
                # be written outside ``dirs`` (absolute paths, "..", links).
                archive.extractall(path=dirs, filter="data")
            else:
                # Older Python without extraction filters.
                archive.extractall(path=dirs)
        return True
    except Exception as e:
        # Deliberately best-effort: the caller only needs success/failure.
        print(e)
        return False
    
def get_timestamp():
    """Return the current time as a compact ``YYYYMMDDHHMMSS`` string.

    The value comes from ``datetime.datetime.now()`` and is formatted
    directly, without going through a string/pandas round-trip.

    :return: 14-digit timestamp string, e.g. "20240131235959"
    """
    return datetime.datetime.now().strftime('%Y%m%d%H%M%S')

def get_name_from_arvix(url):
    """Fetch a page and build a dash-separated name from its title heading.

    The page is parsed with BeautifulSoup and the first
    ``<h1 class="title mathjax">`` element is used.

    :param url: address of the page to fetch (presumably an arXiv abstract
        page; confirm against callers)
    :return: the heading text without its first six characters and with
        spaces replaced by "-", or '' when the heading is not found
    :raises requests.exceptions.RequestException: if the request fails or
        does not complete within the timeout
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    page = requests.get(url, timeout=30)
    heading = BeautifulSoup(page.content, 'lxml').find("h1",attrs={"class":"title mathjax"})
    if heading is None:
        return ''
    # NOTE(review): [6:] presumably drops a leading "Title:" label; confirm
    # against the page markup.
    return heading.text[6:].replace(" ","-")

def download_source(pdf_lists=None,output_base=None,project_name=None,fetch_title=True, return_source=False):
    """Download the source archive of each listed paper, unpack it and zip the results.

    A project folder ``<output_base>/<project_name><timestamp>`` is created
    with an ``input`` subfolder (downloaded ``.tar.gz`` files) and an
    ``output`` subfolder (one unpacked folder per paper). After the loop the
    ``output`` folder is archived next to them under the project folder.

    :param pdf_lists: iterable of paper links; the last "/"-separated segment
        of each link is used as the paper id
    :param output_base: directory the project folder is created in
    :param project_name: name prefix; the current timestamp is appended to it
    :param fetch_title: when True, each unpacked folder is named after the
        result of ``get_name_from_arvix`` and papers for which it returns ''
        are skipped; when False the paper id is used as the folder name
    :param return_source: when True, only print each source link and download
        nothing
    :return: None
    """
    project_name = project_name + get_timestamp()
    base = os.path.join(output_base, project_name)
    make_dir_if_not_exist(base)

    # Loop-invariant, so created once up front. This also guarantees the
    # final archive step has a directory to pack when the list is empty or
    # every paper is skipped.
    inp = os.path.join(base, 'input')
    make_dir_if_not_exist(inp)
    out = os.path.join(base, 'output')
    make_dir_if_not_exist(out)

    for pdf_link in tqdm(pdf_lists):
        file_stamp = pdf_link.split("/")[-1]
        if fetch_title:
            title = get_name_from_arvix(pdf_link)
            if not title:
                # No usable title was found for this link: skip the paper.
                continue
        else:
            title = file_stamp
        source_link = "https://arxiv.org/e-print/" + file_stamp
        if return_source:
            print(source_link)
            continue
        # A timeout keeps one stalled download from blocking the whole batch.
        response = requests.get(source_link, timeout=60)
        filepath = os.path.join(inp, file_stamp + ".tar.gz")
        # Close the file before untar reads it back.
        with open(filepath, "wb") as archive_file:
            archive_file.write(response.content)
        untar(filepath, os.path.join(out, title))
    archive_dir(out, os.path.join(base, project_name))

if __name__ == '__main__':
    # When run as a script, just print the current timestamp string.
    print(get_timestamp())