import streamlit as st
import pandas as pd
import numpy as np
import time
import altair as alt
import base64
import tarfile
import os
import requests
from tqdm import tqdm
from backend import *
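# Helpers assumed to be provided by backend (interface inferred from usage below):
#   get_timestamp() -> str            timestamp used to name the download folder
#   make_dir_if_not_exist(path)       mkdir -p behaviour
#   get_name_from_arvix(url) -> str   paper title resolved from the abstract page
#   untar(src, dst)                   extract a .tar.gz archive into dst
#   archive_dir(src, dst) -> str      zip a directory, return the zip path
#   ToBase64(path) -> bytes           read a file and base64-encode it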

predefined_limits = 10  # matches the "up to 10 links" limit stated in the UI

st.set_page_config(page_title="arXiv2Latex Downloader", page_icon=":page_with_curl:", layout="wide", initial_sidebar_state="expanded", menu_items={
    "About": "Download the source LaTeX code of multiple arXiv papers with one click"
})

# title
st.title("arXiv2Latex Downloader")

# input arxiv links to download
pdf_links_input = st.text_area("Please input the paper links you want to download, one per line, following the format below (currently supports up to 10 links).", "")
st.markdown("""
            Input example:
            ```text
            https://arxiv.org/abs/1512.03385
            https://arxiv.org/abs/1706.03762
            https://arxiv.org/abs/2009.09724
            ```
            """)
## one-click download
crawling_or_not = st.button("Crawl the LaTeX Code")
if crawling_or_not:
    print("Crawling...")
    pdf_lists = pdf_links_input.split("\n")
    print(pdf_lists)
    # cleaning the pdf lists
    pdf_lists = [i.strip() for i in pdf_lists if len(i) > 0]
    # TODO: limit the number of papers to 10, since I am not sure whether
    # base64 data-URI downloads can handle large files
    # try:
    if len(pdf_lists) > predefined_limits:
        st.warning(f"Currently only supports up to {predefined_limits} papers. Please input at most {predefined_limits} links.")
    else:
        # parsing
        base = './download/'
        project_name = get_timestamp().replace(" ", "-")
        base = os.path.join(base, project_name)
        make_dir_if_not_exist(base)
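        # directory layout: download/<timestamp>/input holds the raw tarballs,
        # download/<timestamp>/output holds the extracted sources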
        
        # st.write(download_status)
        with st.spinner("Downloading papers..."):
            # progress bar
            bar = st.progress(0)
            download_status = st.empty()
            N = len(pdf_lists)
            for i, pdf_link in enumerate(tqdm(pdf_lists)):
                title = get_name_from_arvix(pdf_link)
                file_stamp = pdf_link.split("/")[-1]
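                # arXiv serves the LaTeX source at /e-print/<arxiv-id>
                # (usually a gzipped tarball; occasionally a single gzipped .tex file)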
                source_link = "https://arxiv.org/e-print/"+file_stamp
                inp = os.path.join(base,'input')
                make_dir_if_not_exist(inp)
                out = os.path.join(base,'output')
                make_dir_if_not_exist(out)
                response = requests.get(source_link)
                response.raise_for_status()  # fail fast on a bad arXiv id or a network error
                filename = file_stamp + ".tar.gz"
                filepath = os.path.join(inp, filename)
                with open(filepath, "wb") as f:
                    f.write(response.content)
                outpath = os.path.join(out,title)
                untar(filepath,outpath)
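                # untar (from backend) is assumed to extract the archive into a
                # per-paper folder named after its title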
                
                # finish one paper
                bar.progress((i+1)/N)
                download_status.text(f"Iteration [{i+1}/{N}]: finished downloading {title}")
        
        with st.spinner("Archiving as Zip Files..."):
            # bundle all extracted sources into a single zip file
            filepath = archive_dir(out, os.path.join(base, project_name))

            # base64-encode the zip so it can be embedded as a data-URI download link
            b64 = ToBase64(filepath).decode()
        href = f"<a href='data:application/zip;base64,{b64}' download='arxiv2latex-output-{project_name}.zip'>Click here to download the output LaTeX zip file</a>"
        st.markdown(href, unsafe_allow_html=True)
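        # NOTE: a base64 data URI keeps the download client-side, but very large
        # payloads can fail in the browser; st.download_button is the modern
        # Streamlit alternative (hence the paper limit above)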
            
        # status
        st.success("Finished")
    # except Exception as e:
    #     st.error("Something went wrong. Please check the input or contact me to fix this bug. Error message:\n" + str(e))