Zhiming666 committed
Commit
20809c7
1 Parent(s): 84d233f

Upload 2 files

Files changed (2)
  1. app.py +88 -0
  2. requirements.txt +242 -0
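
app.py is a single-file Gradio app: it queries the arXiv export API for a search term, downloads the PDF of each matching entry into a timestamped folder under data/, writes a summary PDF of the results, and appends the query, results, and timestamp to data/information.xlsx. requirements.txt pins the package versions of the environment it was built in.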
app.py ADDED
@@ -0,0 +1,88 @@
+ import os
+ import requests
+ import xml.etree.ElementTree as ET
+ import urllib.request
+ import re
+ from datetime import datetime
+ import pandas as pd
+ from fpdf import FPDF
+ import gradio as gr
+
+ def get_arxiv_data(search_query, number):
+     url = f'http://export.arxiv.org/api/query?search_query={search_query}&start=0&max_results={number}'
+     response = requests.get(url)
+     xml_data = response.text
+     root = ET.fromstring(xml_data)
+     entries = root.findall('{http://www.w3.org/2005/Atom}entry')
+     results = []
+
+     # Create folder for current date and time
+     current_time = datetime.now().strftime('%Y_%m_%d__%H_%M')
+     folder_path = os.path.join('data', current_time)
+     os.makedirs(folder_path, exist_ok=True)
+
+     for entry in entries:
+         title = entry.find('{http://www.w3.org/2005/Atom}title').text
+         link = entry.find('{http://www.w3.org/2005/Atom}link').attrib['href']
+         published = entry.find('{http://www.w3.org/2005/Atom}published').text
+         author = entry.find('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name').text
+
+         # Skip non-arXiv links
+         if not link.startswith('http://arxiv.org/'):
+             continue
+
+         result_string = f'Title: {title}\nLink: {link}\nPublished: {published}\nAuthor: {author}\n'
+         results.append(result_string)
+
+         # Download PDF file
+         pdf_link = link.replace('abs', 'pdf') + '.pdf'
+         filename = re.sub(r'[^a-zA-Z0-9_]', '_', title) + '.pdf'
+         filepath = os.path.join(folder_path, filename)
+
+         try:
+             urllib.request.urlretrieve(pdf_link, filepath)
+         except Exception as e:
+             continue
+
+     # Save search query and results to PDF
+     pdf = FPDF()
+     pdf.add_page()
+     pdf.set_font('Arial', 'B', 16)
+     pdf.cell(0, 10, f"Search Query: {search_query}", ln=True)
+     pdf.set_font('Arial', '', 12)
+     for i, result in enumerate(results):
+         pdf.multi_cell(0, 10, f"Result {i + 1}:\n{result}\n")
+         pdf.ln(5)  # Add newline after each result
+     pdf.output(os.path.join(folder_path, '1_Search_query_AND_results.pdf'))
+
+     # Save search query, results, and current time to Excel file
+     current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+     df = pd.DataFrame({'Search Query': [search_query], 'Results': ['\n'.join(results)], 'Timestamp': [current_time]})
+     folder_path = 'data'
+     os.makedirs(folder_path, exist_ok=True)
+     excel_filepath = os.path.join(folder_path, 'information.xlsx')
+     if os.path.exists(excel_filepath):
+         existing_df = pd.read_excel(excel_filepath)
+         df = pd.concat([existing_df, df], ignore_index=True)
+     df.to_excel(excel_filepath, index=False)
+
+     return results
+
+ def search_arxiv(search_query, max_results):
+     start_time = datetime.now()
+     results = get_arxiv_data(search_query, max_results)
+     elapsed_time = datetime.now() - start_time
+     elapsed_time_str = f"Elapsed Time: {elapsed_time.total_seconds()} seconds"
+
+     return '\n'.join(results), elapsed_time_str
+
+ search_query_input = gr.inputs.Textbox(label="Search Query")
+ max_results_input = gr.inputs.Textbox(label="Max Results")
+
+ output_text = gr.outputs.Textbox(label="Results")
+ output_time = gr.outputs.Textbox(label="Elapsed Time")
+
+ title = "ArXiv Search"
+ description = "Crawling Papers on Arxiv"
+
+ gr.Interface(fn=search_arxiv, inputs=[search_query_input, max_results_input], outputs=[output_text, output_time], title=title, description=description).launch()
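
For reference, the core of get_arxiv_data() is a single HTTP GET against the arXiv export API, parsed as an Atom feed. Below is a minimal standalone sketch of that query; the search term all:electron and max_results=3 are illustrative values, not part of the commit.

# Standalone sketch of the query that get_arxiv_data() builds (example values only).
import requests
import xml.etree.ElementTree as ET

ATOM = '{http://www.w3.org/2005/Atom}'
url = 'http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=3'
root = ET.fromstring(requests.get(url).text)
for entry in root.findall(ATOM + 'entry'):
    # Each Atom entry carries the title and an abstract-page link, as used in app.py.
    print(entry.find(ATOM + 'title').text.strip())
    print(entry.find(ATOM + 'link').attrib['href'])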
requirements.txt ADDED
@@ -0,0 +1,242 @@
+ aiofiles==23.1.0
+ aiohttp==3.8.4
+ aiosignal==1.3.1
+ alabaster==0.7.13
+ altair==4.2.2
+ altgraph==0.17.3
+ anyio==3.6.2
+ argilla==1.7.0
+ arrow==1.2.3
+ arxiv==1.4.7
+ asttokens==2.2.1
+ async-generator==1.10
+ async-timeout==4.0.2
+ attrs==22.2.0
+ Babel==2.12.1
+ backcall==0.2.0
+ backoff==2.2.1
+ backports.functools-lru-cache==1.6.4
+ beautifulsoup4==4.12.2
+ bibtexparser==1.4.0
+ boto3==1.26.121
+ botocore==1.29.121
+ bottle==0.12.25
+ bottle-websocket==0.2.9
+ cachetools==5.3.0
+ certifi==2022.12.7
+ cffi==1.15.1
+ chardet==5.1.0
+ charset-normalizer==3.1.0
+ ci-info==0.3.0
+ click==8.1.3
+ cloudpickle==2.2.1
+ cmake==3.26.3
+ colorama==0.4.6
+ comm==0.1.3
+ configobj==5.0.8
+ configparser==5.3.0
+ contourpy==1.0.7
+ cryptography==40.0.2
+ cycler==0.11.0
+ dataclasses-json==0.5.7
+ debugpy==1.6.7
+ decorator==5.1.1
+ Deprecated==1.2.13
+ dnspython==2.3.0
+ docopt==0.6.2
+ docutils==0.18.1
+ Eel==0.16.0
+ entrypoints==0.4
+ et-xmlfile==1.1.0
+ etelemetry==0.3.0
+ exceptiongroup==1.1.1
+ executing==1.2.0
+ fake-useragent==1.1.3
+ fastapi==0.95.1
+ feedparser==6.0.10
+ ffmpy==0.3.0
+ filelock==3.12.0
+ fitz==0.0.1.dev2
+ fonttools==4.39.2
+ fpdf==1.7.2
+ free-proxy==1.1.1
+ frozenlist==1.3.3
+ fsspec==2023.4.0
+ future==0.18.3
+ gevent==22.10.2
+ gevent-websocket==0.10.1
+ googleapis-common-protos==1.59.0
+ gpt-index==0.6.2
+ gradio==3.28.1
+ gradio_client==0.1.3
+ greenlet==2.0.2
+ grpc-gateway-protoc-gen-openapiv2==0.1.0
+ grpcio==1.54.2
+ gym==0.26.2
+ gym-notices==0.0.8
+ h11==0.14.0
+ httpcore==0.16.3
+ httplib2==0.22.0
+ httpx==0.23.3
+ huggingface-hub==0.13.4
+ idna==3.4
+ imagesize==1.4.1
+ importlib-metadata==6.3.0
+ ipykernel==6.22.0
+ ipython==8.12.0
+ isodate==0.6.1
+ jedi==0.18.2
+ Jinja2==3.1.2
+ jmespath==1.0.1
+ joblib==1.2.0
+ jsonschema==4.17.3
+ jupyter_client==8.1.0
+ jupyter_core==5.3.0
+ kiwisolver==1.4.4
+ langchain==0.0.183
+ linkify-it-py==2.0.0
+ llama-index==0.6.4
+ loguru==0.7.0
+ lxml==4.9.2
+ lz4==4.3.2
+ Markdown==3.4.3
+ markdown-it-py==2.2.0
+ MarkupSafe==2.1.2
+ marshmallow==3.19.0
+ marshmallow-enum==1.5.1
+ matplotlib==3.7.1
+ matplotlib-inline==0.1.6
+ mdit-py-plugins==0.3.3
+ mdurl==0.1.2
+ mpmath==1.3.0
+ msg-parser==1.2.0
+ multidict==6.0.4
+ mypy-extensions==1.0.0
+ nest-asyncio==1.5.6
+ networkx==3.1
+ nibabel==5.1.0
+ nipype==1.8.6
+ nltk==3.8.1
+ numexpr==2.8.4
+ numpy==1.23.5
+ olefile==0.46
+ openai==0.27.2
+ openapi-schema-pydantic==1.2.4
+ openpyxl==3.1.2
+ openreview-py==1.22.1
+ orjson==3.8.10
+ outcome==1.2.0
+ packaging==23.0
+ pandas==1.5.3
+ parso==0.8.3
+ pdfminer.six==20221105
+ pefile==2023.2.7
+ pickleshare==0.7.5
+ Pillow==9.4.0
+ pip==23.1.2
+ pipreqs==0.4.13
+ platformdirs==3.2.0
+ prompt-toolkit==3.0.38
+ protobuf==4.23.2
+ prov==2.0.0
+ psutil==5.9.4
+ pure-eval==0.2.2
+ pyasn1==0.5.0
+ pyasn1-modules==0.3.0
+ pycparser==2.21
+ pycryptodome==3.17
+ pydantic==1.10.7
+ pydot==1.4.2
+ pydub==0.25.1
+ Pygments==2.15.1
+ pyinstaller==5.10.1
+ pyinstaller-hooks-contrib==2023.2
+ PyJWT==2.6.0
+ PyMuPDF==1.22.3
+ pypandoc==1.11
+ pyparsing==3.0.9
+ pypdf==3.9.0
+ PyPDF2==3.0.1
+ pypdfium2==4.12.0
+ pyrsistent==0.19.3
+ PySocks==1.7.1
+ python-dateutil==2.8.2
+ python-docx==0.8.11
+ python-dotenv==1.0.0
+ python-magic==0.4.27
+ python-multipart==0.0.6
+ python-pptx==0.6.21
+ pytz==2023.3
+ pywin32==304
+ pywin32-ctypes==0.2.0
+ pyxnat==1.5
+ PyYAML==6.0
+ pyzmq==25.0.2
+ rdflib==6.3.2
+ regex==2023.5.5
+ requests==2.28.2
+ rfc3986==1.5.0
+ rich==13.0.1
+ rsa==4.9
+ s3transfer==0.6.0
+ scholarly==1.7.11
+ scipy==1.10.1
+ selenium==4.9.1
+ semantic-version==2.10.0
+ setuptools==65.5.1
+ sgmllib3k==1.0.0
+ shutup==0.2.0
+ simplejson==3.19.1
+ six==1.16.0
+ sniffio==1.3.0
+ sortedcontainers==2.4.0
+ soupsieve==2.4.1
+ Sphinx==6.2.1
+ sphinx-rtd-theme==1.2.0
+ sphinxcontrib-applehelp==1.0.4
+ sphinxcontrib-devhelp==1.0.2
+ sphinxcontrib-htmlhelp==2.0.1
+ sphinxcontrib-jquery==4.1
+ sphinxcontrib-jsmath==1.0.1
+ sphinxcontrib-qthelp==1.0.3
+ sphinxcontrib-serializinghtml==1.1.5
+ SQLAlchemy==2.0.12
+ stack-data==0.6.2
+ starlette==0.26.1
+ sympy==1.12
+ tenacity==8.2.2
+ tiktoken==0.4.0
+ tld==0.13
+ toolz==0.12.0
+ torch==2.0.1+cu117
+ torchvision==0.15.2+cu117
+ tornado==6.2
+ tqdm==4.65.0
+ traitlets==5.9.0
+ traits==6.3.2
+ trio==0.22.0
+ trio-websocket==0.10.2
+ typer==0.9.0
+ typing_extensions==4.5.0
+ typing-inspect==0.8.0
+ tzdata==2023.3
+ uc-micro-py==1.0.1
+ unstructured==0.6.10
+ urllib3==1.26.15
+ uvicorn==0.21.1
+ wcwidth==0.2.6
+ websockets==11.0.2
+ wget==3.2
+ wheel==0.40.0
+ whichcraft==0.6.1
+ wikipedia==1.4.0
+ win32-setctime==1.1.0
+ wrapt==1.14.1
+ wsproto==1.2.0
+ xlrd==2.0.1
+ XlsxWriter==3.1.1
+ yarg==0.1.9
+ yarl==1.8.2
+ zipp==3.15.0
+ zope.event==4.6
+ zope.interface==6.0
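
The list reads like a full pip freeze of the development environment rather than a minimal dependency set; app.py itself only needs requests, pandas, fpdf, and gradio (plus openpyxl, which pandas uses for the .xlsx log). Note that torch==2.0.1+cu117 and torchvision==0.15.2+cu117 are CUDA builds that typically install only from PyTorch's cu117 wheel index, not from PyPI alone.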