C2MV committed on
Commit 7142062 · verified · 1 Parent(s): 898b6d6

Update app.py

Files changed (1)
  1. app.py +419 -155
app.py CHANGED
@@ -10,18 +10,21 @@ from urllib.parse import quote, urlencode
10
  import gradio as gr
11
  from bs4 import BeautifulSoup
12
  import io
 
 
13
 
14
  # Configure logging
15
- logging.basicConfig(level=logging.INFO,
16
  format='%(asctime)s - %(levelname)s: %(message)s')
17
  logger = logging.getLogger(__name__)
18
 
 
19
  class PaperDownloader:
20
  def __init__(self, output_dir='papers'):
21
  self.output_dir = output_dir
22
  os.makedirs(output_dir, exist_ok=True)
23
-
24
- # Updated download sources
25
  self.download_sources = [
26
  'https://sci-hub.ee/',
27
  'https://sci-hub.st/',
@@ -35,15 +38,222 @@ class PaperDownloader:
35
  # Request headers
36
  self.headers = {
37
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
38
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
 
39
  }
40
 
41
  def clean_doi(self, doi):
42
  """Clean and encode DOI for URL"""
43
  if not isinstance(doi, str):
44
- return None
45
  return quote(doi.strip()) if doi else None
46
 
 
 
47
  def download_paper_scihub(self, doi):
48
  """Improved method to download paper from Sci-Hub"""
49
  if not doi:
@@ -53,102 +263,101 @@ class PaperDownloader:
53
  for base_url in self.download_sources:
54
  try:
55
  scihub_url = f"{base_url}{self.clean_doi(doi)}"
56
-
57
  # Request with more tolerance
58
- response = requests.get(scihub_url,
59
- headers=self.headers,
60
- allow_redirects=True,
61
  timeout=15)
62
-
63
  # Search for multiple PDF URL patterns
64
  pdf_patterns = [
65
  r'(https?://[^\s<>"]+?\.pdf)',
66
  r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
67
  r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
68
  ]
69
-
70
  pdf_urls = []
71
  for pattern in pdf_patterns:
72
  pdf_urls.extend(re.findall(pattern, response.text))
73
-
74
  # Try downloading from found URLs
75
  for pdf_url in pdf_urls:
76
  try:
77
- pdf_response = requests.get(pdf_url,
78
- headers=self.headers,
79
  timeout=10)
80
-
81
  # Verify if it's a PDF
82
  if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
83
  logger.debug(f"Found PDF from: {pdf_url}")
84
  return pdf_response.content
85
  except Exception as e:
86
  logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
87
-
88
  except Exception as e:
89
  logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
90
-
91
  return None
92
-
93
  def download_paper_libgen(self, doi):
94
- """Download from Libgen, handles the query and the redirection"""
95
- if not doi:
96
- return None
97
 
98
- base_url = 'https://libgen.rs/scimag/'
99
- try:
100
- search_url = f"{base_url}?q={self.clean_doi(doi)}"
101
- response = requests.get(search_url, headers=self.headers, allow_redirects=True, timeout=10)
102
- response.raise_for_status()
103
-
104
- if "No results" in response.text:
105
- logger.debug(f"No results for DOI: {doi} on libgen")
106
- return None
107
-
108
- soup = BeautifulSoup(response.text, 'html.parser')
109
-
110
- # Find the link using a specific selector
111
- links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
112
-
113
- if links:
114
- link = links[0]
115
- pdf_url = link['href']
116
- pdf_response = requests.get(pdf_url, headers=self.headers, allow_redirects=True, timeout=10)
117
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
118
- logger.debug(f"Found PDF from: {pdf_url}")
119
- return pdf_response.content
120
 
121
- except Exception as e:
 
 
122
  logger.debug(f"Error trying to download {doi} from libgen: {e}")
123
- return None
124
-
125
  def download_paper_google_scholar(self, doi):
126
  """Search google scholar to find an article with the given doi, try to get the pdf"""
127
  if not doi:
128
  return None
129
-
130
  try:
131
-
132
- query = f'doi:"{doi}"'
133
- params = {'q': query}
134
- url = f'https://scholar.google.com/scholar?{urlencode(params)}'
135
-
136
- response = requests.get(url, headers = self.headers, timeout = 10)
137
- response.raise_for_status()
138
-
139
- soup = BeautifulSoup(response.text, 'html.parser')
140
-
141
- # Find any links with [PDF]
142
- links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
143
-
144
- if links:
145
- pdf_url = links[0]['href']
146
- pdf_response = requests.get(pdf_url, headers = self.headers, timeout=10)
147
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
148
- logger.debug(f"Found PDF from: {pdf_url}")
149
- return pdf_response.content
150
  except Exception as e:
151
- logger.debug(f"Google Scholar error for {doi}: {e}")
152
 
153
  return None
154
 
@@ -156,16 +365,16 @@ class PaperDownloader:
156
  """Alternative search method using Crossref"""
157
  if not doi:
158
  return None
159
-
160
  try:
161
  # Search for open access link
162
  url = f"https://api.crossref.org/works/{doi}"
163
  response = requests.get(url, headers=self.headers, timeout=10)
164
-
165
  if response.status_code == 200:
166
  data = response.json()
167
  work = data.get('message', {})
168
-
169
  # Search for open access links
170
  links = work.get('link', [])
171
  for link in links:
@@ -176,42 +385,41 @@ class PaperDownloader:
176
  if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
177
  logger.debug(f"Found PDF from: {pdf_url}")
178
  return pdf_response.content
179
-
180
  except Exception as e:
181
  logger.debug(f"Crossref error for {doi}: {e}")
182
-
183
  return None
184
-
185
 
186
  def download_with_retry(self, doi, max_retries=3, initial_delay=2):
187
- """Downloads a paper using multiple strategies with exponential backoff"""
188
- pdf_content = None
189
- retries = 0
190
- delay = initial_delay
191
-
192
- while retries < max_retries and not pdf_content:
193
- try:
194
- pdf_content = (
195
- self.download_paper_scihub(doi) or
196
- self.download_paper_libgen(doi) or
197
- self.download_paper_google_scholar(doi) or
198
- self.download_paper_crossref(doi)
199
-
200
- )
201
-
202
- if pdf_content:
203
- return pdf_content
204
- except Exception as e:
205
- logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
206
-
207
- if not pdf_content:
208
- retries += 1
209
- logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
210
- time.sleep(delay)
211
- delay *= 2 # Exponential backoff
212
-
213
- return None
214
-
215
  def download_single_doi(self, doi):
216
  """Downloads a single paper using a DOI"""
217
  if not doi:
@@ -219,10 +427,10 @@ class PaperDownloader:
219
 
220
  try:
221
  pdf_content = self.download_with_retry(doi)
222
-
223
  if pdf_content:
224
  if doi is None:
225
- return None, "Error: DOI not provided", "Error: DOI not provided"
226
  filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
227
  filepath = os.path.join(self.output_dir, filename)
228
  with open(filepath, 'wb') as f:
@@ -236,12 +444,12 @@ class PaperDownloader:
236
  except Exception as e:
237
  logger.error(f"Error processing {doi}: {e}")
238
  return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
239
-
240
  def download_multiple_dois(self, dois_text):
241
  """Downloads multiple papers from a list of DOIs"""
242
  if not dois_text:
243
  return None, "Error: No DOIs provided", "Error: No DOIs provided"
244
-
245
  dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
246
  if not dois:
247
  return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"
@@ -252,12 +460,12 @@ class PaperDownloader:
252
  for i, doi in enumerate(tqdm(dois, desc="Downloading papers")):
253
  filepath, success_message, fail_message = self.download_single_doi(doi)
254
  if filepath:
255
- # Unique filename for zip
256
- filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
257
- filepath_unique = os.path.join(self.output_dir, filename)
258
- os.rename(filepath,filepath_unique)
259
- downloaded_files.append(filepath_unique)
260
- downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
261
 
262
  else:
263
  failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
@@ -271,23 +479,22 @@ class PaperDownloader:
271
 
272
  return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
273
 
274
-
275
  def process_bibtex(self, bib_file):
276
  """Process BibTeX file and download papers with multiple strategies"""
277
  # Read BibTeX file content from the uploaded object
278
  try:
279
  with open(bib_file.name, 'r', encoding='utf-8') as f:
280
- bib_content = f.read()
281
  except Exception as e:
282
- logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
283
- return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None
284
 
285
  # Parse BibTeX data
286
  try:
287
  bib_database = bibtexparser.loads(bib_content)
288
  except Exception as e:
289
- logger.error(f"Error parsing BibTeX data: {e}")
290
- return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None
291
 
292
  # Extract DOIs
293
  dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
@@ -303,17 +510,79 @@ class PaperDownloader:
303
  try:
304
  # Try to download with multiple methods with retries
305
  pdf_content = self.download_with_retry(doi)
306
-
307
  # Save PDF
308
  if pdf_content:
309
  if doi is None:
310
  return None, "Error: DOI not provided", "Error: DOI not provided", None
311
  filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
312
  filepath = os.path.join(self.output_dir, filename)
313
-
314
  with open(filepath, 'wb') as f:
315
  f.write(pdf_content)
316
-
 
 
 
317
  downloaded_files.append(filepath)
318
  downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
319
  logger.info(f"Successfully downloaded: {filename}")
@@ -338,23 +607,22 @@ def create_gradio_interface():
338
  """Create Gradio interface for Paper Downloader"""
339
  downloader = PaperDownloader()
340
 
341
- def download_papers(bib_file, doi_input, dois_input):
342
  if bib_file:
343
  # Check file type
344
  if not bib_file.name.lower().endswith('.bib'):
345
  return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None
346
 
347
- zip_path, downloaded_dois, failed_dois, _ = downloader.process_bibtex(bib_file)
348
  return zip_path, downloaded_dois, failed_dois, None
349
  elif doi_input:
350
  filepath, message, failed_doi = downloader.download_single_doi(doi_input)
351
  return None, message, failed_doi, filepath
352
  elif dois_input:
353
- zip_path, downloaded_dois, failed_dois = downloader.download_multiple_dois(dois_input)
354
- return zip_path, downloaded_dois, failed_dois, None
355
  else:
356
- return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
357
-
358
 
359
  # Gradio Interface
360
  interface = gr.Interface(
@@ -366,24 +634,21 @@ def create_gradio_interface():
366
  ],
367
  outputs=[
368
  gr.File(label="Download Papers (ZIP) or Single PDF"),
369
- gr.HTML(label="""
370
- <div style='padding-bottom: 5px; font-weight: bold;'>
371
- Enter Single DOI
372
- </div>
373
- <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
374
- <div style='padding-bottom: 5px; font-weight: bold;'>
375
- Downloaded DOIs
376
- </div>
377
  <div id="downloaded-dois"></div>
378
- </div>
379
  """),
380
- gr.HTML(label="""
 
 
 
381
  <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
382
- <div style='padding-bottom: 5px; font-weight: bold;'>
383
- Failed DOIs
384
- </div>
385
  <div id="failed-dois"></div>
386
- </div>
387
  """),
388
  gr.File(label="Downloaded Single PDF")
389
  ],
@@ -391,11 +656,11 @@ def create_gradio_interface():
391
  description="Upload a BibTeX file or enter DOIs to download PDFs. We'll attempt to fetch PDFs from multiple sources like Sci-Hub, Libgen, Google Scholar and Crossref. You can use any of the three inputs at any moment.",
392
  theme="Hev832/Applio",
393
  examples=[
394
- ["example.bib", None, None], # Bibtex File
395
- [None, "10.1038/nature12373", None], # Single DOI
396
- [None, None, "10.1109/5.771073\n10.3390/horticulturae8080677"], # Multiple DOIs
397
- ],
398
- css="""
399
  .gradio-container {
400
  background-color: black;
401
  }
@@ -412,34 +677,33 @@ def create_gradio_interface():
412
  color: #007bff; /* Blue color for hyperlinks */
413
  }
414
  """,
415
- cache_examples = False,
416
  )
417
-
418
  # Add Javascript to update HTML
419
  interface.load = """
420
  function(downloaded_dois, failed_dois){
421
- let downloaded_html = '<ul>';
422
  downloaded_dois.split('\\n').filter(Boolean).forEach(doi => {
423
- downloaded_html += '<li>' + doi + '</li>';
424
  });
425
- downloaded_html += '</ul>';
426
  document.querySelector("#downloaded-dois").innerHTML = downloaded_html;
427
 
428
- let failed_html = '<ul>';
429
  failed_dois.split('\\n').filter(Boolean).forEach(doi => {
430
- failed_html += '<li>' + doi + '</li>';
431
  });
432
- failed_html += '</ul>';
433
  document.querySelector("#failed-dois").innerHTML = failed_html;
434
  return [downloaded_html, failed_html];
435
-
436
  }
437
  """
438
  return interface
439
 
 
440
  def main():
441
  interface = create_gradio_interface()
442
  interface.launch(share=True)
443
 
 
444
  if __name__ == "__main__":
445
- main()
 
10
  import gradio as gr
11
  from bs4 import BeautifulSoup
12
  import io
13
+ import asyncio
14
+ import aiohttp
15
 
16
  # Configure logging
17
+ logging.basicConfig(level=logging.INFO,
18
  format='%(asctime)s - %(levelname)s: %(message)s')
19
  logger = logging.getLogger(__name__)
20
 
21
+
22
  class PaperDownloader:
23
  def __init__(self, output_dir='papers'):
24
  self.output_dir = output_dir
25
  os.makedirs(output_dir, exist_ok=True)
26
+
27
+ # Updated download sources
28
  self.download_sources = [
29
  'https://sci-hub.ee/',
30
  'https://sci-hub.st/',
 
38
  # Request headers
39
  self.headers = {
40
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
41
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
42
+ 'Accept-Language': 'en-US,en;q=0.9',
43
  }
44
 
45
  def clean_doi(self, doi):
46
  """Clean and encode DOI for URL"""
47
  if not isinstance(doi, str):
48
+ return None
49
  return quote(doi.strip()) if doi else None
50
 
51
+ async def fetch_with_headers(self, session, url, timeout=10):
52
+ """Utility method to fetch an URL with headers and timeout"""
53
+ try:
54
+ async with session.get(url, headers=self.headers, timeout=timeout, allow_redirects=True) as response:
55
+ response.raise_for_status()
56
+ return await response.text(), response.headers
57
+ except Exception as e:
58
+ logger.debug(f"Error fetching {url}: {e}")
59
+ return None, None
60
+
61
+
62
+ async def download_paper_direct_doi_async(self, session, doi):
63
+ """Attempt to download the pdf from the landing page of the doi"""
64
+ if not doi:
65
+ return None
66
+
67
+ try:
68
+ doi_url = f"https://doi.org/{self.clean_doi(doi)}"
69
+ text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
70
+ if not text:
71
+ return None
72
+
73
+ pdf_patterns = [
74
+ r'(https?://[^\s<>"]+?\.pdf)',
75
+ r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
76
+ r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
77
+ ]
78
+
79
+ pdf_urls = []
80
+ for pattern in pdf_patterns:
81
+ pdf_urls.extend(re.findall(pattern, text))
82
+
83
+ for pdf_url in pdf_urls:
84
+ try:
85
+ pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
86
+ if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
87
+ logger.debug(f"Found PDF from: {pdf_url}")
88
+ return await pdf_response.read()
89
+ except Exception as e:
90
+ logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
91
+
92
+
93
+ except Exception as e:
94
+ logger.debug(f"Error trying to get the PDF from {doi}: {e}")
95
+
96
+ return None
97
+
98
+ async def download_paper_scihub_async(self, session, doi):
99
+ """Improved method to download paper from Sci-Hub using async requests"""
100
+ if not doi:
101
+ logger.warning("DOI not provided")
102
+ return None
103
+
104
+ for base_url in self.download_sources:
105
+ try:
106
+ scihub_url = f"{base_url}{self.clean_doi(doi)}"
107
+ text, headers = await self.fetch_with_headers(session, scihub_url, timeout=15)
108
+ if not text:
109
+ continue
110
+
111
+ # Search for multiple PDF URL patterns
112
+ pdf_patterns = [
113
+ r'(https?://[^\s<>"]+?\.pdf)',
114
+ r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
115
+ r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
116
+ ]
117
+
118
+ pdf_urls = []
119
+ for pattern in pdf_patterns:
120
+ pdf_urls.extend(re.findall(pattern, text))
121
+
122
+ # Try downloading from found URLs
123
+ for pdf_url in pdf_urls:
124
+ try:
125
+ pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
126
+ # Verify if it's a PDF
127
+ if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
128
+ logger.debug(f"Found PDF from: {pdf_url}")
129
+ return await pdf_response.read()
130
+ except Exception as e:
131
+ logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
132
+
133
+ except Exception as e:
134
+ logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
135
+
136
+ return None
137
+
138
+ async def download_paper_libgen_async(self, session, doi):
139
+ """Download from Libgen, handles the query and the redirection"""
140
+ if not doi:
141
+ return None
142
+
143
+ base_url = 'https://libgen.rs/scimag/'
144
+ try:
145
+ search_url = f"{base_url}?q={self.clean_doi(doi)}"
146
+ text, headers = await self.fetch_with_headers(session, search_url, timeout=10)
147
+
148
+ if not text or "No results" in text:
149
+ logger.debug(f"No results for DOI: {doi} on libgen")
150
+ return None
151
+
152
+ soup = BeautifulSoup(text, 'html.parser')
153
+
154
+ links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
155
+
156
+ if links:
157
+ link = links[0]
158
+ pdf_url = link['href']
159
+ pdf_response = await session.get(pdf_url, headers=self.headers, allow_redirects=True, timeout=10)
160
+ if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
161
+ logger.debug(f"Found PDF from: {pdf_url}")
162
+ return await pdf_response.read()
163
+ except Exception as e:
164
+ logger.debug(f"Error trying to download {doi} from libgen: {e}")
165
+ return None
166
+
167
+ async def download_paper_google_scholar_async(self, session, doi):
168
+ """Search google scholar to find an article with the given doi, try to get the pdf"""
169
+ if not doi:
170
+ return None
171
+
172
+ try:
173
+ query = f'doi:"{doi}"'
174
+ params = {'q': query}
175
+ url = f'https://scholar.google.com/scholar?{urlencode(params)}'
176
+
177
+ text, headers = await self.fetch_with_headers(session, url, timeout=10)
178
+ if not text:
179
+ return None
180
+
181
+ soup = BeautifulSoup(text, 'html.parser')
182
+
183
+ # Find any links with [PDF]
184
+ links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
185
+
186
+ if links:
187
+ pdf_url = links[0]['href']
188
+ pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
189
+ if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
190
+ logger.debug(f"Found PDF from: {pdf_url}")
191
+ return await pdf_response.read()
192
+ except Exception as e:
193
+ logger.debug(f"Google Scholar error for {doi}: {e}")
194
+
195
+ return None
196
+
197
+ async def download_paper_crossref_async(self, session, doi):
198
+ """Alternative search method using Crossref"""
199
+ if not doi:
200
+ return None
201
+
202
+ try:
203
+ # Search for open access link
204
+ url = f"https://api.crossref.org/works/{doi}"
205
+ response = await session.get(url, headers=self.headers, timeout=10)
206
+
207
+ if response.status == 200:
208
+ data = await response.json()
209
+ work = data.get('message', {})
210
+
211
+ # Search for open access links
212
+ links = work.get('link', [])
213
+ for link in links:
214
+ if link.get('content-type') == 'application/pdf':
215
+ pdf_url = link.get('URL')
216
+ if pdf_url:
217
+ pdf_response = await session.get(pdf_url, headers=self.headers)
218
+ if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
219
+ logger.debug(f"Found PDF from: {pdf_url}")
220
+ return await pdf_response.read()
221
+
222
+ except Exception as e:
223
+ logger.debug(f"Crossref error for {doi}: {e}")
224
+
225
+ return None
226
+
227
+ async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
228
+ """Downloads a paper using multiple strategies with exponential backoff and async requests"""
229
+ pdf_content = None
230
+ retries = 0
231
+ delay = initial_delay
232
+
233
+ async with aiohttp.ClientSession() as session:
234
+ while retries < max_retries and not pdf_content:
235
+ try:
236
+ pdf_content = (
237
+ await self.download_paper_direct_doi_async(session, doi) or
238
+ await self.download_paper_scihub_async(session, doi) or
239
+ await self.download_paper_libgen_async(session, doi) or
240
+ await self.download_paper_google_scholar_async(session, doi) or
241
+ await self.download_paper_crossref_async(session, doi)
242
+
243
+ )
244
+ if pdf_content:
245
+ return pdf_content
246
+ except Exception as e:
247
+ logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
248
+
249
+ if not pdf_content:
250
+ retries += 1
251
+ logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
252
+ await asyncio.sleep(delay)
253
+ delay *= 2 # Exponential backoff
254
+
255
+ return None
256
+
257
  def download_paper_scihub(self, doi):
258
  """Improved method to download paper from Sci-Hub"""
259
  if not doi:
 
263
  for base_url in self.download_sources:
264
  try:
265
  scihub_url = f"{base_url}{self.clean_doi(doi)}"
266
+
267
  # Request with more tolerance
268
+ response = requests.get(scihub_url,
269
+ headers=self.headers,
270
+ allow_redirects=True,
271
  timeout=15)
272
+
273
  # Search for multiple PDF URL patterns
274
  pdf_patterns = [
275
  r'(https?://[^\s<>"]+?\.pdf)',
276
  r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
277
  r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
278
  ]
279
+
280
  pdf_urls = []
281
  for pattern in pdf_patterns:
282
  pdf_urls.extend(re.findall(pattern, response.text))
283
+
284
  # Try downloading from found URLs
285
  for pdf_url in pdf_urls:
286
  try:
287
+ pdf_response = requests.get(pdf_url,
288
+ headers=self.headers,
289
  timeout=10)
290
+
291
  # Verify if it's a PDF
292
  if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
293
  logger.debug(f"Found PDF from: {pdf_url}")
294
  return pdf_response.content
295
  except Exception as e:
296
  logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
297
+
298
  except Exception as e:
299
  logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
300
+
301
  return None
302
+
303
  def download_paper_libgen(self, doi):
304
+ """Download from Libgen, handles the query and the redirection"""
305
+ if not doi:
306
+ return None
307
 
308
+ base_url = 'https://libgen.rs/scimag/'
309
+ try:
310
+ search_url = f"{base_url}?q={self.clean_doi(doi)}"
311
+ response = requests.get(search_url, headers=self.headers, allow_redirects=True, timeout=10)
312
+ response.raise_for_status()
 
 
 
313
 
314
+ if "No results" in response.text:
315
+ logger.debug(f"No results for DOI: {doi} on libgen")
316
+ return None
317
+
318
+ soup = BeautifulSoup(response.text, 'html.parser')
319
+
320
+ # Find the link using a specific selector
321
+ links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
322
+
323
+ if links:
324
+ link = links[0]
325
+ pdf_url = link['href']
326
+ pdf_response = requests.get(pdf_url, headers=self.headers, allow_redirects=True, timeout=10)
327
+ if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
328
+ logger.debug(f"Found PDF from: {pdf_url}")
329
+ return pdf_response.content
330
+
331
+ except Exception as e:
332
  logger.debug(f"Error trying to download {doi} from libgen: {e}")
333
+ return None
334
+
335
  def download_paper_google_scholar(self, doi):
336
  """Search google scholar to find an article with the given doi, try to get the pdf"""
337
  if not doi:
338
  return None
339
+
340
  try:
341
+ query = f'doi:"{doi}"'
342
+ params = {'q': query}
343
+ url = f'https://scholar.google.com/scholar?{urlencode(params)}'
344
+
345
+ response = requests.get(url, headers=self.headers, timeout=10)
346
+ response.raise_for_status()
347
+
348
+ soup = BeautifulSoup(response.text, 'html.parser')
349
+
350
+ # Find any links with [PDF]
351
+ links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
352
+
353
+ if links:
354
+ pdf_url = links[0]['href']
355
+ pdf_response = requests.get(pdf_url, headers=self.headers, timeout=10)
356
+ if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
357
+ logger.debug(f"Found PDF from: {pdf_url}")
358
+ return pdf_response.content
 
359
  except Exception as e:
360
+ logger.debug(f"Google Scholar error for {doi}: {e}")
361
 
362
  return None
363
 
 
365
  """Alternative search method using Crossref"""
366
  if not doi:
367
  return None
368
+
369
  try:
370
  # Search for open access link
371
  url = f"https://api.crossref.org/works/{doi}"
372
  response = requests.get(url, headers=self.headers, timeout=10)
373
+
374
  if response.status_code == 200:
375
  data = response.json()
376
  work = data.get('message', {})
377
+
378
  # Search for open access links
379
  links = work.get('link', [])
380
  for link in links:
 
385
  if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
386
  logger.debug(f"Found PDF from: {pdf_url}")
387
  return pdf_response.content
388
+
389
  except Exception as e:
390
  logger.debug(f"Crossref error for {doi}: {e}")
391
+
392
  return None
 
393
 
394
  def download_with_retry(self, doi, max_retries=3, initial_delay=2):
395
+ """Downloads a paper using multiple strategies with exponential backoff"""
396
+ pdf_content = None
397
+ retries = 0
398
+ delay = initial_delay
399
+
400
+ while retries < max_retries and not pdf_content:
401
+ try:
402
+ pdf_content = (
403
+ self.download_paper_scihub(doi) or
404
+ self.download_paper_libgen(doi) or
405
+ self.download_paper_google_scholar(doi) or
406
+ self.download_paper_crossref(doi)
407
+
408
+ )
409
+
410
+ if pdf_content:
411
+ return pdf_content
412
+ except Exception as e:
413
+ logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
414
+
415
+ if not pdf_content:
416
+ retries += 1
417
+ logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
418
+ time.sleep(delay)
419
+ delay *= 2 # Exponential backoff
420
+
421
+ return None
422
+
423
  def download_single_doi(self, doi):
424
  """Downloads a single paper using a DOI"""
425
  if not doi:
 
427
 
428
  try:
429
  pdf_content = self.download_with_retry(doi)
430
+
431
  if pdf_content:
432
  if doi is None:
433
+ return None, "Error: DOI not provided", "Error: DOI not provided"
434
  filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
435
  filepath = os.path.join(self.output_dir, filename)
436
  with open(filepath, 'wb') as f:
 
444
  except Exception as e:
445
  logger.error(f"Error processing {doi}: {e}")
446
  return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
447
+
448
  def download_multiple_dois(self, dois_text):
449
  """Downloads multiple papers from a list of DOIs"""
450
  if not dois_text:
451
  return None, "Error: No DOIs provided", "Error: No DOIs provided"
452
+
453
  dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
454
  if not dois:
455
  return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"
 
460
  for i, doi in enumerate(tqdm(dois, desc="Downloading papers")):
461
  filepath, success_message, fail_message = self.download_single_doi(doi)
462
  if filepath:
463
+ # Unique filename for zip
464
+ filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
465
+ filepath_unique = os.path.join(self.output_dir, filename)
466
+ os.rename(filepath, filepath_unique)
467
+ downloaded_files.append(filepath_unique)
468
+ downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
469
 
470
  else:
471
  failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
 
479
 
480
  return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
481
 
 
482
  def process_bibtex(self, bib_file):
483
  """Process BibTeX file and download papers with multiple strategies"""
484
  # Read BibTeX file content from the uploaded object
485
  try:
486
  with open(bib_file.name, 'r', encoding='utf-8') as f:
487
+ bib_content = f.read()
488
  except Exception as e:
489
+ logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
490
+ return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None
491
 
492
  # Parse BibTeX data
493
  try:
494
  bib_database = bibtexparser.loads(bib_content)
495
  except Exception as e:
496
+ logger.error(f"Error parsing BibTeX data: {e}")
497
+ return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None
498
 
499
  # Extract DOIs
500
  dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
 
510
  try:
511
  # Try to download with multiple methods with retries
512
  pdf_content = self.download_with_retry(doi)
513
+
514
  # Save PDF
515
  if pdf_content:
516
  if doi is None:
517
  return None, "Error: DOI not provided", "Error: DOI not provided", None
518
  filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
519
  filepath = os.path.join(self.output_dir, filename)
520
+
521
  with open(filepath, 'wb') as f:
522
  f.write(pdf_content)
523
+
524
+ downloaded_files.append(filepath)
525
+ downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
526
+ logger.info(f"Successfully downloaded: {filename}")
527
+ else:
528
+ failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
529
+
530
+ except Exception as e:
531
+ failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
532
+ logger.error(f"Error processing {doi}: {e}")
533
+
534
+ # Create ZIP of downloaded papers
535
+ if downloaded_files:
536
+ zip_filename = 'papers.zip'
537
+ with zipfile.ZipFile(zip_filename, 'w') as zipf:
538
+ for file_path in downloaded_files:
539
+ zipf.write(file_path, arcname=os.path.basename(file_path))
540
+ logger.info(f"ZIP file created: {zip_filename}")
541
+
542
+ return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None
543
+
544
+ async def process_bibtex_async(self, bib_file):
545
+ """Process BibTeX file and download papers with multiple strategies"""
546
+ # Read BibTeX file content from the uploaded object
547
+ try:
548
+ with open(bib_file.name, 'r', encoding='utf-8') as f:
549
+ bib_content = f.read()
550
+ except Exception as e:
551
+ logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
552
+ return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None
553
+
554
+ # Parse BibTeX data
555
+ try:
556
+ bib_database = bibtexparser.loads(bib_content)
557
+ except Exception as e:
558
+ logger.error(f"Error parsing BibTeX data: {e}")
559
+ return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None
560
+
561
+ # Extract DOIs
562
+ dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
563
+ logger.info(f"Found {len(dois)} DOIs to download")
564
+
565
+ # Result lists
566
+ downloaded_files = []
567
+ failed_dois = []
568
+ downloaded_links = []
569
+
570
+ # Download PDFs
571
+ for doi in tqdm(dois, desc="Downloading papers"):
572
+ try:
573
+ # Try to download with multiple methods with retries
574
+ pdf_content = await self.download_with_retry_async(doi)
575
+
576
+ # Save PDF
577
+ if pdf_content:
578
+ if doi is None:
579
+ return None, "Error: DOI not provided", "Error: DOI not provided", None
580
+ filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
581
+ filepath = os.path.join(self.output_dir, filename)
582
+
583
+ with open(filepath, 'wb') as f:
584
+ f.write(pdf_content)
585
+
586
  downloaded_files.append(filepath)
587
  downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
588
  logger.info(f"Successfully downloaded: {filename}")
 
607
  """Create Gradio interface for Paper Downloader"""
608
  downloader = PaperDownloader()
609
 
610
+ async def download_papers(bib_file, doi_input, dois_input):
611
  if bib_file:
612
  # Check file type
613
  if not bib_file.name.lower().endswith('.bib'):
614
  return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None
615
 
616
+ zip_path, downloaded_dois, failed_dois, _ = await downloader.process_bibtex_async(bib_file)
617
  return zip_path, downloaded_dois, failed_dois, None
618
  elif doi_input:
619
  filepath, message, failed_doi = downloader.download_single_doi(doi_input)
620
  return None, message, failed_doi, filepath
621
  elif dois_input:
622
+ zip_path, downloaded_dois, failed_dois = downloader.download_multiple_dois(dois_input)
623
+ return zip_path, downloaded_dois, failed_dois, None
624
  else:
625
+ return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
 
626
 
627
  # Gradio Interface
628
  interface = gr.Interface(
 
634
  ],
635
  outputs=[
636
  gr.File(label="Download Papers (ZIP) or Single PDF"),
637
+ gr.HTML(label="""
638
+ <div style='padding-bottom: 5px; font-weight: bold;'>
639
+ Found DOIs
640
+ </div>
641
+ <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
 
 
 
642
  <div id="downloaded-dois"></div>
643
+ </div>
644
  """),
645
+ gr.HTML(label="""
646
+ <div style='padding-bottom: 5px; font-weight: bold;'>
647
+ Missed DOIs
648
+ </div>
649
  <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
 
 
 
650
  <div id="failed-dois"></div>
651
+ </div>
652
  """),
653
  gr.File(label="Downloaded Single PDF")
654
  ],
 
656
  description="Upload a BibTeX file or enter DOIs to download PDFs. We'll attempt to fetch PDFs from multiple sources like Sci-Hub, Libgen, Google Scholar and Crossref. You can use any of the three inputs at any moment.",
657
  theme="Hev832/Applio",
658
  examples=[
659
+ ["example.bib", None, None], # Bibtex File
660
+ [None, "10.1038/nature12373", None], # Single DOI
661
+ [None, None, "10.1109/5.771073\n10.3390/horticulturae8080677"], # Multiple DOIs
662
+ ],
663
+ css="""
664
  .gradio-container {
665
  background-color: black;
666
  }
 
677
  color: #007bff; /* Blue color for hyperlinks */
678
  }
679
  """,
680
+ cache_examples=False,
681
  )
682
+
683
  # Add Javascript to update HTML
684
  interface.load = """
685
  function(downloaded_dois, failed_dois){
686
+ let downloaded_html = '';
687
  downloaded_dois.split('\\n').filter(Boolean).forEach(doi => {
688
+ downloaded_html += '[' + doi + ']<br>';
689
  });
 
690
  document.querySelector("#downloaded-dois").innerHTML = downloaded_html;
691
 
692
+ let failed_html = '';
693
  failed_dois.split('\\n').filter(Boolean).forEach(doi => {
694
+ failed_html += '[' + doi + ']<br>';
695
  });
 
696
  document.querySelector("#failed-dois").innerHTML = failed_html;
697
  return [downloaded_html, failed_html];
 
698
  }
699
  """
700
  return interface
701
 
702
+
703
  def main():
704
  interface = create_gradio_interface()
705
  interface.launch(share=True)
706
 
707
+
708
  if __name__ == "__main__":
709
+ main()
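
For reference, a minimal sketch of how the async retry pipeline added in this commit could be driven outside the Gradio interface. It assumes the file above is importable as `app` and reuses the example DOI from the interface examples; the `fetch_one` helper is purely illustrative and not part of the commit.

import asyncio

from app import PaperDownloader  # assumes app.py is importable from the working directory

async def fetch_one(doi):
    """Illustrative helper: fetch a single DOI through the async retry chain."""
    downloader = PaperDownloader(output_dir='papers')
    # download_with_retry_async opens its own aiohttp.ClientSession and tries the
    # direct DOI page, Sci-Hub mirrors, Libgen, Google Scholar and Crossref in turn,
    # backing off exponentially between attempts.
    return await downloader.download_with_retry_async(doi, max_retries=2)

if __name__ == "__main__":
    pdf_bytes = asyncio.run(fetch_one("10.1038/nature12373"))
    print(f"Got {len(pdf_bytes)} bytes" if pdf_bytes else "No PDF found")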