C2MV committed on
Commit 9bba764
1 parent: 4ff23a9

Update app.py

Files changed (1)
  1. app.py +264 -229
app.py CHANGED
@@ -14,16 +14,17 @@ import asyncio
14
  import aiohttp
15
 
16
  # Configure logging
17
- logging.basicConfig(level=logging.INFO,
18
  format='%(asctime)s - %(levelname)s: %(message)s')
19
  logger = logging.getLogger(__name__)
20
 
 
21
  class PaperDownloader:
22
  def __init__(self, output_dir='papers'):
23
  self.output_dir = output_dir
24
  os.makedirs(output_dir, exist_ok=True)
25
-
26
- # Updated download sources
27
  self.download_sources = [
28
  'https://sci-hub.ee/',
29
  'https://sci-hub.st/',
@@ -40,22 +41,59 @@ class PaperDownloader:
40
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
41
  'Accept-Language': 'en-US,en;q=0.9',
42
  }
43
-
44
  def clean_doi(self, doi):
45
  """Clean and encode DOI for URL"""
46
  if not isinstance(doi, str):
47
- return None
48
  return quote(doi.strip()) if doi else None
49
-
50
  async def fetch_with_headers(self, session, url, timeout=10):
51
  """Utility method to fetch an URL with headers and timeout"""
52
  try:
53
- async with session.get(url, headers=self.headers, timeout=timeout) as response:
54
  response.raise_for_status()
55
  return await response.text(), response.headers
56
  except Exception as e:
57
  logger.debug(f"Error fetching {url}: {e}")
58
  return None, None
59
 
60
  async def download_paper_scihub_async(self, session, doi):
61
  """Improved method to download paper from Sci-Hub using async requests"""
@@ -68,7 +106,7 @@ class PaperDownloader:
68
  scihub_url = f"{base_url}{self.clean_doi(doi)}"
69
  text, headers = await self.fetch_with_headers(session, scihub_url, timeout=15)
70
  if not text:
71
- continue
72
 
73
  # Search for multiple PDF URL patterns
74
  pdf_patterns = [
@@ -76,27 +114,27 @@ class PaperDownloader:
76
  r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
77
  r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
78
  ]
79
-
80
  pdf_urls = []
81
  for pattern in pdf_patterns:
82
  pdf_urls.extend(re.findall(pattern, text))
83
-
84
  # Try downloading from found URLs
85
  for pdf_url in pdf_urls:
86
  try:
87
  pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
88
  # Verify if it's a PDF
89
  if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
90
- logger.debug(f"Found PDF from: {pdf_url}")
91
- return await pdf_response.read()
92
  except Exception as e:
93
  logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
94
-
95
  except Exception as e:
96
  logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
97
-
98
  return None
99
-
100
  async def download_paper_libgen_async(self, session, doi):
101
  """Download from Libgen, handles the query and the redirection"""
102
  if not doi:
@@ -104,56 +142,55 @@ class PaperDownloader:
104
 
105
  base_url = 'https://libgen.rs/scimag/'
106
  try:
107
- search_url = f"{base_url}?q={self.clean_doi(doi)}"
108
- text, headers = await self.fetch_with_headers(session, search_url, timeout=10)
109
-
110
- if not text or "No results" in text:
111
- logger.debug(f"No results for DOI: {doi} on libgen")
112
- return None
113
-
114
- soup = BeautifulSoup(text, 'html.parser')
115
-
116
- links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
117
-
118
- if links:
119
- link = links[0]
120
- pdf_url = link['href']
121
- pdf_response = await session.get(pdf_url, headers=self.headers, allow_redirects=True, timeout=10)
122
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
123
- logger.debug(f"Found PDF from: {pdf_url}")
124
- return await pdf_response.read()
125
  except Exception as e:
126
  logger.debug(f"Error trying to download {doi} from libgen: {e}")
127
  return None
128
-
129
  async def download_paper_google_scholar_async(self, session, doi):
130
  """Search google scholar to find an article with the given doi, try to get the pdf"""
131
  if not doi:
132
  return None
133
-
134
  try:
135
-
136
- query = f'doi:"{doi}"'
137
- params = {'q': query}
138
- url = f'https://scholar.google.com/scholar?{urlencode(params)}'
139
-
140
- text, headers = await self.fetch_with_headers(session, url, timeout = 10)
141
- if not text:
142
- return None
143
-
144
- soup = BeautifulSoup(text, 'html.parser')
145
-
146
- # Find any links with [PDF]
147
- links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
148
-
149
- if links:
150
- pdf_url = links[0]['href']
151
- pdf_response = await session.get(pdf_url, headers = self.headers, timeout=10)
152
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
153
- logger.debug(f"Found PDF from: {pdf_url}")
154
- return await pdf_response.read()
155
  except Exception as e:
156
- logger.debug(f"Google Scholar error for {doi}: {e}")
157
 
158
  return None
159
 
@@ -161,7 +198,7 @@ class PaperDownloader:
161
  """Alternative search method using Crossref"""
162
  if not doi:
163
  return None
164
-
165
  try:
166
  # Search for open access link
167
  url = f"https://api.crossref.org/works/{doi}"
@@ -170,53 +207,53 @@ class PaperDownloader:
170
  if response.status == 200:
171
  data = await response.json()
172
  work = data.get('message', {})
173
-
174
  # Search for open access links
175
  links = work.get('link', [])
176
  for link in links:
177
  if link.get('content-type') == 'application/pdf':
178
  pdf_url = link.get('URL')
179
  if pdf_url:
180
- pdf_response = await session.get(pdf_url, headers = self.headers)
181
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
182
- logger.debug(f"Found PDF from: {pdf_url}")
183
- return await pdf_response.read()
184
-
185
  except Exception as e:
186
  logger.debug(f"Crossref error for {doi}: {e}")
187
-
188
  return None
189
-
190
 
191
  async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
192
- """Downloads a paper using multiple strategies with exponential backoff and async requests"""
193
- pdf_content = None
194
- retries = 0
195
- delay = initial_delay
196
 
197
- async with aiohttp.ClientSession() as session:
198
- while retries < max_retries and not pdf_content:
199
- try:
200
- pdf_content = (
201
- await self.download_paper_scihub_async(session, doi) or
202
- await self.download_paper_libgen_async(session, doi) or
203
- await self.download_paper_google_scholar_async(session, doi) or
204
- await self.download_paper_crossref_async(session, doi)
205
-
206
- )
207
- if pdf_content:
208
- return pdf_content
209
- except Exception as e:
210
- logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
211
-
212
- if not pdf_content:
213
- retries += 1
214
- logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
215
- await asyncio.sleep(delay)
216
- delay *= 2 # Exponential backoff
217
-
218
- return None
219
-
220
  def download_paper_scihub(self, doi):
221
  """Improved method to download paper from Sci-Hub"""
222
  if not doi:
@@ -226,102 +263,101 @@ class PaperDownloader:
226
  for base_url in self.download_sources:
227
  try:
228
  scihub_url = f"{base_url}{self.clean_doi(doi)}"
229
-
230
  # Request with more tolerance
231
- response = requests.get(scihub_url,
232
- headers=self.headers,
233
- allow_redirects=True,
234
  timeout=15)
235
-
236
  # Search for multiple PDF URL patterns
237
  pdf_patterns = [
238
  r'(https?://[^\s<>"]+?\.pdf)',
239
  r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
240
  r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
241
  ]
242
-
243
  pdf_urls = []
244
  for pattern in pdf_patterns:
245
  pdf_urls.extend(re.findall(pattern, response.text))
246
-
247
  # Try downloading from found URLs
248
  for pdf_url in pdf_urls:
249
  try:
250
- pdf_response = requests.get(pdf_url,
251
- headers=self.headers,
252
  timeout=10)
253
-
254
  # Verify if it's a PDF
255
  if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
256
  logger.debug(f"Found PDF from: {pdf_url}")
257
  return pdf_response.content
258
  except Exception as e:
259
  logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
260
-
261
  except Exception as e:
262
  logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
263
-
264
  return None
265
-
266
  def download_paper_libgen(self, doi):
267
- """Download from Libgen, handles the query and the redirection"""
268
- if not doi:
269
- return None
270
 
271
- base_url = 'https://libgen.rs/scimag/'
272
- try:
273
- search_url = f"{base_url}?q={self.clean_doi(doi)}"
274
- response = requests.get(search_url, headers=self.headers, allow_redirects=True, timeout=10)
275
- response.raise_for_status()
276
-
277
- if "No results" in response.text:
278
- logger.debug(f"No results for DOI: {doi} on libgen")
279
- return None
280
-
281
- soup = BeautifulSoup(response.text, 'html.parser')
282
-
283
- # Find the link using a specific selector
284
- links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
285
-
286
- if links:
287
- link = links[0]
288
- pdf_url = link['href']
289
- pdf_response = requests.get(pdf_url, headers=self.headers, allow_redirects=True, timeout=10)
290
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
291
- logger.debug(f"Found PDF from: {pdf_url}")
292
- return pdf_response.content
293
 
294
- except Exception as e:
295
  logger.debug(f"Error trying to download {doi} from libgen: {e}")
296
- return None
297
-
298
  def download_paper_google_scholar(self, doi):
299
  """Search google scholar to find an article with the given doi, try to get the pdf"""
300
  if not doi:
301
  return None
302
-
303
  try:
304
-
305
- query = f'doi:"{doi}"'
306
- params = {'q': query}
307
- url = f'https://scholar.google.com/scholar?{urlencode(params)}'
308
-
309
- response = requests.get(url, headers = self.headers, timeout = 10)
310
- response.raise_for_status()
311
-
312
- soup = BeautifulSoup(response.text, 'html.parser')
313
-
314
- # Find any links with [PDF]
315
- links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
316
-
317
- if links:
318
- pdf_url = links[0]['href']
319
- pdf_response = requests.get(pdf_url, headers = self.headers, timeout=10)
320
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
321
- logger.debug(f"Found PDF from: {pdf_url}")
322
- return pdf_response.content
323
  except Exception as e:
324
- logger.debug(f"Google Scholar error for {doi}: {e}")
325
 
326
  return None
327
 
@@ -329,16 +365,16 @@ class PaperDownloader:
329
  """Alternative search method using Crossref"""
330
  if not doi:
331
  return None
332
-
333
  try:
334
  # Search for open access link
335
  url = f"https://api.crossref.org/works/{doi}"
336
  response = requests.get(url, headers=self.headers, timeout=10)
337
-
338
  if response.status_code == 200:
339
  data = response.json()
340
  work = data.get('message', {})
341
-
342
  # Search for open access links
343
  links = work.get('link', [])
344
  for link in links:
@@ -349,42 +385,41 @@ class PaperDownloader:
349
  if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
350
  logger.debug(f"Found PDF from: {pdf_url}")
351
  return pdf_response.content
352
-
353
  except Exception as e:
354
  logger.debug(f"Crossref error for {doi}: {e}")
355
-
356
  return None
357
-
358
 
359
  def download_with_retry(self, doi, max_retries=3, initial_delay=2):
360
- """Downloads a paper using multiple strategies with exponential backoff"""
361
- pdf_content = None
362
- retries = 0
363
- delay = initial_delay
364
-
365
- while retries < max_retries and not pdf_content:
366
- try:
367
- pdf_content = (
368
- self.download_paper_scihub(doi) or
369
- self.download_paper_libgen(doi) or
370
- self.download_paper_google_scholar(doi) or
371
- self.download_paper_crossref(doi)
372
-
373
- )
374
-
375
- if pdf_content:
376
- return pdf_content
377
- except Exception as e:
378
- logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
379
-
380
- if not pdf_content:
381
- retries += 1
382
- logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
383
- time.sleep(delay)
384
- delay *= 2 # Exponential backoff
385
-
386
- return None
387
-
388
  def download_single_doi(self, doi):
389
  """Downloads a single paper using a DOI"""
390
  if not doi:
@@ -392,10 +427,10 @@ class PaperDownloader:
392
 
393
  try:
394
  pdf_content = self.download_with_retry(doi)
395
-
396
  if pdf_content:
397
  if doi is None:
398
- return None, "Error: DOI not provided", "Error: DOI not provided"
399
  filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
400
  filepath = os.path.join(self.output_dir, filename)
401
  with open(filepath, 'wb') as f:
@@ -409,12 +444,12 @@ class PaperDownloader:
409
  except Exception as e:
410
  logger.error(f"Error processing {doi}: {e}")
411
  return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
412
-
413
  def download_multiple_dois(self, dois_text):
414
  """Downloads multiple papers from a list of DOIs"""
415
  if not dois_text:
416
  return None, "Error: No DOIs provided", "Error: No DOIs provided"
417
-
418
  dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
419
  if not dois:
420
  return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"
@@ -425,12 +460,12 @@ class PaperDownloader:
425
  for i, doi in enumerate(tqdm(dois, desc="Downloading papers")):
426
  filepath, success_message, fail_message = self.download_single_doi(doi)
427
  if filepath:
428
- # Unique filename for zip
429
- filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
430
- filepath_unique = os.path.join(self.output_dir, filename)
431
- os.rename(filepath,filepath_unique)
432
- downloaded_files.append(filepath_unique)
433
- downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
434
 
435
  else:
436
  failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
@@ -444,23 +479,22 @@ class PaperDownloader:
444
 
445
  return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
446
 
447
-
448
  def process_bibtex(self, bib_file):
449
  """Process BibTeX file and download papers with multiple strategies"""
450
  # Read BibTeX file content from the uploaded object
451
  try:
452
  with open(bib_file.name, 'r', encoding='utf-8') as f:
453
- bib_content = f.read()
454
  except Exception as e:
455
- logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
456
- return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None
457
 
458
  # Parse BibTeX data
459
  try:
460
  bib_database = bibtexparser.loads(bib_content)
461
  except Exception as e:
462
- logger.error(f"Error parsing BibTeX data: {e}")
463
- return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None
464
 
465
  # Extract DOIs
466
  dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
@@ -476,17 +510,17 @@ class PaperDownloader:
476
  try:
477
  # Try to download with multiple methods with retries
478
  pdf_content = self.download_with_retry(doi)
479
-
480
  # Save PDF
481
  if pdf_content:
482
  if doi is None:
483
  return None, "Error: DOI not provided", "Error: DOI not provided", None
484
  filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
485
  filepath = os.path.join(self.output_dir, filename)
486
-
487
  with open(filepath, 'wb') as f:
488
  f.write(pdf_content)
489
-
490
  downloaded_files.append(filepath)
491
  downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
492
  logger.info(f"Successfully downloaded: {filename}")
@@ -512,17 +546,17 @@ class PaperDownloader:
512
  # Read BibTeX file content from the uploaded object
513
  try:
514
  with open(bib_file.name, 'r', encoding='utf-8') as f:
515
- bib_content = f.read()
516
  except Exception as e:
517
- logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
518
- return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None
519
 
520
  # Parse BibTeX data
521
  try:
522
  bib_database = bibtexparser.loads(bib_content)
523
  except Exception as e:
524
- logger.error(f"Error parsing BibTeX data: {e}")
525
- return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None
526
 
527
  # Extract DOIs
528
  dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
@@ -532,23 +566,23 @@ class PaperDownloader:
532
  downloaded_files = []
533
  failed_dois = []
534
  downloaded_links = []
535
-
536
  # Download PDFs
537
  for doi in tqdm(dois, desc="Downloading papers"):
538
  try:
539
  # Try to download with multiple methods with retries
540
  pdf_content = await self.download_with_retry_async(doi)
541
-
542
  # Save PDF
543
  if pdf_content:
544
  if doi is None:
545
  return None, "Error: DOI not provided", "Error: DOI not provided", None
546
  filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
547
  filepath = os.path.join(self.output_dir, filename)
548
-
549
  with open(filepath, 'wb') as f:
550
  f.write(pdf_content)
551
-
552
  downloaded_files.append(filepath)
553
  downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
554
  logger.info(f"Successfully downloaded: {filename}")
@@ -566,7 +600,7 @@ class PaperDownloader:
566
  for file_path in downloaded_files:
567
  zipf.write(file_path, arcname=os.path.basename(file_path))
568
  logger.info(f"ZIP file created: {zip_filename}")
569
-
570
  return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None
571
 
572
  def create_gradio_interface():
@@ -585,11 +619,10 @@ def create_gradio_interface():
585
  filepath, message, failed_doi = downloader.download_single_doi(doi_input)
586
  return None, message, failed_doi, filepath
587
  elif dois_input:
588
- zip_path, downloaded_dois, failed_dois = downloader.download_multiple_dois(dois_input)
589
- return zip_path, downloaded_dois, failed_dois, None
590
  else:
591
- return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
592
-
593
 
594
  # Gradio Interface
595
  interface = gr.Interface(
@@ -623,11 +656,11 @@ def create_gradio_interface():
623
  description="Upload a BibTeX file or enter DOIs to download PDFs. We'll attempt to fetch PDFs from multiple sources like Sci-Hub, Libgen, Google Scholar and Crossref. You can use any of the three inputs at any moment.",
624
  theme="Hev832/Applio",
625
  examples=[
626
- ["example.bib", None, None], # Bibtex File
627
- [None, "10.1038/nature12373", None], # Single DOI
628
- [None, None, "10.1109/5.771073\n10.3390/horticulturae8080677"], # Multiple DOIs
629
- ],
630
- css="""
631
  .gradio-container {
632
  background-color: black;
633
  }
@@ -644,9 +677,9 @@ def create_gradio_interface():
644
  color: #007bff; /* Blue color for hyperlinks */
645
  }
646
  """,
647
- cache_examples = False,
648
  )
649
-
650
  # Add Javascript to update HTML
651
  interface.load = """
652
  function(downloaded_dois, failed_dois){
@@ -655,7 +688,7 @@ def create_gradio_interface():
655
  downloaded_html += '[' + doi + ']<br>';
656
  });
657
  document.querySelector("#downloaded-dois").innerHTML = downloaded_html;
658
-
659
  let failed_html = '';
660
  failed_dois.split('\\n').filter(Boolean).forEach(doi => {
661
  failed_html += '[' + doi + ']<br>';
@@ -666,9 +699,11 @@ def create_gradio_interface():
666
  """
667
  return interface
668
 
669
  def main():
670
  interface = create_gradio_interface()
671
  interface.launch(share=True)
672
 
 
  if __name__ == "__main__":
674
  main()
 
14
  import aiohttp
15
 
16
  # Configure logging
17
+ logging.basicConfig(level=logging.INFO,
18
  format='%(asctime)s - %(levelname)s: %(message)s')
19
  logger = logging.getLogger(__name__)
20
 
21
+
22
  class PaperDownloader:
23
  def __init__(self, output_dir='papers'):
24
  self.output_dir = output_dir
25
  os.makedirs(output_dir, exist_ok=True)
26
+
27
+ # Updated download sources
28
  self.download_sources = [
29
  'https://sci-hub.ee/',
30
  'https://sci-hub.st/',
 
41
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
42
  'Accept-Language': 'en-US,en;q=0.9',
43
  }
44
+
45
  def clean_doi(self, doi):
46
  """Clean and encode DOI for URL"""
47
  if not isinstance(doi, str):
48
+ return None
49
  return quote(doi.strip()) if doi else None
50
+
51
  async def fetch_with_headers(self, session, url, timeout=10):
52
  """Utility method to fetch an URL with headers and timeout"""
53
  try:
54
+ async with session.get(url, headers=self.headers, timeout=timeout, allow_redirects=True) as response:
55
  response.raise_for_status()
56
  return await response.text(), response.headers
57
  except Exception as e:
58
  logger.debug(f"Error fetching {url}: {e}")
59
  return None, None
60
+
61
+
62
+ async def download_paper_direct_doi_async(self, session, doi):
63
+ """Attempt to download the pdf from the landing page of the doi"""
64
+ if not doi:
65
+ return None
66
+
67
+ try:
68
+ doi_url = f"https://doi.org/{self.clean_doi(doi)}"
69
+ text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
70
+ if not text:
71
+ return None
72
+
73
+ pdf_patterns = [
74
+ r'(https?://[^\s<>"]+?\.pdf)',
75
+ r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
76
+ r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
77
+ ]
78
+
79
+ pdf_urls = []
80
+ for pattern in pdf_patterns:
81
+ pdf_urls.extend(re.findall(pattern, text))
82
+
83
+ for pdf_url in pdf_urls:
84
+ try:
85
+ pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
86
+ if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
87
+ logger.debug(f"Found PDF from: {pdf_url}")
88
+ return await pdf_response.read()
89
+ except Exception as e:
90
+ logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
91
+
92
+
93
+ except Exception as e:
94
+ logger.debug(f"Error trying to get the PDF from {doi}: {e}")
95
+
96
+ return None
97
 
98
  async def download_paper_scihub_async(self, session, doi):
99
  """Improved method to download paper from Sci-Hub using async requests"""
 
106
  scihub_url = f"{base_url}{self.clean_doi(doi)}"
107
  text, headers = await self.fetch_with_headers(session, scihub_url, timeout=15)
108
  if not text:
109
+ continue
110
 
111
  # Search for multiple PDF URL patterns
112
  pdf_patterns = [
 
114
  r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
115
  r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
116
  ]
117
+
118
  pdf_urls = []
119
  for pattern in pdf_patterns:
120
  pdf_urls.extend(re.findall(pattern, text))
121
+
122
  # Try downloading from found URLs
123
  for pdf_url in pdf_urls:
124
  try:
125
  pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
126
  # Verify if it's a PDF
127
  if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
128
+ logger.debug(f"Found PDF from: {pdf_url}")
129
+ return await pdf_response.read()
130
  except Exception as e:
131
  logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
132
+
133
  except Exception as e:
134
  logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
135
+
136
  return None
137
+
138
  async def download_paper_libgen_async(self, session, doi):
139
  """Download from Libgen, handles the query and the redirection"""
140
  if not doi:
 
142
 
143
  base_url = 'https://libgen.rs/scimag/'
144
  try:
145
+ search_url = f"{base_url}?q={self.clean_doi(doi)}"
146
+ text, headers = await self.fetch_with_headers(session, search_url, timeout=10)
147
+
148
+ if not text or "No results" in text:
149
+ logger.debug(f"No results for DOI: {doi} on libgen")
150
+ return None
151
+
152
+ soup = BeautifulSoup(text, 'html.parser')
153
+
154
+ links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
155
+
156
+ if links:
157
+ link = links[0]
158
+ pdf_url = link['href']
159
+ pdf_response = await session.get(pdf_url, headers=self.headers, allow_redirects=True, timeout=10)
160
+ if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
161
+ logger.debug(f"Found PDF from: {pdf_url}")
162
+ return await pdf_response.read()
163
  except Exception as e:
164
  logger.debug(f"Error trying to download {doi} from libgen: {e}")
165
  return None
166
+
167
  async def download_paper_google_scholar_async(self, session, doi):
168
  """Search google scholar to find an article with the given doi, try to get the pdf"""
169
  if not doi:
170
  return None
171
+
172
  try:
173
+ query = f'doi:"{doi}"'
174
+ params = {'q': query}
175
+ url = f'https://scholar.google.com/scholar?{urlencode(params)}'
176
+
177
+ text, headers = await self.fetch_with_headers(session, url, timeout=10)
178
+ if not text:
179
+ return None
180
+
181
+ soup = BeautifulSoup(text, 'html.parser')
182
+
183
+ # Find any links with [PDF]
184
+ links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
185
+
186
+ if links:
187
+ pdf_url = links[0]['href']
188
+ pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
189
+ if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
190
+ logger.debug(f"Found PDF from: {pdf_url}")
191
+ return await pdf_response.read()
 
192
  except Exception as e:
193
+ logger.debug(f"Google Scholar error for {doi}: {e}")
194
 
195
  return None
196
 
 
198
  """Alternative search method using Crossref"""
199
  if not doi:
200
  return None
201
+
202
  try:
203
  # Search for open access link
204
  url = f"https://api.crossref.org/works/{doi}"
 
207
  if response.status == 200:
208
  data = await response.json()
209
  work = data.get('message', {})
210
+
211
  # Search for open access links
212
  links = work.get('link', [])
213
  for link in links:
214
  if link.get('content-type') == 'application/pdf':
215
  pdf_url = link.get('URL')
216
  if pdf_url:
217
+ pdf_response = await session.get(pdf_url, headers=self.headers)
218
+ if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
219
+ logger.debug(f"Found PDF from: {pdf_url}")
220
+ return await pdf_response.read()
221
+
222
  except Exception as e:
223
  logger.debug(f"Crossref error for {doi}: {e}")
224
+
225
  return None
 
226
 
227
  async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
228
+ """Downloads a paper using multiple strategies with exponential backoff and async requests"""
229
+ pdf_content = None
230
+ retries = 0
231
+ delay = initial_delay
232
+
233
+ async with aiohttp.ClientSession() as session:
234
+ while retries < max_retries and not pdf_content:
235
+ try:
236
+ pdf_content = (
237
+ await self.download_paper_direct_doi_async(session, doi) or
238
+ await self.download_paper_scihub_async(session, doi) or
239
+ await self.download_paper_libgen_async(session, doi) or
240
+ await self.download_paper_google_scholar_async(session, doi) or
241
+ await self.download_paper_crossref_async(session, doi)
242
+
243
+ )
244
+ if pdf_content:
245
+ return pdf_content
246
+ except Exception as e:
247
+ logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
248
+
249
+ if not pdf_content:
250
+ retries += 1
251
+ logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
252
+ await asyncio.sleep(delay)
253
+ delay *= 2 # Exponential backoff
254
+
255
+ return None
256
257
  def download_paper_scihub(self, doi):
258
  """Improved method to download paper from Sci-Hub"""
259
  if not doi:
 
263
  for base_url in self.download_sources:
264
  try:
265
  scihub_url = f"{base_url}{self.clean_doi(doi)}"
266
+
267
  # Request with more tolerance
268
+ response = requests.get(scihub_url,
269
+ headers=self.headers,
270
+ allow_redirects=True,
271
  timeout=15)
272
+
273
  # Search for multiple PDF URL patterns
274
  pdf_patterns = [
275
  r'(https?://[^\s<>"]+?\.pdf)',
276
  r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
277
  r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
278
  ]
279
+
280
  pdf_urls = []
281
  for pattern in pdf_patterns:
282
  pdf_urls.extend(re.findall(pattern, response.text))
283
+
284
  # Try downloading from found URLs
285
  for pdf_url in pdf_urls:
286
  try:
287
+ pdf_response = requests.get(pdf_url,
288
+ headers=self.headers,
289
  timeout=10)
290
+
291
  # Verify if it's a PDF
292
  if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
293
  logger.debug(f"Found PDF from: {pdf_url}")
294
  return pdf_response.content
295
  except Exception as e:
296
  logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
297
+
298
  except Exception as e:
299
  logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
300
+
301
  return None
302
+
303
  def download_paper_libgen(self, doi):
304
+ """Download from Libgen, handles the query and the redirection"""
305
+ if not doi:
306
+ return None
307
 
308
+ base_url = 'https://libgen.rs/scimag/'
309
+ try:
310
+ search_url = f"{base_url}?q={self.clean_doi(doi)}"
311
+ response = requests.get(search_url, headers=self.headers, allow_redirects=True, timeout=10)
312
+ response.raise_for_status()
313
 
314
+ if "No results" in response.text:
315
+ logger.debug(f"No results for DOI: {doi} on libgen")
316
+ return None
317
+
318
+ soup = BeautifulSoup(response.text, 'html.parser')
319
+
320
+ # Find the link using a specific selector
321
+ links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
322
+
323
+ if links:
324
+ link = links[0]
325
+ pdf_url = link['href']
326
+ pdf_response = requests.get(pdf_url, headers=self.headers, allow_redirects=True, timeout=10)
327
+ if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
328
+ logger.debug(f"Found PDF from: {pdf_url}")
329
+ return pdf_response.content
330
+
331
+ except Exception as e:
332
  logger.debug(f"Error trying to download {doi} from libgen: {e}")
333
+ return None
334
+
335
  def download_paper_google_scholar(self, doi):
336
  """Search google scholar to find an article with the given doi, try to get the pdf"""
337
  if not doi:
338
  return None
339
+
340
  try:
341
+ query = f'doi:"{doi}"'
342
+ params = {'q': query}
343
+ url = f'https://scholar.google.com/scholar?{urlencode(params)}'
344
+
345
+ response = requests.get(url, headers=self.headers, timeout=10)
346
+ response.raise_for_status()
347
+
348
+ soup = BeautifulSoup(response.text, 'html.parser')
349
+
350
+ # Find any links with [PDF]
351
+ links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
352
+
353
+ if links:
354
+ pdf_url = links[0]['href']
355
+ pdf_response = requests.get(pdf_url, headers=self.headers, timeout=10)
356
+ if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
357
+ logger.debug(f"Found PDF from: {pdf_url}")
358
+ return pdf_response.content
 
359
  except Exception as e:
360
+ logger.debug(f"Google Scholar error for {doi}: {e}")
361
 
362
  return None
363
 
 
365
  """Alternative search method using Crossref"""
366
  if not doi:
367
  return None
368
+
369
  try:
370
  # Search for open access link
371
  url = f"https://api.crossref.org/works/{doi}"
372
  response = requests.get(url, headers=self.headers, timeout=10)
373
+
374
  if response.status_code == 200:
375
  data = response.json()
376
  work = data.get('message', {})
377
+
378
  # Search for open access links
379
  links = work.get('link', [])
380
  for link in links:
 
385
  if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
386
  logger.debug(f"Found PDF from: {pdf_url}")
387
  return pdf_response.content
388
+
389
  except Exception as e:
390
  logger.debug(f"Crossref error for {doi}: {e}")
391
+
392
  return None
 
393
 
394
  def download_with_retry(self, doi, max_retries=3, initial_delay=2):
395
+ """Downloads a paper using multiple strategies with exponential backoff"""
396
+ pdf_content = None
397
+ retries = 0
398
+ delay = initial_delay
399
+
400
+ while retries < max_retries and not pdf_content:
401
+ try:
402
+ pdf_content = (
403
+ self.download_paper_scihub(doi) or
404
+ self.download_paper_libgen(doi) or
405
+ self.download_paper_google_scholar(doi) or
406
+ self.download_paper_crossref(doi)
407
+
408
+ )
409
+
410
+ if pdf_content:
411
+ return pdf_content
412
+ except Exception as e:
413
+ logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
414
+
415
+ if not pdf_content:
416
+ retries += 1
417
+ logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
418
+ time.sleep(delay)
419
+ delay *= 2 # Exponential backoff
420
+
421
+ return None
422
+
423
  def download_single_doi(self, doi):
424
  """Downloads a single paper using a DOI"""
425
  if not doi:
 
427
 
428
  try:
429
  pdf_content = self.download_with_retry(doi)
430
+
431
  if pdf_content:
432
  if doi is None:
433
+ return None, "Error: DOI not provided", "Error: DOI not provided"
434
  filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
435
  filepath = os.path.join(self.output_dir, filename)
436
  with open(filepath, 'wb') as f:
 
444
  except Exception as e:
445
  logger.error(f"Error processing {doi}: {e}")
446
  return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
447
+
448
  def download_multiple_dois(self, dois_text):
449
  """Downloads multiple papers from a list of DOIs"""
450
  if not dois_text:
451
  return None, "Error: No DOIs provided", "Error: No DOIs provided"
452
+
453
  dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
454
  if not dois:
455
  return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"
 
460
  for i, doi in enumerate(tqdm(dois, desc="Downloading papers")):
461
  filepath, success_message, fail_message = self.download_single_doi(doi)
462
  if filepath:
463
+ # Unique filename for zip
464
+ filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
465
+ filepath_unique = os.path.join(self.output_dir, filename)
466
+ os.rename(filepath, filepath_unique)
467
+ downloaded_files.append(filepath_unique)
468
+ downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
469
 
470
  else:
471
  failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
 
479
 
480
  return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
481
 
 
482
  def process_bibtex(self, bib_file):
483
  """Process BibTeX file and download papers with multiple strategies"""
484
  # Read BibTeX file content from the uploaded object
485
  try:
486
  with open(bib_file.name, 'r', encoding='utf-8') as f:
487
+ bib_content = f.read()
488
  except Exception as e:
489
+ logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
490
+ return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None
491
 
492
  # Parse BibTeX data
493
  try:
494
  bib_database = bibtexparser.loads(bib_content)
495
  except Exception as e:
496
+ logger.error(f"Error parsing BibTeX data: {e}")
497
+ return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None
498
 
499
  # Extract DOIs
500
  dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
 
510
  try:
511
  # Try to download with multiple methods with retries
512
  pdf_content = self.download_with_retry(doi)
513
+
514
  # Save PDF
515
  if pdf_content:
516
  if doi is None:
517
  return None, "Error: DOI not provided", "Error: DOI not provided", None
518
  filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
519
  filepath = os.path.join(self.output_dir, filename)
520
+
521
  with open(filepath, 'wb') as f:
522
  f.write(pdf_content)
523
+
524
  downloaded_files.append(filepath)
525
  downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
526
  logger.info(f"Successfully downloaded: {filename}")
 
546
  # Read BibTeX file content from the uploaded object
547
  try:
548
  with open(bib_file.name, 'r', encoding='utf-8') as f:
549
+ bib_content = f.read()
550
  except Exception as e:
551
+ logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
552
+ return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None
553
 
554
  # Parse BibTeX data
555
  try:
556
  bib_database = bibtexparser.loads(bib_content)
557
  except Exception as e:
558
+ logger.error(f"Error parsing BibTeX data: {e}")
559
+ return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None
560
 
561
  # Extract DOIs
562
  dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
 
566
  downloaded_files = []
567
  failed_dois = []
568
  downloaded_links = []
569
+
570
  # Download PDFs
571
  for doi in tqdm(dois, desc="Downloading papers"):
572
  try:
573
  # Try to download with multiple methods with retries
574
  pdf_content = await self.download_with_retry_async(doi)
575
+
576
  # Save PDF
577
  if pdf_content:
578
  if doi is None:
579
  return None, "Error: DOI not provided", "Error: DOI not provided", None
580
  filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
581
  filepath = os.path.join(self.output_dir, filename)
582
+
583
  with open(filepath, 'wb') as f:
584
  f.write(pdf_content)
585
+
586
  downloaded_files.append(filepath)
587
  downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
588
  logger.info(f"Successfully downloaded: {filename}")
 
600
  for file_path in downloaded_files:
601
  zipf.write(file_path, arcname=os.path.basename(file_path))
602
  logger.info(f"ZIP file created: {zip_filename}")
603
+
604
  return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None
605
 
606
  def create_gradio_interface():
 
619
  filepath, message, failed_doi = downloader.download_single_doi(doi_input)
620
  return None, message, failed_doi, filepath
621
  elif dois_input:
622
+ zip_path, downloaded_dois, failed_dois = downloader.download_multiple_dois(dois_input)
623
+ return zip_path, downloaded_dois, failed_dois, None
624
  else:
625
+ return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
 
626
 
627
  # Gradio Interface
628
  interface = gr.Interface(
 
656
  description="Upload a BibTeX file or enter DOIs to download PDFs. We'll attempt to fetch PDFs from multiple sources like Sci-Hub, Libgen, Google Scholar and Crossref. You can use any of the three inputs at any moment.",
657
  theme="Hev832/Applio",
658
  examples=[
659
+ ["example.bib", None, None], # Bibtex File
660
+ [None, "10.1038/nature12373", None], # Single DOI
661
+ [None, None, "10.1109/5.771073\n10.3390/horticulturae8080677"], # Multiple DOIs
662
+ ],
663
+ css="""
664
  .gradio-container {
665
  background-color: black;
666
  }
 
677
  color: #007bff; /* Blue color for hyperlinks */
678
  }
679
  """,
680
+ cache_examples=False,
681
  )
682
+
683
  # Add Javascript to update HTML
684
  interface.load = """
685
  function(downloaded_dois, failed_dois){
 
688
  downloaded_html += '[' + doi + ']<br>';
689
  });
690
  document.querySelector("#downloaded-dois").innerHTML = downloaded_html;
691
+
692
  let failed_html = '';
693
  failed_dois.split('\\n').filter(Boolean).forEach(doi => {
694
  failed_html += '[' + doi + ']<br>';
 
699
  """
700
  return interface
701
 
702
+
703
  def main():
704
  interface = create_gradio_interface()
705
  interface.launch(share=True)
706
 
707
+
708
  if __name__ == "__main__":
709
  main()
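
For quick reference, a minimal usage sketch of the PaperDownloader class committed above. It is illustrative only: it assumes app.py is importable from the working directory and uses only the methods and return shapes visible in the diff (download_single_doi and download_multiple_dois each return three values; download_with_retry_async returns raw PDF bytes or None). The example DOIs are taken from the interface's own examples list.

# Illustrative sketch; assumes app.py (as committed above) is on the import path.
import asyncio
from app import PaperDownloader

downloader = PaperDownloader(output_dir='papers')

# Single DOI: returns (filepath, success_message, failure_message).
filepath, ok_msg, fail_msg = downloader.download_single_doi("10.1038/nature12373")
print(filepath or fail_msg)

# Multiple DOIs, newline-separated: returns (zip_path, downloaded_links, failed_dois).
zip_path, downloaded, failed = downloader.download_multiple_dois(
    "10.1109/5.771073\n10.3390/horticulturae8080677"
)
print(zip_path, failed)

# Async path with retries and exponential backoff: returns PDF bytes or None.
pdf_bytes = asyncio.run(downloader.download_with_retry_async("10.1038/nature12373"))
print(len(pdf_bytes) if pdf_bytes else "not found")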