davidpengg committed on
Commit
f9b19f4
1 Parent(s): 43e0ac1

error handling

Browse files
Files changed (2) hide show
  1. app.py +8 -1
  2. download_pdf.py +6 -11
app.py CHANGED
@@ -11,6 +11,13 @@ examples = [
11
  "https://indianculture.gov.in/reports-proceedings/report-village-and-cottage-industries-national-committee-development-backward"
12
  ]
13
 
 
 
 
 
 
 
 
14
  with gr.Blocks() as app:
15
  gr.Markdown("# <p align='center'>Extract PDF from indianculture[dot]gov[dot]in</p>")
16
  # with gr.Row():
@@ -25,7 +32,7 @@ with gr.Blocks() as app:
25
  gr.Examples(examples=examples,inputs=landing_page_url,outputs=pdf_file)
26
 
27
  landing_page_url_btrn.click(
28
- download,
29
  inputs=landing_page_url,
30
  outputs=pdf_file
31
  )
 
11
  "https://indianculture.gov.in/reports-proceedings/report-village-and-cottage-industries-national-committee-development-backward"
12
  ]
13
 
14
def try_download(url):
    """Run ``download`` for *url* and surface any failure as a Gradio error.

    Args:
        url: Landing-page URL passed straight through to ``download``.

    Returns:
        Whatever ``download(url)`` returns.

    Raises:
        gr.Error: If ``download`` raises any ``Exception``. The message is the
            original exception's text, or its class name when that text is
            empty.
    """
    try:
        return download(url)
    except Exception as err:
        # An exception created without arguments stringifies to "", which
        # would give an empty error message; fall back to the exception's
        # class name so the failure is still identifiable.
        message = str(err) or type(err).__name__
        # Chain explicitly so the original traceback is kept as the cause.
        raise gr.Error(message) from err
20
+
21
  with gr.Blocks() as app:
22
  gr.Markdown("# <p align='center'>Extract PDF from indianculture[dot]gov[dot]in</p>")
23
  # with gr.Row():
 
32
  gr.Examples(examples=examples,inputs=landing_page_url,outputs=pdf_file)
33
 
34
  landing_page_url_btrn.click(
35
+ try_download,
36
  inputs=landing_page_url,
37
  outputs=pdf_file
38
  )
download_pdf.py CHANGED
@@ -6,22 +6,17 @@ David Peng
6
  import requests
7
  from bs4 import BeautifulSoup as bs
8
  from urllib.parse import unquote
9
- import time
10
  import os
11
 
12
  DEFAULT_TIMEOUT = 10
13
- RETURN_CODE = 0
14
 
15
  # script borrowed from https://github.com/lalitaalaalitah/Scrape_IndianCulture.Gov.In_Release
16
  def download(book_page_url):
17
- while RETURN_CODE == 0 :
18
- try:
19
- book_page_get = requests.get(book_page_url, timeout=DEFAULT_TIMEOUT)
20
- except:
21
- continue
22
- if book_page_get.status_code == 200:
23
- break
24
- time.sleep(10)
25
  book_page_get = requests.get(book_page_url)
26
  parsed_book_page = bs(book_page_get.content, 'html.parser')
27
  class_pdf_in_page = parsed_book_page.find_all('iframe', class_='pdf')
@@ -40,4 +35,4 @@ def download(book_page_url):
40
  os.system(cmd_for_curl)
41
  return pdf_name
42
  else:
43
- return None
 
6
  import requests
7
  from bs4 import BeautifulSoup as bs
8
  from urllib.parse import unquote
 
9
  import os
10
 
11
  DEFAULT_TIMEOUT = 10
 
12
 
13
  # script borrowed from https://github.com/lalitaalaalitah/Scrape_IndianCulture.Gov.In_Release
14
  def download(book_page_url):
15
+ try:
16
+ book_page_get = requests.get(book_page_url, timeout=DEFAULT_TIMEOUT)
17
+ except Exception:
18
+ raise Exception("Bad URL!")
19
+
 
 
 
20
  book_page_get = requests.get(book_page_url)
21
  parsed_book_page = bs(book_page_get.content, 'html.parser')
22
  class_pdf_in_page = parsed_book_page.find_all('iframe', class_='pdf')
 
35
  os.system(cmd_for_curl)
36
  return pdf_name
37
  else:
38
+ raise Exception("Unexpected number of PDFs (=/= 1)!")