# Gradio Space: fetch a URL (typically a PDF) and render it in an HTML component.
from pathlib import Path

import gradio as gr
import requests
from bs4 import BeautifulSoup as bs
from pypdf import PdfReader
def scrape(instring):
    """Download the PDF at *instring*, save it as ``data.pdf``, and return a
    ``gr.HTML`` update that embeds the saved file.

    Parameters
    ----------
    instring : str
        URL of a PDF document.

    Returns
    -------
    dict
        A ``gr.HTML.update`` payload containing either the ``<embed>`` markup
        or an error message when the download fails.
    """
    response = requests.get(instring, stream=True)
    if response.status_code != 200:
        # Bail out early: the original fell through and embedded a stale or
        # missing data.pdf after a failed download.
        print(response.status_code)
        return gr.HTML.update(
            f"<p>Download failed with HTTP status {response.status_code}</p>"
        )
    with open("data.pdf", "wb") as f:
        f.write(response.content)
    out = Path("data.pdf")
    print(out)
    # Sanity-check that the download really is a parseable PDF so a broken
    # file fails loudly here rather than rendering as an empty embed.
    reader = PdfReader("data.pdf")
    print(f"{len(reader.pages)} page(s)")
    # src is quoted — an unquoted attribute breaks on paths with spaces.
    return gr.HTML.update(
        f'<embed src="{out}" type="application/pdf" width="100%" height="500px" />'
    )
def scrape1(instring):
    """Return a ``gr.HTML`` update embedding *instring* directly via ``<object>``.

    The browser fetches the URL itself, so no server-side download is needed.
    The original version performed a full GET plus a BeautifulSoup parse whose
    result was never used — a wasted network round-trip, removed here.

    Parameters
    ----------
    instring : str
        URL of a PDF document to embed.
    """
    # The data attribute is quoted and the tag closed; the original emitted an
    # unquoted, unclosed <object>, which breaks on URLs containing spaces or
    # ampersands and produces invalid HTML.
    return gr.HTML.update(
        f'<object data="{instring}" type="application/pdf" '
        f'width="100%" height="500px"></object>'
    )
def scrape0(instring):
    """Stream the document at *instring* to ``metadata.pdf`` and return a
    ``gr.HTML`` update embedding it.

    Parameters
    ----------
    instring : str
        URL of a PDF document.
    """
    chunk_size = 2000
    # Single streamed GET. The original fetched the URL twice (once streamed,
    # once via .text for an unused BeautifulSoup parse), then touched
    # r.content AFTER iter_content had consumed the stream — which raises
    # RuntimeError in requests and was silently swallowed by a misleading
    # 'No Divs' handler. All of that dead/broken code is removed.
    r = requests.get(f"{instring}", stream=True)
    with open("metadata.pdf", "wb") as fd:
        for chunk in r.iter_content(chunk_size):
            fd.write(chunk)
    # data attribute quoted and tag closed so the markup is valid HTML.
    return gr.HTML.update(
        '<object data="metadata.pdf" type="application/pdf" '
        'width="100%" height="500px"></object>'
    )
# Minimal UI: URL textbox -> button -> rendered HTML output pane.
with gr.Blocks() as app:
    url_box = gr.Textbox()
    fetch_btn = gr.Button()
    html_out = gr.HTML()
    # Clicking the button downloads the PDF at the given URL and embeds it.
    fetch_btn.click(scrape, url_box, html_out)

app.queue(concurrency_count=10).launch()