urlcrawl / app.py
springwater's picture
Update app.py
85e6b2a verified
raw
history blame contribute delete
No virus
958 Bytes
import gradio as gr
import re
import requests
from bs4 import BeautifulSoup
def extract_pdf_links():
    """Collect PDF links from the Naver Finance company research list page.

    Fetches the listing page, scans every ``<a>`` tag that carries an
    ``href`` and keeps those whose href contains ``.pdf``.

    Returns:
        A list of at most 100 link URLs, in document order. Relative hrefs
        are resolved against the listing page URL so they remain usable
        when rendered on a different origin.

    Raises:
        requests.RequestException: If the request times out, the connection
            fails, or the server answers with an HTTP error status.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from urllib.parse import urljoin

    url = 'https://finance.naver.com/research/company_list.naver'
    # Without a timeout, requests waits forever on a stalled server, which
    # would hang the UI callback that calls this function.
    response = requests.get(url, timeout=10)
    # Fail loudly instead of silently parsing an error page as the listing.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    pdf_links = [
        urljoin(url, link['href'])
        for link in soup.find_all('a', href=True)
        if re.search(r'\.pdf', link['href'])
    ]
    return pdf_links[:100]
def generate_html(pdf_links):
    """Render the given links as a block of HTML download anchors.

    Args:
        pdf_links: Iterable of URL strings.

    Returns:
        A single HTML string with one ``<a ... download>`` element per link,
        each followed by ``<br/>``. Returns an empty string when there are
        no links.
    """
    # Imported by name because this module conventionally uses ``html`` as a
    # variable name for markup strings.
    from html import escape

    # The links come from a scraped third-party page, so escape them before
    # placing them inside an attribute value and element text; otherwise a
    # quote or angle bracket in an href could break out of the markup.
    anchors = []
    for link in pdf_links:
        safe = escape(link, quote=True)
        anchors.append(
            f'<a href="{safe}" target="_blank" download>{safe}</a><br/>'
        )
    return "".join(anchors)
def extract_and_download():
    """Gradio callback: scrape the PDF links and return them as HTML.

    Takes no arguments and returns the string produced by passing the
    result of ``extract_pdf_links()`` to ``generate_html()``.
    """
    return generate_html(extract_pdf_links())
# UI title shown at the top of the Gradio page (Korean; roughly "Check the
# latest 30 Naver per-stock brokerage reports right away").
# NOTE(review): the title says 30, but extract_pdf_links() keeps up to 100
# links -- confirm which number is intended.
title = "네이버 종목별 증권리포트 최근 30개를 바로 확인하세요."

# The interface has no input widgets; it renders the callback's returned
# string as HTML.
iface = gr.Interface(
    fn=extract_and_download,
    inputs=[],
    outputs="html",
    title=title,
)

# Start the Gradio server as soon as the module is executed.
iface.launch()