| | import gradio as gr |
| | import trafilatura |
| | import requests |
| | from markdownify import markdownify as md |
| |
|
def extract(url):
    """Fetch a web page and return its main content as Markdown.

    The page is downloaded with ``requests``, the main content is isolated
    with ``trafilatura`` (HTML output, tables included, recall favoured) and
    that HTML is converted to ATX-style Markdown with ``markdownify``.

    Args:
        url: Address of the page to fetch. Surrounding whitespace is
            ignored; ``None`` is treated as an empty string.

    Returns:
        The extracted Markdown on success. On failure, a Korean user-facing
        message describing the problem. Errors are reported through the
        return value rather than raised, so the result can always be
        displayed as-is.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(
            (url or "").strip(), headers=headers, timeout=10
        )
        response.raise_for_status()

        # When a text/* response declares no charset, requests decodes
        # ``.text`` as ISO-8859-1, which garbles UTF-8 / EUC-KR pages.
        # Fall back to the encoding detected from the body in that case;
        # an explicitly declared charset is left untouched.
        content_type = response.headers.get("Content-Type", "")
        if "charset" not in content_type.lower():
            response.encoding = response.apparent_encoding

        html_content = trafilatura.extract(
            response.text,
            output_format="html",
            include_tables=True,
            favor_recall=True,
        )
        if not html_content:
            # "The main content could not be extracted."
            return "본문을 추출할 수 없습니다."
        return md(html_content, heading_style="ATX")
    except requests.exceptions.Timeout:
        # "The request timed out."
        return "요청이 시간 초과되었습니다."
    except requests.exceptions.RequestException as e:
        # "Request failed: ..."
        return f"요청 실패: {e}"
    except Exception as e:
        # Last-resort boundary for extraction/conversion errors, so the UI
        # shows a message instead of a stack trace. "An error occurred: ..."
        return f"에러 발생: {e}"
| |
|
| | iface = gr.Interface( |
| | fn=extract, |
| | inputs=gr.Textbox(label="URL ์
๋ ฅ", placeholder="https://example.com"), |
| | outputs=gr.Markdown(label="์ถ์ถ๋ ๋ณธ๋ฌธ"), |
| | title="๋ณธ๋ฌธ ์ถ์ถ๊ธฐ (๋งํฌ๋ค์ด ์ง์)", |
| | description="์นํ์ด์ง URL์ ์
๋ ฅํ๋ฉด ๋ฆฌ๋๋ชจ๋์ฒ๋ผ ๊น๋ํ๊ฒ ๋งํฌ๋ค์ด์ผ๋ก ์ถ์ถํฉ๋๋ค." |
| | ) |
| |
|
| | if __name__ == "__main__": |
| | iface.launch() |