File size: 3,787 Bytes
16739be 0c4db8b 16739be 0c4db8b 16739be |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
import gradio as gr
import requests
from bs4 import BeautifulSoup
import pandas as pd
import tempfile
def scrape_news(keyword):
    """Scrape Naver news search results for *keyword*.

    Returns a tuple ``(html_table, tmp_path)``:
      - ``html_table``: HTML rendering of the results with clickable links
        (or a "no results" paragraph when nothing was found);
      - ``tmp_path``: path of a temporary ``.xlsx`` file holding the same
        rows, with links written as Excel ``HYPERLINK`` formulas.
    """
    # Build the search URL via `params=` so requests percent-encodes the
    # keyword — raw concatenation broke/garbled non-ASCII (Korean) queries.
    url = "https://search.naver.com/search.naver"
    params = {
        "sm": "tab_hty.top",
        "where": "news",
        "ssc": "tab.news.all",
        "query": keyword,
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
    }
    # Timeout so a stalled connection cannot hang the UI; surface HTTP
    # errors instead of silently parsing an error page as "no results".
    res = requests.get(url, params=params, headers=headers, timeout=10)
    res.raise_for_status()
    res.encoding = "utf-8"
    soup = BeautifulSoup(res.text, "html.parser")

    news_list = []
    # Each search hit lives in a <div class="news_area"> container.
    for area in soup.find_all("div", class_="news_area"):
        try:
            # Publisher and publish date sit inside news_info > info_group.
            info_group = area.find("div", class_="news_info").find("div", class_="info_group")
            publisher_tag = info_group.find("a", class_="info press")
            publisher = publisher_tag.get_text(strip=True) if publisher_tag else ""
            date_tag = info_group.find("span", class_="info")
            date = date_tag.get_text(strip=True) if date_tag else ""
            # Title and link come from the news_tit anchor.
            title_tag = area.find("a", class_="news_tit")
            title = title_tag.get("title", title_tag.get_text(strip=True)) if title_tag else ""
            link = title_tag.get("href") if title_tag else ""
            # Short summary snippet.
            brief_tag = area.find("a", class_="api_txt_lines dsc_txt_wrap")
            brief = brief_tag.get_text(strip=True) if brief_tag else ""
        except AttributeError:
            # A .find() above returned None (layout changed for this item):
            # skip this entry and keep the rest — deliberate best-effort.
            continue
        news_list.append({
            "μ λ¬Έμ¬": publisher,
            "λ°νμΌ": date,
            "μ λͺ©": title,
            "λ΄μ€κ°λ΅μ 보": brief,
            "λ§ν¬": link,
        })

    df = pd.DataFrame(news_list)

    # HTML table for the UI (links rendered as clickable <a> tags).
    if df.empty:
        html_table = "<p>κ²μ κ²°κ³Όκ° μμ΅λλ€.</p>"
    else:
        df["λ§ν¬"] = df["λ§ν¬"].apply(lambda x: f'<a href="{x}" target="_blank">{x}</a>')
        html_table = df.to_html(escape=False, index=False)

    # Excel copy: links become clickable HYPERLINK formulas.
    # NOTE(review): a '"' inside a URL would break the formula — assumed
    # not to occur in Naver result links; verify if sources change.
    df_excel = pd.DataFrame(news_list)
    if not df_excel.empty:
        df_excel["λ§ν¬"] = df_excel["λ§ν¬"].apply(lambda x: f'=HYPERLINK("{x}", "λ§ν¬")')

    # Write to a named temp file: gr.File needs a filesystem path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
        tmp_path = tmp.name
    with pd.ExcelWriter(tmp_path, engine="openpyxl") as writer:
        df_excel.to_excel(writer, index=False, sheet_name="News")

    return html_table, tmp_path
# Gradio front-end: a single-column page that takes a keyword, runs the
# scraper, shows the results as an HTML table, and offers the Excel file.
with gr.Blocks() as demo:
    gr.Markdown("# λ€μ΄λ² λ΄μ€ μ€ν¬λν")
    gr.Markdown("μλ ₯ν κ²μμ΄λ₯Ό κΈ°λ°μΌλ‘ λ€μ΄λ² λ΄μ€ μ 보λ₯Ό μ€ν¬λννκ³ HTML νμ μμλ€μ΄λ‘λλ₯Ό μ 곡ν©λλ€.")

    # Widgets stacked one per row, in display order.
    query_box = gr.Textbox(label="κ²μμ΄", placeholder="λ΄μ€ κ²μμ΄λ₯Ό μλ ₯νμΈμ.")
    run_button = gr.Button("κ²μ")
    result_html = gr.HTML(label="λ΄μ€ κ²°κ³Ό")
    xlsx_download = gr.File(label="μμλ€μ΄λ‘λ")

    # Wire the button: scrape_news returns (html, xlsx_path) matching
    # the two outputs in order.
    run_button.click(fn=scrape_news, inputs=query_box, outputs=[result_html, xlsx_download])

if __name__ == "__main__":
    demo.launch()