|
# NOTE: `from datetime import datetime` must stay after `import datetime`
# so that the name `datetime` refers to the class used throughout the file.
import datetime
import html
import logging
import os
import random
import time
from datetime import datetime
from urllib.parse import quote

import gradio as gr
import pandas as pd
import pytz
import requests
import xlsxwriter
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
|
|
|
|
|
# Root logging is configured once at import time; every record is rendered as
# "<timestamp> - <level> - <message>" at INFO level and above.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-wide logger used by every function in this file.
logger = logging.getLogger(__name__)
|
|
|
class ProxyConfig:
    """Proxy settings read from environment variables.

    The values come from ``PROXY_USERNAME``, ``PROXY_PASSWORD``,
    ``PROXY_HOST``, ``PROXY_HTTP_PORT`` and ``PROXY_SOCKS5_PORT``; each entry
    of ``proxy_base`` is ``None`` when its variable is unset.
    """

    def __init__(self):
        env = os.environ.get
        self.proxy_base = {
            "username": env("PROXY_USERNAME"),
            "password": env("PROXY_PASSWORD"),
            "host": env("PROXY_HOST"),
            "ports": {
                "http": env("PROXY_HTTP_PORT"),
                "socks5": env("PROXY_SOCKS5_PORT"),
            },
        }

    def _build_proxies(self, use_socks=False):
        """Build the ``proxies`` mapping for a ``requests`` session.

        Args:
            use_socks: use the SOCKS5 port/protocol instead of plain HTTP.

        Returns:
            ``{"http": url, "https": url}``. ``requests`` selects a proxy by
            the scheme of the *target* URL, so both schemes must be present
            for HTTPS traffic to go through the proxy.

        Raises:
            ValueError: if any required setting is missing or empty.
        """
        protocol = "socks5" if use_socks else "http"
        settings = {
            "username": self.proxy_base["username"],
            "password": self.proxy_base["password"],
            "host": self.proxy_base["host"],
            f"{protocol} port": self.proxy_base["ports"][protocol],
        }
        missing = [name for name, value in settings.items() if not value]
        if missing:
            # Without this check the URL would silently contain the text "None".
            raise ValueError(f"missing proxy setting(s): {', '.join(missing)}")

        # The fixed "__cr.kr" suffix is kept from the original configuration
        # (presumably a provider routing tag -- confirm with the proxy vendor).
        # Credentials are percent-encoded so characters such as "@" or ":"
        # cannot corrupt the URL; requests decodes them again before use.
        proxy_auth = quote(f"{settings['username']}__cr.kr", safe="")
        password = quote(str(settings["password"]), safe="")
        host = settings["host"]
        port = settings[f"{protocol} port"]
        proxy_url = f"{protocol}://{proxy_auth}:{password}@{host}:{port}"

        return {"http": proxy_url, "https": proxy_url}

    def get_proxy_config(self, use_socks=False):
        """Create the proxy configuration.

        Args:
            use_socks: use SOCKS5 instead of HTTP (NOTE: requests needs the
                optional SOCKS support installed for this to work).

        Returns:
            A mapping suitable for ``session.proxies.update(...)``, or
            ``None`` when the configuration is incomplete or invalid. This
            method never raises; failures are logged.
        """
        try:
            proxies = self._build_proxies(use_socks)
        except Exception as e:
            logger.error(f"[PROXY] Configuration failed: {str(e)}")
            return None

        protocol = "socks5" if use_socks else "http"
        host = self.proxy_base["host"]
        port = self.proxy_base["ports"][protocol]
        # Credentials are deliberately left out of the log line.
        logger.info(f"[PROXY] Configuration created: {protocol}://{host}:{port}")
        return proxies
|
|
|
def setup_session():
    """Create the hardened ``requests`` session used for crawling.

    The session gets, in order: the proxy from ``ProxyConfig`` (when one is
    configured, followed by a best-effort outbound-IP check that is only
    logged), browser-like request headers, and an adapter that retries
    idempotent requests on 5xx responses.

    Returns:
        requests.Session: the configured session. The caller owns it and
        should close it when done.
    """
    session = requests.Session()

    proxy_config = ProxyConfig()
    proxies = proxy_config.get_proxy_config(use_socks=False)
    if proxies:
        session.proxies.update(proxies)
        try:
            # Diagnostic only: log which public IP the session is seen as.
            ip_response = session.get('https://api.ipify.org?format=json', timeout=10)
            if ip_response.status_code == 200:
                logger.info(f"[PROXY] Current IP: {ip_response.json().get('ip')}")
            else:
                logger.warning(f"[PROXY] Failed to get IP. Status code: {ip_response.status_code}")
        except Exception as e:
            # A failed probe must not prevent crawling.
            logger.error(f"[PROXY] IP check failed: {str(e)}")
    else:
        logger.warning("[PROXY] No proxy configuration available")

    retries = Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[500, 502, 503, 504],
        allowed_methods=["GET", "HEAD", "OPTIONS"]
    )

    # 'Accept-Encoding' is intentionally NOT overridden here: requests already
    # advertises exactly the encodings it can decode. Forcing "br" when the
    # optional brotli package is absent would yield undecodable bodies.
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
        'Connection': 'keep-alive',
        'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'max-age=0',
        'DNT': '1'
    })

    adapter = HTTPAdapter(
        max_retries=retries,
        pool_connections=100,
        pool_maxsize=100
    )
    session.mount('https://', adapter)
    session.mount('http://', adapter)

    return session
|
|
|
def get_base_url(board_select):
    """Return the article-list URL of a predefined board.

    Args:
        board_select: board name as offered by the UI radio group.

    Returns:
        The list URL for that board, or the sentinel string
        ``"Invalid board selected"`` when the name is unknown.
    """
    # Every board URL has the same shape; only the cafe (club) id and the
    # menu id differ.
    url_template = (
        "https://cafe.naver.com/ArticleList.nhn?search.clubid={club}"
        "&search.menuid={menu}&search.boardtype=L&userDisplay=50"
        "&search.specialmenutype=&search.totalCount=501&search.cafeId={club}"
    )
    board_ids = {
        "λ§μ΄λ² λ² ": ("29434212", "2"),
        "λ§μ€νλ¦": ("10094499", "599"),
        "κ΄μ£Όλ§": ("26025763", "508"),
        "μΌνμ§λ¦μ ": ("25729954", "751"),
        "λΆμ°λ§": ("28707025", "282"),
        "μ§ν¬λ§": ("21442290", "476"),
    }

    ids = board_ids.get(board_select)
    if ids is None:
        logger.warning(f"Invalid board selected: {board_select}")
        return "Invalid board selected"

    club_id, menu_id = ids
    return url_template.format(club=club_id, menu=menu_id)
|
|
|
def convert_views(view_string):
    """Convert a displayed view count to an integer.

    Accepts plain counts with optional thousands separators (``"1,234"``)
    and counts abbreviated with the Korean ten-thousand marker ``만``
    (``"1.2만"`` -> ``12000``). Surrounding whitespace is ignored.

    Args:
        view_string: the view-count text taken from the page.

    Returns:
        int: the numeric view count.

    Raises:
        ValueError: if the text is empty or not a number.
    """
    # NOTE: the marker literal had been corrupted by an encoding mix-up and
    # could never match text scraped from the page; it is restored here.
    ten_thousand = '만'

    text = view_string.strip().replace(",", "")
    if ten_thousand in text:
        number_part = text.replace(ten_thousand, '').strip()
        # round() rather than int(): the float product can land a hair below
        # the intended integer, which truncation would turn into N-1.
        return round(float(number_part) * 10000)
    return int(text)
|
|
|
def validate_row_data(row_data):
    """Check that a table row contains every cell an article row needs.

    Args:
        row_data: a parsed row element exposing ``find(name, class_=...)``.

    Returns:
        bool: True only if the view-count, like-count and date cells
        (``td_view``, ``td_likes``, ``td_date``) are all present.
    """
    needed_cells = ('td_view', 'td_likes', 'td_date')
    return all(row_data.find('td', class_=css_class) for css_class in needed_cells)
|
|
|
# Column titles written to the first row of the Excel sheet.
_RESULT_COLUMNS = ['μ λͺ©', 'μμ±μΌ', 'μ‘°νμ', 'μ’μμ', 'λκΈμ']

# Seconds to wait for one board page; without a timeout a stalled connection
# would block the request forever.
_PAGE_TIMEOUT = 10

# Stylesheet plus the opening of the result table (up to and including <tbody>).
_RESULT_TABLE_OPEN = """
<style>
    .crawl-table {
        width: 100%;
        border-collapse: collapse;
        margin: 10px 0;
        font-family: 'Pretendard', -apple-system, BlinkMacSystemFont, system-ui, Roboto, sans-serif;
    }
    .crawl-table thead th,
    .crawl-table tr:first-child th {
        background-color: #000000;
        color: #ffffff;
        border: 1px solid #dee2e6;
        padding: 12px 8px;
        font-weight: 600;
        vertical-align: middle;
        text-align: center !important;
    }
    .crawl-table td {
        border: 1px solid #dee2e6;
        padding: 10px 8px;
        line-height: 1.4;
    }
    .crawl-table td:first-child {
        text-align: left;
    }
    .crawl-table td:nth-child(2),
    .crawl-table td:nth-child(3),
    .crawl-table td:nth-child(4),
    .crawl-table td:nth-child(5) {
        text-align: right;
    }
    .crawl-table td:first-child a {
        text-decoration: none;
        color: #0066cc;
    }
    .crawl-table tr:nth-child(even) {
        background-color: #f8f9fa;
    }
    .crawl-table tr:hover {
        background-color: #f0f0f0;
    }
    @media (max-width: 768px) {
        .crawl-table {
            font-size: 14px;
        }
        .crawl-table th,
        .crawl-table td {
            padding: 8px 4px;
        }
    }
</style>
<table class="crawl-table">
    <thead>
        <tr>
            <th>μ λͺ©</th>
            <th>μμ±μΌ</th>
            <th>μ‘°νμ</th>
            <th>μ’μμ</th>
            <th>λκΈμ</th>
        </tr>
    </thead>
    <tbody>
"""

# Closing counterpart of _RESULT_TABLE_OPEN.
_RESULT_TABLE_CLOSE = """    </tbody>
</table>"""


def _parse_article_row(row_data, today):
    """Turn one board ``<tr>`` into an article dict, or None if it is not an article row."""
    if not validate_row_data(row_data):
        return None
    a_tag = row_data.find('a', class_='article')
    if a_tag is None:
        return None

    date = row_data.find('td', class_='td_date').get_text(strip=True)
    if ":" in date:
        # A value containing ":" is replaced by today's date (presumably the
        # board shows only a time of day for same-day posts -- confirm).
        date = today

    comments = 0
    comment_tag = row_data.find('a', class_='cmt')
    comment_count = comment_tag.find('em') if comment_tag else None
    if comment_count is not None:
        comments = int(comment_count.get_text(strip=True).replace(",", ""))

    return {
        "title": a_tag.get_text(strip=True),
        "url": f"https://cafe.naver.com{a_tag['href']}",
        "date": date,
        "views": convert_views(row_data.find('td', class_='td_view').get_text(strip=True)),
        "likes": int(row_data.find('td', class_='td_likes').get_text(strip=True).replace(",", "")),
        "comments": comments,
    }


def _fetch_page_articles(session, base_url, p, today):
    """Download list page ``p`` and return its parsed articles (possibly empty)."""
    url = f"{base_url}&search.page={p}"
    logger.info(f"[CRAWL] Fetching page {p}: {url}")
    response = session.get(url, timeout=_PAGE_TIMEOUT)
    # Randomised pause between requests, as in the original crawl loop.
    time.sleep(random.uniform(0.5, 1.0))

    if response.status_code != 200:
        logger.error(f"[CRAWL] Failed to fetch page {p}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    article_boards = soup.find_all('div', class_='article-board m-tcol-c')
    # The second matching container is the one that is crawled.
    if len(article_boards) < 2:
        logger.warning(f"[CRAWL] No article boards found on page {p}")
        return []

    rows = article_boards[1].find_all('tr')
    logger.info(f"[CRAWL] Found {len(rows)} rows on page {p}")

    articles = []
    for row_data in rows:
        try:
            article = _parse_article_row(row_data, today)
        except (AttributeError, KeyError, ValueError) as e:
            # One malformed row (missing href, non-numeric count, ...) must
            # not discard the remaining rows of the page.
            logger.warning(f"[CRAWL] Row parsing error: {str(e)}")
            continue
        if article is not None:
            articles.append(article)
    return articles


def _render_html_row(article):
    """Render one article as a ``<tr>``; all scraped text is HTML-escaped."""
    link = html.escape(article["url"], quote=True)
    title = html.escape(article["title"])
    date = html.escape(article["date"])
    views, likes, comments = article["views"], article["likes"], article["comments"]
    return f"""        <tr>
            <td><a href='{link}' target='_blank' rel='noopener noreferrer'>{title}</a></td>
            <td>{date}</td>
            <td>{views:,}</td>
            <td>{likes:,}</td>
            <td>{comments:,}</td>
        </tr>
"""


def extract_data_to_excel_and_html(page, board_select, custom_url=""):
    """Crawl a board's article list into an Excel file and an HTML table.

    Args:
        page: number of list pages to crawl, 1-50. A float is truncated to
            an integer (UI number inputs may deliver floats).
        board_select: a predefined board name, or the direct-input choice.
        custom_url: list URL used when the direct-input choice is selected
            and this value is non-empty.

    Returns:
        tuple: ``(filename, html_table)`` on success, where ``filename`` is
        the ``.xlsx`` file written to the working directory; or
        ``(None, html_message)`` when the input is invalid or crawling
        fails. This function does not raise.
    """
    # `not (1 <= page <= 50)` also rejects NaN, which passes `<`/`>` checks.
    if not isinstance(page, (int, float)) or not (1 <= page <= 50):
        return None, "<p>νμ΄μ§ μλ 1-50 μ¬μ΄μ¬μΌ ν©λλ€.</p>"
    page_count = int(page)

    try:
        # One KST timestamp drives both the filename and the "today" date so
        # they agree regardless of the server's local timezone.
        now_kst = datetime.now(pytz.timezone("Asia/Seoul"))
        timestamp = now_kst.strftime("%Y%m%d_%H%M%S")
        today = now_kst.strftime("%Y.%m.%d")

        url_input = (custom_url or "").strip()
        if board_select == "μ§μ μλ ₯" and url_input:
            # This new-style menu URL is mapped to its classic list URL.
            if url_input.startswith("https://cafe.naver.com/f-e/cafes/25729954/menus/186"):
                url_input = "https://cafe.naver.com/ArticleList.nhn?search.clubid=25729954&search.menuid=751&search.boardtype=L&userDisplay=50&search.specialmenutype=&search.totalCount=501&search.cafeId=25729954"
            # NOTE(review): custom_url is fetched as given; consider
            # restricting it to cafe.naver.com if the app is exposed publicly.
            base_url = url_input
            filename = f'custom_{timestamp}.xlsx'
        else:
            base_url = get_base_url(board_select)
            if base_url == "Invalid board selected":
                # Report like every other failure: no file, message as HTML.
                return None, "<p>Invalid board selected</p>"
            filename = f'{board_select}_{timestamp}.xlsx'

        html_parts = [_RESULT_TABLE_OPEN]
        session = setup_session()
        try:
            # The context manager closes (and thereby writes) the workbook
            # even if an error escapes the crawl loop.
            with xlsxwriter.Workbook(filename) as workbook:
                worksheet = workbook.add_worksheet()

                header_format = workbook.add_format({
                    'bold': True,
                    'align': 'center',
                    'valign': 'vcenter',
                    'bg_color': '#f8f9fa',
                    'border': 1
                })
                link_format = workbook.add_format({
                    'align': 'left',
                    'valign': 'vcenter',
                    'border': 1,
                    'font_color': '#0066cc',
                    'underline': True
                })
                date_format = workbook.add_format({
                    'align': 'center',
                    'valign': 'vcenter',
                    'border': 1
                })
                number_format = workbook.add_format({
                    'align': 'center',
                    'valign': 'vcenter',
                    'border': 1,
                    'num_format': '#,##0'
                })

                for col, header in enumerate(_RESULT_COLUMNS):
                    worksheet.write(0, col, header, header_format)
                worksheet.autofilter(0, 0, 0, len(_RESULT_COLUMNS) - 1)

                row = 1  # next free sheet row; row 0 holds the headers
                for p in range(1, page_count + 1):
                    try:
                        for article in _fetch_page_articles(session, base_url, p, today):
                            worksheet.write_url(row, 0, article["url"], link_format, article["title"])
                            worksheet.write(row, 1, article["date"], date_format)
                            worksheet.write_number(row, 2, article["views"], number_format)
                            worksheet.write_number(row, 3, article["likes"], number_format)
                            worksheet.write_number(row, 4, article["comments"], number_format)
                            html_parts.append(_render_html_row(article))
                            row += 1
                    except Exception as e:
                        # A failing page is logged and skipped; the crawl goes on.
                        logger.error(f"[CRAWL] Page {p} crawling error: {str(e)}")
                        continue

                worksheet.set_column(0, 0, 50)
                worksheet.set_column(1, 1, 12)
                worksheet.set_column(2, 2, 10)
                worksheet.set_column(3, 3, 10)
                worksheet.set_column(4, 4, 10)
        finally:
            session.close()

        html_parts.append(_RESULT_TABLE_CLOSE)
        return filename, "".join(html_parts)

    except Exception as e:
        # The exception text is escaped because it ends up inside HTML.
        error_message = f"λ°μ΄ν° μμ§ μ€ μ€λ₯κ° λ°μνμ΅λλ€. μ μ ν λ€μ μλν΄ μ£ΌμΈμ. (μλ¬: {html.escape(str(e))})"
        logger.error(f"[CRAWL] μ 체 ν¬λ‘€λ§ μ€ν¨: {str(e)}")
        return None, f"<p style='color: #dc3545; padding: 10px; background-color: #f8d7da; border-radius: 4px;'>{error_message}</p>"
|
|
|
def crawl_with_progress(board, pages, custom_url):
    """Button handler: run the crawl and map its result to the UI outputs.

    Args:
        board: selected board name.
        pages: number of pages to crawl.
        custom_url: URL typed by the user for the direct-input choice.

    Returns:
        tuple: ``(excel_path_or_None, html, status_text)`` matching the
        file, HTML and status outputs. Never raises.
    """
    try:
        excel_file, html_output = extract_data_to_excel_and_html(pages, board, custom_url)
    except Exception as e:
        return None, "", f"μ€λ₯ λ°μ: {str(e)}"

    # Only hand a path to the file output if it really exists on disk; a
    # non-file value there would make the download component fail.
    if excel_file and os.path.isfile(excel_file):
        return excel_file, html_output, "μμ§ μλ£"

    # Keep the HTML on failure: it carries the reason shown to the user.
    return None, html_output or "", "μμ§ μ€ν¨"
|
|
|
def update_custom_url_visibility(selected):
    """Show the custom-URL textbox only while the direct-input choice is selected.

    Args:
        selected: the currently selected radio value.

    Returns:
        A Gradio update that toggles the textbox's ``visible`` flag.
    """
    show_textbox = selected == "μ§μ μλ ₯"
    return gr.update(visible=show_textbox)
|
|
|
# Custom stylesheet passed to gr.Blocks(css=...). It restyles the Gradio
# container, heading, form controls, button, file area and result table, and
# adds a narrow-screen (<= 768px) layout.
css = """
/* μ 체 컨νμ΄λ μ€νμΌλ§ */
.gradio-container {
    font-family: 'Pretendard', -apple-system, BlinkMacSystemFont, system-ui, Roboto, sans-serif !important;
    max-width: 1000px !important;
    margin: 2rem auto !important;
    padding: 2rem !important;
    background-color: #ffffff !important;
    box-shadow: 0 1px 3px rgba(0,0,0,0.12), 0 1px 2px rgba(0,0,0,0.24) !important;
    border-radius: 12px !important;
}
/* μ λͺ© μ€νμΌλ§ */
h1 {
    font-size: 2.2rem !important;
    font-weight: 700 !important;
    color: #000000 !important;
    text-align: center !important;
    margin-bottom: 2rem !important;
    padding-bottom: 1.5rem !important;
    border-bottom: 2px solid #000000 !important;
}
/* μ€λͺ νμ€νΈ μ€νμΌλ§ */
.gr-markdown {
    text-align: center !important;
    color: #666666 !important;
    font-size: 1rem !important;
    margin-bottom: 2rem !important;
}
/* λΌλμ€ λ²νΌ κ·Έλ£Ή μ€νμΌλ§ */
.gr-form {
    background-color: #f8f8f8 !important;
    padding: 1.5rem !important;
    border-radius: 8px !important;
    margin-bottom: 1.5rem !important;
}
.gr-radio-row {
    display: grid !important;
    grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)) !important;
    gap: 1rem !important;
    padding: 1rem !important;
}
.gr-radio {
    border: 2px solid #000000 !important;
    padding: 0.8rem !important;
    border-radius: 6px !important;
    transition: all 0.3s ease !important;
}
.gr-radio:checked {
    background-color: #000000 !important;
    color: #ffffff !important;
}
/* μ«μ μλ ₯ νλ μ€νμΌλ§ */
.gr-number-input {
    border: 2px solid #000000 !important;
    border-radius: 6px !important;
    padding: 0.8rem !important;
    font-size: 1rem !important;
    width: 100% !important;
    max-width: 300px !important;
    margin: 0 auto !important;
}
/* μν νμ€νΈλ°μ€ μ€νμΌλ§ */
.gr-textbox {
    background-color: #f8f8f8 !important;
    border: 1px solid #e0e0e0 !important;
    border-radius: 6px !important;
    padding: 1rem !important;
    margin: 1rem 0 !important;
    font-size: 0.95rem !important;
}
/* μμ§ λ²νΌ μ€νμΌλ§ */
.gr-button {
    background-color: #000000 !important;
    color: #ffffff !important;
    padding: 1rem 2rem !important;
    border-radius: 6px !important;
    font-weight: 600 !important;
    font-size: 1.1rem !important;
    border: none !important;
    width: 100% !important;
    max-width: 300px !important;
    margin: 1.5rem auto !important;
    display: block !important;
    transition: all 0.3s ease !important;
}
.gr-button:hover {
    background-color: #333333 !important;
    transform: translateY(-2px) !important;
    box-shadow: 0 4px 6px rgba(0,0,0,0.1) !important;
}
/* νμΌ λ€μ΄λ‘λ μμ μ€νμΌλ§ */
.gr-file {
    border: 2px dashed #000000 !important;
    border-radius: 8px !important;
    padding: 2rem !important;
    text-align: center !important;
    background-color: #f8f8f8 !important;
    margin-top: 2rem !important;
}
/* HTML κ²°κ³Ό νμ΄λΈ μ€νμΌλ§ */
table {
    width: 100% !important;
    border-collapse: collapse !important;
    margin-top: 1.5rem !important;
    border-radius: 8px !important;
    overflow: hidden !important;
    box-shadow: 0 1px 3px rgba(0,0,0,0.12) !important;
}
th {
    background-color: #000000 !important;
    color: #ffffff !important;
    padding: 1rem !important;
    text-align: center !important;
    font-weight: 600 !important;
}
td {
    padding: 0.8rem !important;
    border-bottom: 1px solid #e0e0e0 !important;
    color: #333333 !important;
}
tr:hover {
    background-color: #f5f5f5 !important;
}
/* λ°μν λμμΈ */
@media (max-width: 768px) {
    .gradio-container {
        padding: 1rem !important;
        margin: 1rem !important;
    }

    h1 {
        font-size: 1.8rem !important;
    }

    .gr-radio-row {
        grid-template-columns: 1fr !important;
    }
}
"""
|
|
|
# Gradio UI: board picker, page count, optional custom URL, status line,
# start button, and the download / HTML result outputs.
with gr.Blocks(css=css) as demo:
    gr.Markdown("# Nμ¬ Cafe ν«λ κ²μν ν¬λ‘€λ§")
    gr.Markdown("""
    νμ΄μ§ μλ₯Ό μλ ₯νκ±°λ, κ²μν μ ν μ 'μ§μ μλ ₯'μ μ ννλ©΄ μ§μ URLμ μλ ₯ν μ μμ΅λλ€.
    (μ΅λ νμ΄μ§μλ 50νμ΄μ§ μλλ€.)
    """)

    with gr.Row():
        board_select = gr.Radio(
            choices=["λ§μ΄λ² λ² ", "λ§μ€νλ¦", "κ΄μ£Όλ§", "μΌνμ§λ¦μ ", "λΆμ°λ§", "μ§ν¬λ§", "μ§μ μλ ₯"],
            label="κ²μνμ μ ννμΈμ",
            container=True
        )

    with gr.Row():
        inp = gr.Number(
            label="μμ§ν νμ΄μ§ μ (μ΅λ 50νμ΄μ§)",
            value=1,
            minimum=1,
            maximum=50,
            # Round to a whole number and deliver an int: the crawler loops
            # over range(1, pages + 1), which cannot take a fractional float.
            precision=0,
            container=True
        )

    with gr.Row():
        custom_url = gr.Textbox(
            label="μ§μ λ§ν¬ μλ ₯ (μ΅μ)",
            placeholder="μ: https://cafe.naver.com/ArticleList.nhn?...",
            visible=False,
            container=True
        )

    # The custom-URL box is revealed only for the direct-input choice.
    board_select.change(fn=update_custom_url_visibility, inputs=board_select, outputs=custom_url)

    status = gr.Textbox(
        label="μν",
        value="λκΈ° μ€...",
        container=True
    )

    btn = gr.Button("μμ§νκΈ°", variant="primary")

    output_file = gr.File(label="μμ νμΌ λ€μ΄λ‘λ")
    output_html = gr.HTML()

    # Input order must match crawl_with_progress(board, pages, custom_url).
    btn.click(
        fn=crawl_with_progress,
        inputs=[board_select, inp, custom_url],
        outputs=[output_file, output_html, status]
    )

if __name__ == "__main__":
    demo.launch()
|
|