# Page header captured together with this source: "Spaces: Runtime error".
import html
from io import BytesIO

import cv2
import mediapipe as mp
import numpy as np
import pandas as pd
import requests
import torch
from joblib import load
from PIL import Image
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from youtube_transcript_api import YouTubeTranscriptApi
# Load the prediction model and its companion artifacts.
# NOTE(review): joblib.load unpickles arbitrary objects; only load artifact
# files that ship with this project, never user-supplied ones.
model = load('view_predictor.joblib')
# The encoder file holds three objects; only the third (category) is used here.
_, _, le_cat = load('label_encoders.joblib')
feature_cols = load('features.joblib')

# Sentiment classifier applied to video titles; eval() puts it in inference mode.
senti_model_name = "nlp04/korean_sentiment_analysis_kcelectra"
senti_tokenizer = AutoTokenizer.from_pretrained(senti_model_name)
senti_model = AutoModelForSequenceClassification.from_pretrained(senti_model_name)
senti_model.eval()
def sentiment_score(text):
    """Return a 0-100 sentiment score for *text*.

    Empty, ``None`` or NaN input yields ``0.0`` without running the model.
    Otherwise the text is tokenized with truncation enabled, passed through
    ``senti_model`` with gradients disabled, and the softmax probability of
    class index 2 is returned as a percentage rounded to one decimal place.
    If the model exposes fewer than three classes, the last available class
    is used instead.

    NOTE(review): index 2 is assumed to be the "positive" label -- confirm
    against the model's ``id2label`` mapping.
    """
    if not text or pd.isna(text):
        return 0.0

    with torch.no_grad():
        inputs = senti_tokenizer(text, return_tensors="pt", truncation=True)
        logits = senti_model(**inputs).logits

    # Index the single batch row rather than calling squeeze(): squeeze()
    # turns a one-class output into a 0-d tensor that cannot be indexed.
    probs = torch.softmax(logits, dim=1)[0]
    class_idx = min(2, probs.numel() - 1)
    return round(float(probs[class_idx]) * 100, 1)
# Maps each user-facing category name to the title keywords that select it.
# Dict order matters: classify_by_keywords returns the first category with a
# matching keyword.
# NOTE(review): the string contents are kept exactly as they appear in this
# copy of the file; they look mis-encoded and should be checked against the
# original UTF-8 source.
category_dict = {
    '์์': ['์ฏ์', '์ฐจ๋ฐฅ์ด๋ผ', '๋จน๋ฐฉ', '๋จ๊ณจ', '์์นจ', '์ฅ์ฌ', '๋ง๋ค๊ธฐ', '์นผ๋ก๋ฆฌ', '๋ฒ ์ด๊ธ', '๊ณฑ์ฐฝ', '์คํ ์ดํฌ', '๊ณ ๊ธฐ',
             '์ผ๊ฒน์ด', '์ฑ์ฌ๋น', 'ํธ์์ ', '์ด์์', '๋ผ๋ฉด', '๊น๋ฐฅ', '์นํจ', '๋ง์ง', '์ง๋ฐฅ', '๋ก๋ณถ์ด', '์์', '๊น์น',
             '๊ด์ด', '๋ง๋', '๋๋ฉด', '์ฒ ํ', '๋ผ์ง', '์๋ฆฌ', '๊ฐ์', 'ํ์', '์ ์๋ฆฌ', '๋ ์ํผ', '๊น์น์ฐ๊ฐ'],
    '์ฐ์/์ ๋ช ์ธ': ['์ตํ์ ', '์ดํด๋ฆฌ', '๊ฐ๊ทธ๋งจ', '๊ฐ๋ฏผ๊ฒฝ', '๋ค๋น์น', '์ด์งํ', '์ฌ์', '์์ด๋', '๋ค๋์นด', '์ ๋',
                     '์ ์ฌ์', 'ํ๊ณ๊ณ ', '์กฐ์ธํธ', '์ฅ์๋', '๊น๊ตฌ๋ผ', '๊น์์ฒ ', '์ฐ์์ธ', '๋ฐฐ์ฐ', '์คํ', '์ถ์ฐ', '์ญ์ธ',
                     '๊ฐ์', '๋ ธ๋', '์ฝ์ํธ', '์ด์น์ฒ '],
    '๊ต์ก/๊ณต๋ถ': ['์ผ์ฐจ๋ฐฉ์ ์', '์ด์ฐจ๋ฐฉ์ ์', '๋ฎ์', '์ธ์๋ถํด', '์ง์', '๋ง์ถค๋ฒ', 'ํ๊ตญ์ฌ', '๊ณผํ', '๊ณผ์ธ', '์ํ',
                   '์์ ', '๊ณต๋ถ', '์ญ์ฌ', '๊ณต๋ถ์', '์๋ฅ', 'ํด์ฆ', '์คํฐ๋', '์ ์๋', '์ํ', '์ง์', '๋ฌธ์ ',
                   '์ผ์ฐจํจ์', '์ด์ฐจํจ์', '๋ฐฉ์ ์', '๊ฒ์ ๊ณ ์', '์์ด', '๊ตญ์ด', 'ํ๊ตญ์ด', '์์ธ๋'],
    '์ฌํ/์ฅ์': ['๋๋ฐ์ด', 'ํด๊ฐ', '์ ๊ตญ', '์ฌํ', 'ํฌ์ด', '์ธ๊ณ', '์งํ์ฒ ', 'ํ๊ฐ', '์นดํ', '์ฝ์ค', 'ํ์์ด',
                   '๋์ฟ', '๋ชฝ๊ณจ', '์ผ๋ณธ', '์ค์ฌ์นด', '์ ์ฃผ', '์ ์ฃผ', '์ ์ฃผ๋', '์์ธ', '๋ฏผ๋ฐ', '๋ฏธ๊ตญ', '๋๋ง',
                   'ํ๋ฆฌ', '์คํ์ธ', '์ธ๋ฆ๋', 'ํ์ฝฉ'],
    '์ผ์/๊ฐ์กฑ': ['๊ฐ์กฑ', '์๋ง', '์๋น ', '๋จํธ', '์์', '๋ชจ๋ ', 'ํผ์', 'ํ๋ฃจ', '์ผ์', '์ฌ๋', '์์ด', '๊ณต์ ',
                   'ํ์ฅ', '๋ถ๋ถ', '๊ฐ์ฅ', '์ด๋จธ๋', '์กฐ์นด', '๊ฐ์', '์๋ค', '๊ฒฐํผ์'],
    '์ฝํ ์ธ /์ ํ๋ธ': ['์๋ฅ', '์์ฆ', '๋ฆฌ๋ทฐ', '๋ผ์ด๋ธ', '๋ฐฉ์ก', '์์', '์ฑ๋', '๊ฒ์', '์ ํ๋ธ', '์๋ฐฉ์ก',
                         '์ดฌ์', '์ฝํ ์ธ ', '๋๊ธ', '์ผํ'],
    '์ ์น': ['๋์ ', '๊ณต์ฝ', '์์ฒ ์', '๊ตญํ', '์ ์น', '๋ํต๋ น', '์ ๊ฑฐ', '์ ๋น', '์์'],
    '๊ฒฝ์ ': ['์ฃผ์', '๋นํธ์ฝ์ธ', '์ฝ์ธ', '์ ๋ฌผ', '๋ถ์', 'ํฌ์', '๊ฒฝ์ ', '๊ธ์ต', '๊ด๊ณ ', '๋์ถ', '์ํ', '์์ฅ'],
    '๊ฑด๊ฐ/์ด๋': ['์ด๋', '๊ฑด๊ฐ', '๋ค์ด์ดํธ', 'ํฌ์ค', '์คํธ๋ ์นญ', '์๊ฐ', '์ฒด๋ ฅ', 'ํผํธ๋์ค', '๋ฌ๋ฆฌ๊ธฐ', '๊ทผ๋ ฅ', '์๋จ'],
    '์ธ๊ฐ๊ด๊ณ/๊ณ ๋ฏผ': ['์ฐ์ ', '๊ณ ๋ฐฑ', '์๊ฐํ ', '๋ฐ์ดํธ', '์๋ก', '๊ณ ๋ฏผ'],
}
# Keyword-based title classification.
def classify_by_keywords(title, keyword_dict):
    """Return the first category in *keyword_dict* with a keyword found in *title*.

    Args:
        title: Video title to inspect. ``None`` or an empty string never matches.
        keyword_dict: Mapping of category name to an iterable of keyword
            strings. Categories are tried in the mapping's iteration order.

    Returns:
        The matching category name, or ``None`` when no keyword is a
        substring of *title*.
    """
    if not title:
        # Guard: ``keyword in None`` would raise TypeError.
        return None
    for category, keywords in keyword_dict.items():
        if any(keyword in title for keyword in keywords):
            return category
    return None
# Combine the YouTube category with title keywords to pick a user category.
def map_category(category_id, title, api_key):
    """Resolve the user-facing category for a video.

    The YouTube category title is looked up through the Data API and mapped
    to a user category; independently, *title* is matched against
    ``category_dict``. A keyword match wins over the mapped category, and the
    fallback label is returned when neither yields a result.

    Args:
        category_id: YouTube ``categoryId`` of the video.
        title: Video title used for keyword classification.
        api_key: YouTube Data API key.

    Returns:
        The chosen category name (a string).
    """
    fallback = "๊ธฐํ"

    # Passing the query through ``params`` keeps the separators intact; the
    # hand-built URL had its "&regionCode" parameter corrupted into
    # "®ionCode", which silently merged it into the ``id`` value.
    # NOTE(review): the map below is keyed on non-English titles; confirm the
    # API returns localized titles for this request (an ``hl`` parameter may
    # be needed).
    try:
        response = requests.get(
            'https://www.googleapis.com/youtube/v3/videoCategories',
            params={
                'part': 'snippet',
                'id': category_id,
                'regionCode': 'KR',
                'key': api_key,
            },
            timeout=10,
        )
        yt_category = response.json()['items'][0]['snippet']['title']
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Best effort: any network, decoding or payload-shape problem falls
        # back to the generic label instead of aborting the prediction.
        yt_category = fallback

    # YouTube category title -> user category.
    category_map = {
        "์ํ/์ ๋๋ฉ์ด์ ": "์ฝํ ์ธ /์ ํ๋ธ",
        "์์ ": "์ฐ์/์ ๋ช ์ธ",
        "์ํฐํ ์ธ๋จผํธ": "์ฝํ ์ธ /์ ํ๋ธ",
        "์ฝ๋ฏธ๋": "์ฝํ ์ธ /์ ํ๋ธ",
        "์ธ๋ฌผ/๋ธ๋ก๊ทธ": "์ฐ์/์ ๋ช ์ธ",
        "๊ฒ์": "์ฝํ ์ธ /์ ํ๋ธ",
        "๋ ธํ์ฐ/์คํ์ผ": "์ผ์/๊ฐ์กฑ",
        "๋ด์ค/์ ์น": "์ ์น",
        "๊ต์ก": "๊ต์ก/๊ณต๋ถ",
        "๊ณผํ/๊ธฐ์ ": "๊ต์ก/๊ณต๋ถ",
        "์คํฌ์ธ ": "๊ฑด๊ฐ/์ด๋",
        "์๋์ฐจ": "๊ธฐํ",
        "๋๋ฌผ": "๊ธฐํ",
        "์ฌํ": "์ฌํ/์ฅ์",
    }
    mapped_category = category_map.get(yt_category)

    # Keyword match on the title takes priority over the API-derived category.
    keyword_category = classify_by_keywords(title, category_dict)
    return keyword_category or mapped_category or fallback
def hue_to_color_group(hue_value):
    """Map a hue angle in degrees to a named colour group.

    Bands are half-open ``[lower, upper)``. Values from 345 upward share the
    first band's label (red wraps around the colour wheel). Negative or NaN
    input returns the generic fallback label.
    """
    # ``not (x >= 0)`` is true for negatives and for NaN, which compares
    # false against everything.
    if not hue_value >= 0:
        return "๊ธฐํ"

    # (exclusive upper bound, label), ascending.
    bands = (
        (15, "๋นจ๊ฐ ๊ณ์ด"),
        (45, "์ฃผํฉ/๋ ธ๋ ๊ณ์ด"),
        (75, "์ฐ๋/์ด๋ก ๊ณ์ด"),
        (165, "์ด๋ก/ํ๋ ๊ณ์ด"),
        (255, "ํ๋/๋จ์ ๊ณ์ด"),
        (285, "๋ณด๋ผ ๊ณ์ด"),
        (345, "๋ถํ ๊ณ์ด"),
    )
    for upper, label in bands:
        if hue_value < upper:
            return label
    # 345 and above wraps back to the first band.
    return bands[0][1]
def analyze_thumbnail(thumbnail_url):
    """Download a thumbnail and derive colour and face features from it.

    Args:
        thumbnail_url: URL of the thumbnail image.

    Returns:
        A tuple ``(hue_group, face_count, hue_avg)``: the colour-group label
        from ``hue_to_color_group``, the number of faces detected, and the
        mean hue in degrees.

    Raises:
        requests.RequestException: If the download fails or returns an HTTP
            error status.
    """
    response = requests.get(thumbnail_url, timeout=10)
    # Fail on HTTP errors here rather than while decoding an error page as an image.
    response.raise_for_status()
    img = Image.open(BytesIO(response.content)).convert('RGB')
    img_np = np.array(img)

    # OpenCV stores 8-bit hue as degrees / 2, hence the factor of two.
    # NOTE(review): an arithmetic mean of hue ignores the wrap-around at red;
    # left as is so the feature matches what the model was fitted on.
    hsv = cv2.cvtColor(img_np, cv2.COLOR_RGB2HSV)
    hue_avg = int(np.mean(hsv[:, :, 0]) * 2)

    # Face detection. MediaPipe expects an RGB array, which ``img_np``
    # already is; the earlier RGB->BGR conversion fed it swapped channels.
    # NOTE(review): if training features were extracted with the swapped
    # channels, regenerate them so training and serving agree.
    mp_face = mp.solutions.face_detection
    with mp_face.FaceDetection(model_selection=1, min_detection_confidence=0.3) as fd:
        results = fd.process(img_np)
    face_count = len(results.detections) if results.detections else 0

    return hue_to_color_group(hue_avg), face_count, hue_avg
def predict_views(video_id, api_key):
    """Fetch a video's metadata, build its feature row and predict its views.

    Args:
        video_id: YouTube video id.
        api_key: YouTube Data API key.

    Returns:
        A dict holding the title, predicted and actual view counts, the
        derived features (category, hour, weekday, manual subtitle count,
        face count, sentiment score, hue group and value) and the thumbnail
        URL.

    Raises:
        IndexError: If the API returns no item for *video_id*.
        KeyError: If the API response lacks the expected fields.
    """
    response = requests.get(
        'https://www.googleapis.com/youtube/v3/videos',
        params={'part': 'snippet,statistics', 'id': video_id, 'key': api_key},
        timeout=10,
    )
    items = response.json()['items']
    if not items:
        raise IndexError(f"no video found for id {video_id!r}")
    item = items[0]
    snippet = item['snippet']

    title = snippet['title']
    published_at = snippet['publishedAt']
    category_id = snippet.get('categoryId', '')
    thumbnail_url = snippet['thumbnails']['high']['url']
    views = int(item['statistics'].get('viewCount', 0))

    # Publication time features.
    # NOTE(review): the timestamp is used in whatever timezone the API
    # delivers (looks like UTC) -- confirm this matches the training data.
    dt = pd.to_datetime(published_at)
    hour = dt.hour
    weekday = dt.dayofweek

    # Number of manually created (not auto-generated) subtitle tracks.
    # A video with subtitles disabled or unavailable counts as zero instead
    # of aborting the whole prediction.
    from youtube_transcript_api import CouldNotRetrieveTranscript
    try:
        transcripts = YouTubeTranscriptApi.list_transcripts(video_id)
    except CouldNotRetrieveTranscript:
        caption_count = 0
    else:
        caption_count = sum(1 for t in transcripts if not t.is_generated)

    # Thumbnail colour / face features.
    hue_group, face_count, hue_value = analyze_thumbnail(thumbnail_url)

    # Title sentiment.
    senti = sentiment_score(title)

    # Category, falling back to the generic label when the encoder has not
    # seen the resolved one.
    user_category = map_category(category_id, title, api_key)
    if user_category not in le_cat.classes_:
        user_category = '๊ธฐํ'
    cat_encoded = le_cat.transform([user_category])[0]

    # Prediction; expm1 undoes the log1p scale the model predicts on.
    X_input = pd.DataFrame([{
        '์๊ฐ๋': hour,
        '์์ผ': weekday,
        '์๋ง์': caption_count,
        '์นดํ ๊ณ ๋ฆฌ': cat_encoded,
        'Hue': hue_value,
        '์ธ๋ค์ผ ์ผ๊ตด ์': face_count,
        '๊ฐ์ฑ์ ์': senti,
    }])
    pred_log = model.predict(X_input[feature_cols])[0]
    predicted_views = int(np.expm1(pred_log))

    return {
        '์ ๋ชฉ': title,
        '์์ธก ์กฐํ์': predicted_views,
        '์ค์ ์กฐํ์': views,
        '์นดํ ๊ณ ๋ฆฌ': user_category,
        '์๊ฐ๋': hour,
        '์์ผ': weekday,
        '์๋ง์': caption_count,
        '์ธ๋ค์ผ ์ผ๊ตด ์': face_count,
        '๊ฐ์ฑ์ ์': senti,
        'Hue ๊ทธ๋ฃน': hue_group,
        'Hue ๊ฐ': hue_value,
        '์ธ๋ค์ผ URL': thumbnail_url,
    }
# 1. Feature extraction
def extract_features_from_video_id(video_id, api_key):
    """Return a one-row DataFrame of model features for *video_id*.

    Delegates to ``predict_views`` for all data gathering, then re-encodes
    the category name with ``le_cat`` so the row can be fed to the model.
    """
    info = predict_views(video_id, api_key)
    category_code = le_cat.transform([info['์นดํ ๊ณ ๋ฆฌ']])[0]
    row = {
        '์๊ฐ๋': info['์๊ฐ๋'],
        '์์ผ': info['์์ผ'],
        '์๋ง์': info['์๋ง์'],
        '์นดํ ๊ณ ๋ฆฌ': category_code,
        'Hue': info['Hue ๊ฐ'],
        '์ธ๋ค์ผ ์ผ๊ตด ์': info['์ธ๋ค์ผ ์ผ๊ตด ์'],
        '๊ฐ์ฑ์ ์': info['๊ฐ์ฑ์ ์'],
    }
    return pd.DataFrame([row])
# 2. Prediction
def predict_view_count(model, features):
    """Predict the view count for the first row of *features*.

    Args:
        model: Fitted estimator exposing ``predict``.
        features: DataFrame containing at least the ``feature_cols`` columns.

    Returns:
        The prediction for the first row as an ``int``, after ``expm1``
        undoes the log1p scale the model predicts on.
    """
    log_prediction = model.predict(features[feature_cols])[0]
    return int(np.expm1(log_prediction))
# 3. Visualization
def visualize_result(video_id, features, predicted_view_count, info):
    """Render the prediction summary as an HTML card.

    Args:
        video_id: YouTube video id shown in the card.
        features: One-row DataFrame of model features.
        predicted_view_count: Predicted number of views (int).
        info: Result dict holding the title, thumbnail URL, actual view count
            and hue group.

    Returns:
        The HTML markup as a string. The title, video id, thumbnail URL and
        hue group are HTML-escaped because they originate outside this
        program.
    """
    weekday_names = ['์', 'ํ', '์', '๋ชฉ', '๊ธ', 'ํ ', '์ผ']
    weekday_label = weekday_names[features['์์ผ'].values[0]]

    # Escape every externally supplied value before it is placed in markup;
    # quote=True (the default) also makes the URL safe inside the src attribute.
    safe_title = html.escape(str(info['์ ๋ชฉ']))
    safe_thumbnail = html.escape(str(info['์ธ๋ค์ผ URL']))
    safe_video_id = html.escape(str(video_id))
    safe_hue_group = html.escape(str(info['Hue ๊ทธ๋ฃน']))

    markup = f"""
<div style="background-color: #111; color: white; padding: 20px; border-radius: 10px; max-width: 600px; font-family: Arial, sans-serif;">
<h2>๐ฏ ์์ธก ์กฐํ์: {predicted_view_count:,}ํ</h2>
<h3>๐ ์์ ์ ๋ชฉ: {safe_title}</h3>
<img src="{safe_thumbnail}" alt="์ธ๋ค์ผ ์ด๋ฏธ์ง" style="width: 100%; border-radius: 10px; margin-bottom: 20px;"/>
<ul style="list-style-type: none; padding-left: 0;">
<li>๐ฝ๏ธ <strong>์์ ID:</strong> {safe_video_id}</li>
<li>๐๏ธ <strong>์ค์ ์กฐํ์:</strong> {info['์ค์ ์กฐํ์']:,}ํ</li>
<li>โฐ <strong>์๊ฐ๋:</strong> {features['์๊ฐ๋'].values[0]}์</li>
<li>๐ <strong>์์ผ:</strong> {weekday_label}</li>
<li>๐ฌ <strong>์๋ง ์:</strong> {features['์๋ง์'].values[0]}</li>
<li>๐จ <strong>์์ ๊ณ์ด:</strong> {safe_hue_group}</li>
<li>๐ <strong>์ธ๋ค์ผ ์ผ๊ตด ์:</strong> {features['์ธ๋ค์ผ ์ผ๊ตด ์'].values[0]}</li>
<li>๐ง <strong>๊ฐ์ฑ ์ ์:</strong> {features['๊ฐ์ฑ์ ์'].values[0]:.2f}</li>
</ul>
</div>
"""
    return markup