import os
import sys
import torch
import pandas as pd
import logging
import re
import faiss
import numpy as np
import time
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from datasets import load_dataset
from huggingface_hub import login
from sentence_transformers import SentenceTransformer
from joblib import Parallel, delayed
from tqdm import tqdm
from keybert import KeyBERT  # KeyBERT added

# Logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

HF_API_TOKEN = os.getenv("HF_API_TOKEN")
if HF_API_TOKEN:
    try:
        logger.info("Logging in to Hugging Face...")
        login(token=HF_API_TOKEN)
        logger.info("Hugging Face login successful!")
    except Exception as e:
        logger.error(f"Hugging Face login failed: {e}")
        sys.exit("Exiting: a valid HF_API_TOKEN is required.")
else:
    logger.error("Environment variable 'HF_API_TOKEN' is not set.")
    sys.exit("Exiting: please set the HF_API_TOKEN environment variable.")

# Create the FastAPI instance
app = FastAPI(title="KeyBERT-based FAISS Search API", version="1.0")

# Load the KeyBERT keyword extraction model
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
keyword_model = KeyBERT(embedding_model)  # KeyBERT on top of the sentence-embedding model
logger.info("KeyBERT keyword extraction model loaded!")

# Load the data for auction items currently on sale
def load_huggingface_jsonl(dataset_name, split="train"):
    """Load a dataset from the Hugging Face Hub and return it as a DataFrame."""
    try:
        repo_id = f"aikobay/{dataset_name}"
        dataset = load_dataset(repo_id, split=split)
        df = dataset.to_pandas().dropna()
        return df
    except Exception as e:
        logger.error(f"Error while loading data: {e}")
        return pd.DataFrame()

try:
    active_sale_items = load_huggingface_jsonl("initial_saleitem_dataset")
    logger.info(f"Active auction item data loaded: {len(active_sale_items)} items in total")
except Exception as e:
    logger.error(f"Error while loading item data: {e}")
    active_sale_items = pd.DataFrame()

# Initialize the FAISS index (384 = embedding dimension of the MiniLM model)
faiss_index = faiss.IndexFlatL2(384)
indexed_items = []

# Build the FAISS index
def rebuild_faiss_index():
    """Reload the sale-item data and rebuild the FAISS index from scratch."""
    global faiss_index, indexed_items, active_sale_items
    logger.info("Rebuilding the FAISS index from fresh sale_item data...")
    active_sale_items = load_huggingface_jsonl("initial_saleitem_dataset")
    item_names = active_sale_items["ITEMNAME"].tolist()
    indexed_items = item_names
    logger.info(f"Vectorizing {len(item_names)} items...")
    item_vectors = embedding_model.encode(item_names, convert_to_numpy=True).astype("float32")
    faiss_index = faiss.IndexFlatL2(item_vectors.shape[1])
    faiss_index.add(item_vectors)
    logger.info(f"FAISS index rebuilt with {len(indexed_items)} items.")

# KeyBERT-based keyword extraction
def generate_similar_keywords(query: str, num_keywords: int = 5):
    """Extract keywords related to the search query using the KeyBERT model."""
    try:
        keywords = keyword_model.extract_keywords(
            query, keyphrase_ngram_range=(1, 2), stop_words=None, top_n=num_keywords
        )
        keywords = [kw[0] for kw in keywords]
        logger.info(f"Generated related keywords: {keywords}")
        return keywords
    except Exception as e:
        logger.error(f"Error during KeyBERT keyword extraction: {e}")
        return [query]

# FAISS search driven by the extracted keywords
def search_faiss_with_keywords(query: str, top_k: int = 5):
    """Encode each extracted keyword and collect the top-k nearest items from the FAISS index."""
    start_time = time.time()
    keywords = generate_similar_keywords(query)
    keyword_vectors = embedding_model.encode(keywords, convert_to_numpy=True).astype("float32")
    all_results = []
    for vec in keyword_vectors:
        _, indices = faiss_index.search(np.array([vec]), top_k)
        all_results.extend(indices[0])
    # Deduplicate indices collected across keywords before mapping back to item names
    unique_results = list(set(all_results))
    recommendations = [indexed_items[i] for i in unique_results]
    end_time = time.time()
    logger.info(f"Search finished in {end_time - start_time:.4f} seconds")
    return recommendations
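
# Example direct use of the search helper (illustrative only; the query string is
# hypothetical and the FAISS index must have been built first):
#
#   rebuild_faiss_index()
#   print(search_faiss_with_keywords("wireless earphones", top_k=3))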

# API request model
class RecommendRequest(BaseModel):
    search_query: str
    top_k: int = 5

# Recommendation API endpoint
@app.post("/api/recommend")
async def recommend(request: RecommendRequest):
    try:
        recommendations = search_faiss_with_keywords(request.search_query, request.top_k)
        return {"query": request.search_query, "recommendations": recommendations}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Recommendation error: {str(e)}")
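
# Example client call (illustrative only; assumes the server is running locally on
# port 7860 and uses a hypothetical query value):
#
#   import requests
#   resp = requests.post(
#       "http://localhost:7860/api/recommend",
#       json={"search_query": "camping chair", "top_k": 5},
#   )
#   print(resp.json())  # -> {"query": ..., "recommendations": [...]}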

# FAISS index refresh API
@app.post("/api/update_index")
async def update_index():
    rebuild_faiss_index()
    return {"message": "FAISS index update complete!"}

# Run FastAPI
if __name__ == "__main__":
    rebuild_faiss_index()
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)