Spaces:
Sleeping
Sleeping
import os | |
os.environ["HF_HOME"] = "/tmp/huggingface" | |
from flask import Flask, request, jsonify | |
from flask_cors import CORS | |
from transformers import AutoModelForSequenceClassification, AutoTokenizer | |
from sentence_transformers import SentenceTransformer | |
from sklearn.metrics.pairwise import cosine_similarity | |
import torch | |
import numpy as np | |
import pickle | |
import json | |
import logging | |
import re | |
import nltk | |
from nltk.corpus import stopwords | |
from nltk.stem import WordNetLemmatizer | |
from nltk.tokenize import wordpunct_tokenize | |
nltk_data_path = os.path.join(os.path.dirname(__file__), "nltk_data") | |
os.makedirs(nltk_data_path, exist_ok=True) | |
nltk.data.path.append(nltk_data_path) | |
nltk.download('stopwords', download_dir=nltk_data_path, quiet=True) | |
nltk.download('wordnet', download_dir=nltk_data_path, quiet=True) | |
logging.basicConfig(level=logging.INFO, | |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') | |
logger = logging.getLogger(__name__) | |
app = Flask(__name__) | |
CORS(app) | |
BASE_DIR = os.path.dirname(os.path.abspath(__file__)) | |
INTENT_MODEL_PATH = os.path.join(BASE_DIR, "model") | |
RECOMMENDER_MODEL_PATH = os.path.join(BASE_DIR, "recommender_model") | |
intent_model = None | |
intent_tokenizer = None | |
intent_classes = None | |
intent_thresholds = None | |
recommender = None | |
recommender_model_loaded = False | |
class BookRecommender: | |
def __init__(self, model_name='all-minilm-l6-v2'): | |
self.model_name = model_name | |
self.model = None | |
self.book_embeddings = None | |
self.df = None | |
self.stop_words = set(stopwords.words('english')) | |
self.lemmatizer = WordNetLemmatizer() | |
def preprocess_text(self, text): | |
if not isinstance(text, str): | |
return "" | |
text = text.lower() | |
text = re.sub(r'[^\w\s]', ' ', text) | |
tokens = wordpunct_tokenize(text) | |
tokens = [self.lemmatizer.lemmatize(word) for word in tokens if word not in self.stop_words] | |
return ' '.join(tokens) | |
def load_model(self, folder_path=RECOMMENDER_MODEL_PATH): | |
try: | |
print("π Loading recommender model...") | |
if not os.path.exists(folder_path): | |
print("β Recommender folder not found") | |
return False | |
with open(os.path.join(folder_path, "config.pkl"), 'rb') as f: | |
config = pickle.load(f) | |
self.model_name = config['model_name'] | |
self.model = SentenceTransformer(os.path.join(folder_path, "sentence_transformer")) | |
with open(os.path.join(folder_path, "book_embeddings.pkl"), 'rb') as f: | |
self.book_embeddings = pickle.load(f) | |
with open(os.path.join(folder_path, "books_data.pkl"), 'rb') as f: | |
self.df = pickle.load(f) | |
print("β Recommender model loaded") | |
return True | |
except Exception as e: | |
logger.error(f"Error loading model: {str(e)}", exc_info=True) | |
return False | |
def recommend_books(self, user_query, top_n=5, include_description=True): | |
if self.model is None or self.book_embeddings is None or self.df is None: | |
return [] | |
try: | |
processed_query = self.preprocess_text(user_query) | |
user_embedding = self.model.encode([processed_query]) | |
similarities = cosine_similarity(user_embedding, self.book_embeddings)[0] | |
similar_books_idx = np.argsort(similarities)[-top_n:][::-1] | |
recommendations = [] | |
for i, idx in enumerate(similar_books_idx): | |
book_data = { | |
'title': str(self.df.iloc[idx].get('Title', '')), | |
'author': str(self.df.iloc[idx].get('Authors', '')), | |
'category': str(self.df.iloc[idx].get('Category', '')), | |
'year': str(self.df.iloc[idx].get('Publish Date (Year)', '')), | |
'description': str(self.df.iloc[idx].get('Description', '')[:197] + "...") if include_description and 'Description' in self.df.columns else '', | |
'relevance_score': float(similarities[idx]), | |
'rank': int(i + 1) | |
} | |
recommendations.append(book_data) | |
return recommendations | |
except Exception as e: | |
logger.error(f"Error generating recommendations: {str(e)}", exc_info=True) | |
return [] | |
def load_ood_thresholds(model_path): | |
threshold_path = os.path.join(model_path, "ood_thresholds.json") | |
if os.path.exists(threshold_path): | |
with open(threshold_path, "r") as f: | |
return json.load(f) | |
return {"energy_threshold": 0.0, "msp_threshold": 0.5} | |
def load_intent_resources(): | |
global intent_model, intent_tokenizer, intent_classes, intent_thresholds | |
try: | |
print("β³ Loading intent model from Hugging Face Hub (ZEROTSUDIOS/Bipa-Classification)") | |
intent_model = AutoModelForSequenceClassification.from_pretrained("ZEROTSUDIOS/Bipa-Classification") | |
intent_tokenizer = AutoTokenizer.from_pretrained("ZEROTSUDIOS/Bipa-Classification") | |
print("β Remote model and tokenizer loaded") | |
with open(os.path.join(INTENT_MODEL_PATH, "intent_classes.pkl"), "rb") as f: | |
intent_classes = pickle.load(f) | |
intent_thresholds = load_ood_thresholds(INTENT_MODEL_PATH) | |
return True | |
except Exception as e: | |
print("β Failed to load intent resources:", e) | |
logger.error(f"Failed to load intent resources: {str(e)}", exc_info=True) | |
return False | |
def predict_intent(text, method='combined'): | |
inputs = intent_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512) | |
with torch.no_grad(): | |
outputs = intent_model(**inputs) | |
logits = outputs.logits | |
probs = torch.nn.functional.softmax(logits, dim=-1) | |
max_prob, pred_idx = torch.max(probs, dim=-1) | |
energy = -torch.logsumexp(logits, dim=-1) | |
is_ood = False | |
if method == 'energy': | |
is_ood = energy.item() > intent_thresholds['energy_threshold'] | |
elif method == 'msp': | |
is_ood = max_prob.item() < intent_thresholds['msp_threshold'] | |
elif method == 'combined': | |
is_ood = (energy.item() > intent_thresholds['energy_threshold']) and (max_prob.item() < intent_thresholds['msp_threshold']) | |
return { | |
"intent": intent_classes[pred_idx.item()], | |
"is_ood": is_ood, | |
"confidence": max_prob.item(), | |
"energy_score": energy.item(), | |
"class_probabilities": { | |
intent_classes[i]: float(prob) | |
for i, prob in enumerate(probs[0].numpy()) | |
} | |
} | |
def root(): | |
return jsonify({"status": "ok"}) | |
def health(): | |
return jsonify({ | |
"status": "running", | |
"intent_model_loaded": intent_model is not None, | |
"tokenizer_loaded": intent_tokenizer is not None, | |
"recommender_loaded": recommender_model_loaded | |
}) | |
def analyze(): | |
if not request.is_json: | |
return jsonify({"error": "Request must be JSON"}), 400 | |
data = request.get_json() | |
text = data.get('text') | |
method = data.get('method', 'combined') | |
result = predict_intent(text, method) | |
return jsonify(result) | |
def recommend(): | |
global recommender_model_loaded | |
if not recommender_model_loaded: | |
return jsonify({"error": "Recommendation model not loaded."}), 503 | |
data = request.get_json() | |
query = data.get('query') | |
top_n = data.get('top_n', 5) | |
include_description = data.get('include_description', True) | |
threshold = data.get('threshold', 0.5) | |
if not query: | |
return jsonify({"error": "Missing query."}), 400 | |
recommendations = recommender.recommend_books(query, top_n=top_n, include_description=include_description) | |
high_score = [rec for rec in recommendations if rec['relevance_score'] >= threshold] | |
low_score = [rec for rec in recommendations if rec['relevance_score'] < threshold] | |
return jsonify({ | |
"query": query, | |
"threshold": threshold, | |
"high_recommendations": high_score, | |
"low_recommendations": low_score, | |
"total_count": len(recommendations), | |
"high_count": len(high_score), | |
"low_count": len(low_score) | |
}) | |
print("βοΈ Initializing models...") | |
load_intent_resources() | |
recommender = BookRecommender() | |
recommender_model_loaded = recommender.load_model() | |
print("β All models attempted to load") | |