"""Flask service exposing an intent classifier and an embedding-based book recommender.

Endpoints:
    GET  /              -> liveness check
    GET  /api/health    -> component load status
    POST /api/analyze   -> intent classification with OOD detection
    POST /api/recommend -> book recommendations split by a relevance threshold
"""

import os

# Must be set before any Hugging Face library is imported, or it is ignored.
os.environ["HF_HOME"] = "/tmp/huggingface"

import json
import logging
import pickle
import re

import nltk
import numpy as np
import torch
from flask import Flask, jsonify, request
from flask_cors import CORS
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import wordpunct_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Keep NLTK data alongside the app so it works in environments with a
# read-only or ephemeral home directory.
nltk_data_path = os.path.join(os.path.dirname(__file__), "nltk_data")
os.makedirs(nltk_data_path, exist_ok=True)
nltk.data.path.append(nltk_data_path)
nltk.download('stopwords', download_dir=nltk_data_path, quiet=True)
nltk.download('wordnet', download_dir=nltk_data_path, quiet=True)

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

app = Flask(__name__)
CORS(app)

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INTENT_MODEL_PATH = os.path.join(BASE_DIR, "model")
RECOMMENDER_MODEL_PATH = os.path.join(BASE_DIR, "recommender_model")

# Globals populated once at startup (bottom of this module).
intent_model = None
intent_tokenizer = None
intent_classes = None
intent_thresholds = None
recommender = None
recommender_model_loaded = False


class BookRecommender:
    """Recommends books by cosine similarity between a SentenceTransformer
    embedding of the user's query and precomputed per-book embeddings."""

    def __init__(self, model_name='all-minilm-l6-v2'):
        self.model_name = model_name
        self.model = None            # SentenceTransformer, set by load_model()
        self.book_embeddings = None  # array of per-book embeddings (pickled)
        self.df = None               # DataFrame of book metadata (pickled)
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def preprocess_text(self, text):
        """Lowercase, strip punctuation, remove stopwords and lemmatize.

        Non-string input yields "" so downstream encoding never crashes.
        """
        if not isinstance(text, str):
            return ""
        text = text.lower()
        text = re.sub(r'[^\w\s]', ' ', text)
        tokens = wordpunct_tokenize(text)
        tokens = [self.lemmatizer.lemmatize(word)
                  for word in tokens if word not in self.stop_words]
        return ' '.join(tokens)

    def load_model(self, folder_path=RECOMMENDER_MODEL_PATH):
        """Load config, transformer weights, embeddings and book data from disk.

        Returns:
            True on success, False on any failure (errors are logged).
        """
        try:
            print("🔄 Loading recommender model...")
            if not os.path.exists(folder_path):
                print("❌ Recommender folder not found")
                return False
            with open(os.path.join(folder_path, "config.pkl"), 'rb') as f:
                config = pickle.load(f)
            self.model_name = config['model_name']
            self.model = SentenceTransformer(
                os.path.join(folder_path, "sentence_transformer"))
            with open(os.path.join(folder_path, "book_embeddings.pkl"), 'rb') as f:
                self.book_embeddings = pickle.load(f)
            with open(os.path.join(folder_path, "books_data.pkl"), 'rb') as f:
                self.df = pickle.load(f)
            print("✅ Recommender model loaded")
            return True
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}", exc_info=True)
            return False

    def recommend_books(self, user_query, top_n=5, include_description=True):
        """Return up to top_n books ranked by similarity to user_query.

        Each item carries title/author/category/year/description plus a
        float relevance_score and a 1-based rank. Returns [] when the
        model is not loaded or on any internal error (logged).
        """
        if self.model is None or self.book_embeddings is None or self.df is None:
            return []
        try:
            processed_query = self.preprocess_text(user_query)
            user_embedding = self.model.encode([processed_query])
            similarities = cosine_similarity(user_embedding, self.book_embeddings)[0]
            # Indices of the top_n most similar books, best first.
            similar_books_idx = np.argsort(similarities)[-top_n:][::-1]
            recommendations = []
            for i, idx in enumerate(similar_books_idx):
                row = self.df.iloc[idx]
                description = ''
                if include_description and 'Description' in self.df.columns:
                    # Coerce to str before slicing (value may be NaN), and
                    # only append an ellipsis when text was actually cut.
                    full_text = str(row.get('Description', ''))
                    if len(full_text) > 197:
                        description = full_text[:197] + "..."
                    else:
                        description = full_text
                recommendations.append({
                    'title': str(row.get('Title', '')),
                    'author': str(row.get('Authors', '')),
                    'category': str(row.get('Category', '')),
                    'year': str(row.get('Publish Date (Year)', '')),
                    'description': description,
                    'relevance_score': float(similarities[idx]),
                    'rank': int(i + 1),
                })
            return recommendations
        except Exception as e:
            logger.error(f"Error generating recommendations: {str(e)}", exc_info=True)
            return []


def load_ood_thresholds(model_path):
    """Read OOD thresholds from model_path/ood_thresholds.json.

    Falls back to permissive defaults when the file is absent.
    """
    threshold_path = os.path.join(model_path, "ood_thresholds.json")
    if os.path.exists(threshold_path):
        with open(threshold_path, "r") as f:
            return json.load(f)
    return {"energy_threshold": 0.0, "msp_threshold": 0.5}


def load_intent_resources():
    """Load the remote intent classifier plus local class list and thresholds.

    Populates the module-level intent_* globals. Returns True on success.
    """
    global intent_model, intent_tokenizer, intent_classes, intent_thresholds
    try:
        print("⏳ Loading intent model from Hugging Face Hub (ZEROTSUDIOS/Bipa-Classification)")
        intent_model = AutoModelForSequenceClassification.from_pretrained(
            "ZEROTSUDIOS/Bipa-Classification")
        intent_tokenizer = AutoTokenizer.from_pretrained(
            "ZEROTSUDIOS/Bipa-Classification")
        print("✅ Remote model and tokenizer loaded")
        with open(os.path.join(INTENT_MODEL_PATH, "intent_classes.pkl"), "rb") as f:
            intent_classes = pickle.load(f)
        intent_thresholds = load_ood_thresholds(INTENT_MODEL_PATH)
        return True
    except Exception as e:
        print("❌ Failed to load intent resources:", e)
        logger.error(f"Failed to load intent resources: {str(e)}", exc_info=True)
        return False


def predict_intent(text, method='combined'):
    """Classify text and flag out-of-distribution input.

    Args:
        text: raw user utterance.
        method: 'energy', 'msp', or 'combined' OOD criterion.

    Returns:
        Dict with intent, is_ood, confidence, energy_score and the full
        per-class probability map.
    """
    inputs = intent_tokenizer(text, return_tensors="pt", padding=True,
                              truncation=True, max_length=512)
    with torch.no_grad():
        outputs = intent_model(**inputs)
    logits = outputs.logits
    probs = torch.nn.functional.softmax(logits, dim=-1)
    max_prob, pred_idx = torch.max(probs, dim=-1)
    # Energy score: higher energy => more likely out-of-distribution.
    energy = -torch.logsumexp(logits, dim=-1)
    is_ood = False
    if method == 'energy':
        is_ood = energy.item() > intent_thresholds['energy_threshold']
    elif method == 'msp':
        is_ood = max_prob.item() < intent_thresholds['msp_threshold']
    elif method == 'combined':
        # Flag only when BOTH detectors agree.
        is_ood = (energy.item() > intent_thresholds['energy_threshold']) and \
                 (max_prob.item() < intent_thresholds['msp_threshold'])
    return {
        "intent": intent_classes[pred_idx.item()],
        "is_ood": is_ood,
        "confidence": max_prob.item(),
        "energy_score": energy.item(),
        "class_probabilities": {
            intent_classes[i]: float(prob)
            for i, prob in enumerate(probs[0].numpy())
        }
    }


@app.route('/')
def root():
    """Liveness probe."""
    return jsonify({"status": "ok"})


@app.route('/api/health')
def health():
    """Report which model components loaded successfully at startup."""
    return jsonify({
        "status": "running",
        "intent_model_loaded": intent_model is not None,
        "tokenizer_loaded": intent_tokenizer is not None,
        "recommender_loaded": recommender_model_loaded
    })


@app.route('/api/analyze', methods=['POST'])
def analyze():
    """Classify the intent of JSON body {"text": ..., "method": ...}."""
    if not request.is_json:
        return jsonify({"error": "Request must be JSON"}), 400
    # Guard against startup failure: predict_intent would otherwise crash
    # on a None model/tokenizer.
    if intent_model is None or intent_tokenizer is None:
        return jsonify({"error": "Intent model not loaded."}), 503
    data = request.get_json()
    text = data.get('text')
    if not text:
        return jsonify({"error": "Missing text."}), 400
    method = data.get('method', 'combined')
    result = predict_intent(text, method)
    return jsonify(result)


@app.route('/api/recommend', methods=['POST'])
def recommend():
    """Recommend books for {"query": ...}; split results at "threshold"."""
    if not recommender_model_loaded:
        return jsonify({"error": "Recommendation model not loaded."}), 503
    if not request.is_json:
        return jsonify({"error": "Request must be JSON"}), 400
    data = request.get_json()
    query = data.get('query')
    top_n = data.get('top_n', 5)
    include_description = data.get('include_description', True)
    threshold = data.get('threshold', 0.5)
    if not query:
        return jsonify({"error": "Missing query."}), 400
    recommendations = recommender.recommend_books(
        query, top_n=top_n, include_description=include_description)
    high_score = [rec for rec in recommendations
                  if rec['relevance_score'] >= threshold]
    low_score = [rec for rec in recommendations
                 if rec['relevance_score'] < threshold]
    return jsonify({
        "query": query,
        "threshold": threshold,
        "high_recommendations": high_score,
        "low_recommendations": low_score,
        "total_count": len(recommendations),
        "high_count": len(high_score),
        "low_count": len(low_score)
    })


# Eager startup: attempt to load every model once at import time so the
# /api/health endpoint accurately reflects availability.
print("⚙️ Initializing models...")
load_intent_resources()
recommender = BookRecommender()
recommender_model_loaded = recommender.load_model()
print("✅ All models attempted to load")