File size: 8,538 Bytes
6522efc
e1f7ca9
6522efc
282c65d
a70684f
 
 
 
 
 
 
 
 
 
 
 
 
e1f7ca9
a70684f
 
 
 
b9a5070
 
a70684f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e1f7ca9
a70684f
 
 
 
 
9e2d5fc
a70684f
9e2d5fc
a70684f
 
 
 
 
 
 
 
 
9e2d5fc
a70684f
 
 
 
 
209b7fa
 
 
 
 
 
 
 
 
 
 
ca7fe96
 
 
 
 
209b7fa
ca7fe96
209b7fa
 
 
 
 
 
 
a70684f
 
 
 
 
 
 
 
 
 
39ce8c5
 
 
5f6f892
a70684f
 
 
 
 
9e2d5fc
a70684f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba1cce4
 
 
 
 
a70684f
 
ba1cce4
b9a5070
88b518c
cb9f4da
b9a5070
 
 
cb9f4da
 
 
 
 
 
b9a5070
a70684f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e2d5fc
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
import os
os.environ["HF_HOME"] = "/tmp/huggingface"

from flask import Flask, request, jsonify
from flask_cors import CORS
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np
import pickle
import json
import logging
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import wordpunct_tokenize

# Keep the NLTK corpora in an "nltk_data" directory next to this file and add
# that directory to NLTK's data search path.
nltk_data_path = os.path.join(os.path.dirname(__file__), "nltk_data")
os.makedirs(nltk_data_path, exist_ok=True)
nltk.data.path.append(nltk_data_path)
# Download the corpora BookRecommender uses: the stop-word list and WordNet
# (needed by WordNetLemmatizer). This runs at import time.
nltk.download('stopwords', download_dir=nltk_data_path, quiet=True)
nltk.download('wordnet', download_dir=nltk_data_path, quiet=True)

# Root logging configuration: INFO level, timestamped records.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Flask application, wrapped with flask-cors.
app = Flask(__name__)
CORS(app)

# On-disk locations of the model artifacts, resolved relative to this file.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INTENT_MODEL_PATH = os.path.join(BASE_DIR, "model")
RECOMMENDER_MODEL_PATH = os.path.join(BASE_DIR, "recommender_model")

# Module-level state. The intent_* names are filled in by
# load_intent_resources(); recommender and recommender_model_loaded are
# assigned by the startup code at the bottom of this file.
intent_model = None
intent_tokenizer = None
intent_classes = None
intent_thresholds = None
recommender = None
recommender_model_loaded = False

class BookRecommender:
    """Recommend books by embedding similarity between a query and a catalogue.

    The sentence-embedding model, the precomputed book embeddings and the book
    table are all loaded from disk by ``load_model``; until that succeeds,
    ``recommend_books`` returns an empty list.
    """

    # Descriptions longer than this many characters are cut and suffixed
    # with "...".
    _DESCRIPTION_PREVIEW_CHARS = 197

    def __init__(self, model_name='all-minilm-l6-v2'):
        """Create an unloaded recommender.

        Args:
            model_name: Name recorded for the embedding model; it is
                overwritten by the value stored in ``config.pkl`` when
                ``load_model`` runs.
        """
        self.model_name = model_name
        self.model = None
        self.book_embeddings = None
        self.df = None
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def preprocess_text(self, text):
        """Normalize *text* for embedding.

        Lower-cases the text, replaces every character that is neither a word
        character nor whitespace with a space, tokenizes, drops English stop
        words and lemmatizes the remaining tokens.

        Returns:
            The space-joined tokens, or ``""`` when *text* is not a string.
        """
        if not isinstance(text, str):
            return ""
        text = text.lower()
        text = re.sub(r'[^\w\s]', ' ', text)
        tokens = wordpunct_tokenize(text)
        tokens = [self.lemmatizer.lemmatize(word) for word in tokens if word not in self.stop_words]
        return ' '.join(tokens)

    def load_model(self, folder_path=RECOMMENDER_MODEL_PATH):
        """Load the embedding model, book embeddings and book table.

        Expects *folder_path* to contain ``config.pkl`` (a mapping with a
        ``'model_name'`` key), a ``sentence_transformer`` directory,
        ``book_embeddings.pkl`` and ``books_data.pkl``.

        Returns:
            ``True`` when everything loaded, ``False`` when the folder is
            missing or any step raised (the error is logged, not re-raised).
        """
        try:
            print("πŸ”„ Loading recommender model...")
            if not os.path.exists(folder_path):
                print("❌ Recommender folder not found")
                return False
            # SECURITY NOTE: pickle.load executes arbitrary code embedded in
            # the file. These artifacts must come from a trusted source only.
            with open(os.path.join(folder_path, "config.pkl"), 'rb') as f:
                config = pickle.load(f)
            self.model_name = config['model_name']
            self.model = SentenceTransformer(os.path.join(folder_path, "sentence_transformer"))
            with open(os.path.join(folder_path, "book_embeddings.pkl"), 'rb') as f:
                self.book_embeddings = pickle.load(f)
            with open(os.path.join(folder_path, "books_data.pkl"), 'rb') as f:
                self.df = pickle.load(f)
            print("βœ… Recommender model loaded")
            return True
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}", exc_info=True)
            return False

    @classmethod
    def _format_description(cls, description):
        """Return a preview of *description*, tolerating missing values.

        Non-string values (for example a NaN cell) yield ``''`` instead of
        raising; ``"..."`` is appended only when the text was actually cut.
        """
        if not isinstance(description, str):
            return ''
        limit = cls._DESCRIPTION_PREVIEW_CHARS
        if len(description) <= limit:
            return description
        return description[:limit] + "..."

    def recommend_books(self, user_query, top_n=5, include_description=True):
        """Return the *top_n* books most similar to *user_query*.

        Args:
            user_query: Free-text query; it is passed through
                ``preprocess_text`` before being embedded.
            top_n: Maximum number of books to return. Values ``<= 0`` yield
                an empty list.
            include_description: When true and the book table has a
                ``'Description'`` column, include a description preview.

        Returns:
            A list of dicts with keys ``title``, ``author``, ``category``,
            ``year``, ``description``, ``relevance_score`` (cosine similarity)
            and ``rank`` (1-based), ordered from most to least similar.
            Returns ``[]`` when the model is not loaded or on any error (the
            error is logged).
        """
        if self.model is None or self.book_embeddings is None or self.df is None:
            return []
        try:
            # Guard explicitly: with top_n == 0 the slice [-0:] below would
            # select every book instead of none.
            if top_n <= 0:
                return []
            processed_query = self.preprocess_text(user_query)
            user_embedding = self.model.encode([processed_query])
            similarities = cosine_similarity(user_embedding, self.book_embeddings)[0]
            # argsort is ascending: take the last top_n and reverse them.
            similar_books_idx = np.argsort(similarities)[-top_n:][::-1]
            with_description = include_description and 'Description' in self.df.columns
            recommendations = []
            for rank, idx in enumerate(similar_books_idx, start=1):
                row = self.df.iloc[idx]
                recommendations.append({
                    'title': str(row.get('Title', '')),
                    'author': str(row.get('Authors', '')),
                    'category': str(row.get('Category', '')),
                    'year': str(row.get('Publish Date (Year)', '')),
                    'description': self._format_description(row.get('Description', '')) if with_description else '',
                    'relevance_score': float(similarities[idx]),
                    'rank': rank
                })
            return recommendations
        except Exception as e:
            logger.error(f"Error generating recommendations: {str(e)}", exc_info=True)
            return []

def load_ood_thresholds(model_path):
    """Load out-of-distribution thresholds from ``ood_thresholds.json``.

    Args:
        model_path: Directory expected to contain ``ood_thresholds.json``.

    Returns:
        A dict that always contains ``"energy_threshold"`` and
        ``"msp_threshold"``. Values from the file override the defaults
        (0.0 and 0.5); any additional keys in the file are preserved. When
        the file does not exist, the defaults alone are returned.

    Raises:
        TypeError: If the file's top-level JSON value is not an object.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    defaults = {"energy_threshold": 0.0, "msp_threshold": 0.5}
    threshold_path = os.path.join(model_path, "ood_thresholds.json")
    if not os.path.exists(threshold_path):
        return defaults
    with open(threshold_path, "r", encoding="utf-8") as f:
        loaded = json.load(f)
    # Merge over the defaults so a file that specifies only one threshold
    # does not cause a KeyError when the other one is looked up later.
    return {**defaults, **loaded}

def load_intent_resources():
    """Load the intent model, tokenizer, class list and OOD thresholds.

    The model and tokenizer are fetched from the Hugging Face Hub repository
    ``ZEROTSUDIOS/Bipa-Classification``; the class list is read from
    ``intent_classes.pkl`` under ``INTENT_MODEL_PATH`` and the thresholds via
    ``load_ood_thresholds``.

    The module-level globals are assigned only after every resource has
    loaded, so a failure part-way through never leaves them half-populated
    (for example a model without its class list).

    Returns:
        ``True`` on success; ``False`` if any step raised (the error is
        printed and logged, not re-raised).
    """
    global intent_model, intent_tokenizer, intent_classes, intent_thresholds
    try:
        print("⏳ Loading intent model from Hugging Face Hub (ZEROTSUDIOS/Bipa-Classification)")
        model = AutoModelForSequenceClassification.from_pretrained("ZEROTSUDIOS/Bipa-Classification")
        tokenizer = AutoTokenizer.from_pretrained("ZEROTSUDIOS/Bipa-Classification")
        print("βœ… Remote model and tokenizer loaded")
        # SECURITY NOTE: pickle.load executes arbitrary code embedded in the
        # file. intent_classes.pkl must come from a trusted source only.
        with open(os.path.join(INTENT_MODEL_PATH, "intent_classes.pkl"), "rb") as f:
            classes = pickle.load(f)
        thresholds = load_ood_thresholds(INTENT_MODEL_PATH)
    except Exception as e:
        print("❌ Failed to load intent resources:", e)
        logger.error(f"Failed to load intent resources: {str(e)}", exc_info=True)
        return False
    # Publish everything at once now that all loads succeeded.
    intent_model = model
    intent_tokenizer = tokenizer
    intent_classes = classes
    intent_thresholds = thresholds
    return True

def predict_intent(text, method='combined'):
    """Classify *text* and decide whether it looks out-of-distribution (OOD).

    Args:
        text: Input passed to the module-level ``intent_tokenizer``.
        method: OOD rule to apply:
            ``'energy'``   - energy score above ``energy_threshold``;
            ``'msp'``      - top softmax probability below ``msp_threshold``;
            ``'combined'`` - both of the above must hold.
            Any other value leaves ``is_ood`` as ``False``.

    Returns:
        A dict with ``intent`` (predicted class), ``is_ood``, ``confidence``
        (top softmax probability), ``energy_score`` (negative log-sum-exp of
        the logits) and ``class_probabilities`` (class -> probability).
    """
    encoded = intent_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        logits = intent_model(**encoded).logits

    probabilities = torch.softmax(logits, dim=-1)
    top_prob, top_index = probabilities.max(dim=-1)
    energy = -torch.logsumexp(logits, dim=-1)

    confidence = top_prob.item()
    energy_score = energy.item()

    # Thresholds are looked up lazily so each method only touches the
    # threshold key(s) it actually needs.
    def energy_exceeded():
        return energy_score > intent_thresholds['energy_threshold']

    def confidence_low():
        return confidence < intent_thresholds['msp_threshold']

    if method == 'energy':
        is_ood = energy_exceeded()
    elif method == 'msp':
        is_ood = confidence_low()
    elif method == 'combined':
        is_ood = energy_exceeded() and confidence_low()
    else:
        is_ood = False

    predicted_label = intent_classes[top_index.item()]
    class_probabilities = {}
    for class_index, probability in enumerate(probabilities[0].numpy()):
        class_probabilities[intent_classes[class_index]] = float(probability)

    return {
        "intent": predicted_label,
        "is_ood": is_ood,
        "confidence": confidence,
        "energy_score": energy_score,
        "class_probabilities": class_probabilities,
    }


@app.route('/')
def root():
    """Answer the index route with a minimal ``{"status": "ok"}`` payload."""
    payload = {"status": "ok"}
    return jsonify(payload)

@app.route('/api/health')
def health():
    """Report that the service is running and which models are loaded."""
    model_ready = intent_model is not None
    tokenizer_ready = intent_tokenizer is not None
    report = {
        "status": "running",
        "intent_model_loaded": model_ready,
        "tokenizer_loaded": tokenizer_ready,
        "recommender_loaded": recommender_model_loaded,
    }
    return jsonify(report)

@app.route('/api/analyze', methods=['POST'])
def analyze():
    """Classify the intent of the posted text.

    Expects a JSON object with:
        ``text``   - non-empty string to classify (required);
        ``method`` - ``'energy'``, ``'msp'`` or ``'combined'`` (default).

    Responses:
        200 - the dict produced by ``predict_intent``;
        400 - body is not a JSON object, ``text`` is missing/blank/not a
              string, or ``method`` is not one of the supported values;
        503 - the intent model resources are not loaded;
        500 - inference raised (the error is logged).
    """
    if any(resource is None for resource in
           (intent_model, intent_tokenizer, intent_classes, intent_thresholds)):
        return jsonify({"error": "Intent model not loaded."}), 503
    if not request.is_json:
        return jsonify({"error": "Request must be JSON"}), 400
    # silent=True: malformed JSON yields None instead of an HTML error page.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400
    text = data.get('text')
    if not isinstance(text, str) or not text.strip():
        return jsonify({"error": "Missing text."}), 400
    method = data.get('method', 'combined')
    # Reject unknown methods: predict_intent would otherwise silently skip
    # the OOD check and report is_ood=False.
    if method not in ('energy', 'msp', 'combined'):
        return jsonify({"error": "method must be one of: energy, msp, combined."}), 400
    try:
        result = predict_intent(text, method)
    except Exception as e:
        logger.error(f"Error analyzing text: {str(e)}", exc_info=True)
        return jsonify({"error": "Failed to analyze text."}), 500
    return jsonify(result)

@app.route('/api/recommend', methods=['POST'])
def recommend():
    """Return book recommendations for the posted query.

    Expects a JSON object with:
        ``query``               - non-empty string (required);
        ``top_n``               - positive integer, default 5;
        ``include_description`` - truthy to include description previews,
                                  default ``True``;
        ``threshold``           - number; recommendations whose
                                  ``relevance_score`` is at or above it are
                                  returned as "high", the rest as "low"
                                  (default 0.5).

    Responses:
        200 - query, threshold, the high/low recommendation lists and counts;
        400 - body is not a JSON object or a field has an invalid value;
        503 - the recommender model is not loaded.
    """
    if not recommender_model_loaded:
        return jsonify({"error": "Recommendation model not loaded."}), 503
    if not request.is_json:
        return jsonify({"error": "Request must be JSON"}), 400
    # silent=True: malformed JSON yields None instead of an HTML error page.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    query = data.get('query')
    top_n = data.get('top_n', 5)
    include_description = data.get('include_description', True)
    threshold = data.get('threshold', 0.5)

    if not isinstance(query, str) or not query.strip():
        return jsonify({"error": "Missing query."}), 400
    # bool is a subclass of int, so exclude it explicitly.
    if isinstance(top_n, bool) or not isinstance(top_n, int) or top_n < 1:
        return jsonify({"error": "top_n must be a positive integer."}), 400
    if isinstance(threshold, bool) or not isinstance(threshold, (int, float)):
        return jsonify({"error": "threshold must be a number."}), 400

    recommendations = recommender.recommend_books(query, top_n=top_n, include_description=include_description)
    high_score = [rec for rec in recommendations if rec['relevance_score'] >= threshold]
    low_score = [rec for rec in recommendations if rec['relevance_score'] < threshold]
    return jsonify({
        "query": query,
        "threshold": threshold,
        "high_recommendations": high_score,
        "low_recommendations": low_score,
        "total_count": len(recommendations),
        "high_count": len(high_score),
        "low_count": len(low_score)
    })

# Model loading runs at import time, as soon as this module is loaded.
print("βš™οΈ Initializing models...")
# The return value is not checked here; /api/health reports whether the
# intent model and tokenizer ended up loaded.
load_intent_resources()
recommender = BookRecommender()
# load_model() returns False on failure, in which case /api/recommend
# answers 503.
recommender_model_loaded = recommender.load_model()
print("βœ… All models attempted to load")