import json
import logging
import os
from operator import itemgetter

from flask import Flask, request, jsonify
from flask_cors import CORS
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer


# ---------------------------------------------------------------------------
# Module-level configuration and shared clients
# ---------------------------------------------------------------------------

# The Pinecone API key is read from the process environment.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')

# Flask application with CORS enabled on it.
app = Flask(__name__)
CORS(app)

# Root logging configuration plus this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Fail at import time when the key is missing or empty.
if not PINECONE_API_KEY:
    raise ValueError("PINECONE_API_KEY environment variable is required")

# Pinecone client and the name of the index queried by this service.
pc = Pinecone(api_key=PINECONE_API_KEY)
INDEX_NAME = "budget-proposals-optimized"

# Sentence-embedding model used to encode search queries.
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


def load_dynamic_metadata(path="dynamic_metadata.json"):
    """Load the proposal metadata mapping from a JSON file.

    Args:
        path: Location of the JSON file. Defaults to ``dynamic_metadata.json``
            relative to the current working directory.

    Returns:
        The decoded JSON object when the file exists and its top-level value
        is a JSON object. An empty dict is returned when the file is missing,
        cannot be read or decoded, or holds any other top-level value.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        # A missing file is not treated as an error: there is simply no
        # metadata to load.
        return {}
    except (OSError, ValueError) as e:
        # ValueError covers both malformed JSON and undecodable bytes.
        logger.error(f"Error loading dynamic metadata: {e}")
        return {}

    if not isinstance(data, dict):
        # Callers use dict methods (.get / .items) on the result, so any
        # other top-level JSON value is rejected here.
        logger.error(
            "Error loading dynamic metadata: expected a JSON object, "
            f"got {type(data).__name__}"
        )
        return {}

    return data


# Proposal metadata loaded once when the module is imported.
DYNAMIC_METADATA = load_dynamic_metadata()


def get_language_specific_data(proposal_data, field, language='en'):
    """Return ``field`` from ``proposal_data`` for the requested language.

    Args:
        proposal_data: Mapping of field names to values. A value is either a
            plain string or a dict keyed by language code. ``None`` or an
            empty mapping is treated as having no fields.
        field: Name of the field to read.
        language: Language code to look up in dict-valued fields.

    Returns:
        The plain string when the field is a string. For a dict-valued field,
        the entry for ``language``, falling back to the ``'en'`` entry when
        the requested entry is missing or empty. ``''`` when neither exists
        or the field is absent or of another type.
    """
    value = proposal_data.get(field) if proposal_data else None

    if isinstance(value, str):
        return value

    if isinstance(value, dict):
        # `or` is used instead of a .get() default so that a translation
        # that is present but empty/None also falls back to English.
        return value.get(language) or value.get('en') or ''

    return ''


def get_pinecone_index():
    """Return a handle to the ``INDEX_NAME`` Pinecone index.

    Returns:
        The object produced by ``pc.Index(INDEX_NAME)``, or ``None`` when
        that call raises (the error is logged).
    """
    try:
        index = pc.Index(INDEX_NAME)
    except Exception as e:
        logger.error(f"Error accessing Pinecone index: {e}")
        index = None
    return index


def semantic_search(query: str, top_k=1, category_filter=None, language='en'):
    """Perform semantic search on budget proposals with multi-language support.

    The query is embedded with ``embed_model`` and sent to the Pinecone index
    (50 nearest vectors, restricted to ``source == "budget_proposals"`` and
    optionally to one category). Matches are grouped by their ``file_path``
    metadata, keeping the best-scoring match per file, and the files are then
    cut off by a score threshold and a maximum count that both depend on the
    best score (see ``_adaptive_cutoff``).

    Args:
        query: Free-text search query.
        top_k: Accepted for interface compatibility. NOTE(review): it does
            not influence how many results are returned; the count is decided
            by the adaptive cutoff. Confirm whether it should act as a cap.
        category_filter: Category to restrict the search to. ``None``, an
            empty value, or ``"All categories"`` disables the restriction.
        language: Language code used to pick localized title, summary, cost
            and category values.

    Returns:
        A list of result dicts ordered by descending score. An empty list is
        returned when the index is unavailable, nothing matches, or any error
        occurs (the error is logged).
    """
    try:
        # Re-read the metadata file on every call, which picks up any change
        # made to it since the previous call.
        global DYNAMIC_METADATA
        DYNAMIC_METADATA = load_dynamic_metadata()

        pc_index = get_pinecone_index()
        if not pc_index:
            return []

        query_emb = embed_model.encode(query).tolist()

        filter_dict = {"source": "budget_proposals"}
        if category_filter and category_filter != "All categories":
            filter_dict["category"] = category_filter

        res = pc_index.query(
            vector=query_emb,
            top_k=50,
            include_metadata=True,
            filter=filter_dict,
        )

        # One pass over the matches: remember, per file, both the best score
        # and the match that produced it, so the reported id/content always
        # belong to the chunk whose score is reported.
        best_scores = {}
        best_matches = {}
        for match in res["matches"]:
            score = match["score"]
            file_path = match["metadata"].get("file_path", "")
            if file_path not in best_scores or score > best_scores[file_path]:
                best_scores[file_path] = score
                best_matches[file_path] = match

        if not best_scores:
            return []

        sorted_docs = sorted(best_scores.items(), key=itemgetter(1), reverse=True)
        threshold, max_docs = _adaptive_cutoff(sorted_docs[0][1])

        results = []
        for file_path, score in sorted_docs[:max_docs]:
            if score < threshold:
                # Scores are descending, so every later file is below too.
                break

            match = best_matches[file_path]
            metadata = match["metadata"]

            # Prefer the entry from the metadata file; fall back to the
            # metadata stored alongside the vector.
            proposal_data = DYNAMIC_METADATA.get(file_path, {
                "title": metadata.get("title", "Unknown Title"),
                "summary": metadata.get("summary", ""),
                "category": metadata.get("category", "Budget Proposal"),
                "costLKR": metadata.get("costLKR", "No Costing Available"),
            })
            thumb_url = metadata.get("thumbUrl", "")

            results.append({
                "title": get_language_specific_data(proposal_data, "title", language),
                "summary": get_language_specific_data(proposal_data, "summary", language),
                "costLKR": get_language_specific_data(proposal_data, "costLKR", language),
                "category": get_language_specific_data(proposal_data, "category", language),
                "pdfUrl": f"assets/pdfs/{file_path}" if file_path else "",
                "thumbUrl": f"assets/thumbs/{thumb_url}" if thumb_url else "",
                "score": score,
                "relevance_percentage": int(score * 100),
                "file_path": file_path,
                "id": match["id"],
                "content": metadata.get("content", ""),
            })

        return results

    except Exception as e:
        # logger.exception keeps the traceback, which the message alone loses.
        logger.exception(f"Search error: {e}")
        return []


def _adaptive_cutoff(max_score):
    """Map the best match score to a ``(score_threshold, max_docs)`` pair."""
    if max_score > 0.6:
        return max_score * 0.8, 2
    if max_score > 0.3:
        return max_score * 0.7, 3
    return max_score * 0.5, 5


def get_all_proposals(category_filter=None, language='en'):
    """Get all budget proposals with multi-language support.

    The index is queried with a constant 384-value vector and ``top_k=100``,
    so at most 100 matches are considered. Only the first match seen for each
    ``file_path`` is used, and entries without a usable title are left out.

    Args:
        category_filter: Category to restrict the listing to. ``None``, an
            empty value, or ``"All categories"`` disables the restriction.
        language: Language code used to pick localized field values.

    Returns:
        A list of proposal dicts, each with ``score`` 1.0 and
        ``relevance_percentage`` 100. An empty list is returned when the
        index is unavailable or any error occurs (the error is logged).
    """
    try:
        # Refresh the module-level metadata from disk before building results.
        global DYNAMIC_METADATA
        DYNAMIC_METADATA = load_dynamic_metadata()

        pc_index = get_pinecone_index()
        if not pc_index:
            logger.warning("Pinecone index not available, returning empty list")
            return []

        filter_dict = {"source": "budget_proposals"}
        if category_filter and category_filter != "All categories":
            filter_dict["category"] = category_filter

        # Placeholder query vector; the scores it produces are not used.
        dummy_vector = [0.1] * 384
        res = pc_index.query(
            vector=dummy_vector,
            top_k=100,
            include_metadata=True,
            filter=filter_dict,
        )

        logger.info(f"Query returned {len(res['matches'])} matches")

        proposals = []
        seen_files = set()

        for hit in res["matches"]:
            hit_meta = hit["metadata"]
            file_path = hit_meta.get("file_path", "")

            # Keep only the first match for each file.
            if file_path in seen_files:
                continue
            seen_files.add(file_path)

            # Entry from the metadata file, else the vector's own metadata.
            fallback = {
                "title": hit_meta.get("title", "Unknown Title"),
                "summary": hit_meta.get("summary", ""),
                "category": hit_meta.get("category", "Budget Proposal"),
                "costLKR": hit_meta.get("costLKR", "No Costing Available"),
            }
            proposal_data = DYNAMIC_METADATA.get(file_path, fallback)

            title = get_language_specific_data(proposal_data, "title", language)
            summary = get_language_specific_data(proposal_data, "summary", language)
            costLKR = get_language_specific_data(proposal_data, "costLKR", language)
            category = get_language_specific_data(proposal_data, "category", language)
            thumb_url = hit_meta.get("thumbUrl", "")

            # Skip entries whose title is empty, blank, or a placeholder.
            if not (title and title.strip() and title not in ["Unknown", "Unknown Title"]):
                continue

            proposals.append({
                "title": title,
                "summary": summary,
                "costLKR": costLKR,
                "category": category,
                "pdfUrl": f"assets/pdfs/{file_path}" if file_path else "",
                "thumbUrl": f"assets/thumbs/{thumb_url}" if thumb_url else "",
                "score": 1.0,
                "relevance_percentage": 100,
                "file_path": file_path,
                "id": hit["id"],
            })

        return proposals

    except Exception as e:
        logger.error(f"Error getting all proposals: {e}")
        return []


@app.route('/api/search', methods=['POST'])
def search_proposals():
    """API endpoint for searching budget proposals with multi-language support.

    Expects a JSON object body with optional keys ``query``, ``top_k``
    (default 10), ``category_filter`` and ``language`` (default ``'en'``).
    An empty or missing query returns all proposals instead of searching.

    Returns:
        200 with ``query``, ``results``, ``total_results``,
        ``category_filter`` and ``language``; 400 when the body is not a JSON
        object or ``query`` is not a string; 500 on unexpected errors.
    """
    try:
        # silent=True yields None instead of raising on a missing, malformed
        # or wrongly-typed body, so the client gets a 400 rather than a 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({"error": "Request body must be a JSON object"}), 400

        # A JSON null query is treated the same as an absent one.
        query = data.get('query') or ''
        if not isinstance(query, str):
            return jsonify({"error": "'query' must be a string"}), 400
        query = query.strip()

        top_k = data.get('top_k', 10)
        category_filter = data.get('category_filter')
        language = data.get('language', 'en')

        if not query:
            results = get_all_proposals(category_filter, language)
        else:
            results = semantic_search(query, top_k, category_filter, language)

        return jsonify({
            "query": query,
            "results": results,
            "total_results": len(results),
            "category_filter": category_filter,
            "language": language,
        })

    except Exception as e:
        logger.error(f"API error: {e}")
        return jsonify({"error": str(e)}), 500


@app.route('/api/search', methods=['GET'])
def search_proposals_get():
    """API endpoint for searching proposals (GET method) with multi-language support.

    Reads ``query``, ``top_k`` (default 10), ``category_filter`` and
    ``language`` (default ``'en'``) from the query string. An empty or
    missing query returns all proposals instead of searching.

    Returns:
        200 with ``query``, ``results``, ``total_results``,
        ``category_filter`` and ``language``; 400 when ``top_k`` is not an
        integer; 500 on unexpected errors.
    """
    try:
        query = request.args.get('query', '').strip()

        # A non-numeric top_k is a client error, not a server failure.
        try:
            top_k = int(request.args.get('top_k', 10))
        except (TypeError, ValueError):
            return jsonify({"error": "'top_k' must be an integer"}), 400

        category_filter = request.args.get('category_filter')
        language = request.args.get('language', 'en')

        if not query:
            results = get_all_proposals(category_filter, language)
        else:
            results = semantic_search(query, top_k, category_filter, language)

        return jsonify({
            "query": query,
            "results": results,
            "total_results": len(results),
            "category_filter": category_filter,
            "language": language,
        })

    except Exception as e:
        logger.error(f"API error: {e}")
        return jsonify({"error": str(e)}), 500


@app.route('/api/proposals', methods=['GET'])
def get_proposals():
    """List budget proposals, optionally narrowed to one category.

    Reads ``category_filter`` and ``language`` (default ``'en'``) from the
    query string and returns the proposals with their count. Unexpected
    errors are logged and reported with status 500.
    """
    try:
        args = request.args
        category_filter = args.get('category_filter')
        language = args.get('language', 'en')

        proposals = get_all_proposals(category_filter, language)
        payload = {
            "results": proposals,
            "total_results": len(proposals),
            "category_filter": category_filter,
            "language": language,
        }
        return jsonify(payload)

    except Exception as e:
        logger.error(f"API error: {e}")
        return jsonify({"error": str(e)}), 500


@app.route('/api/categories', methods=['GET'])
def get_categories():
    """Get all available categories.

    Categories are collected from ``DYNAMIC_METADATA``; when that yields
    none, they are collected from ``get_all_proposals()`` instead.

    Returns:
        200 with ``categories`` as a sorted list of strings; 500 on
        unexpected errors.
    """
    try:
        categories = set()
        for metadata in DYNAMIC_METADATA.values():
            if not isinstance(metadata, dict):
                continue
            # A category may be a plain string or a per-language dict. A dict
            # cannot be added to a set, so it is resolved to a single string
            # (the helper's default language) first.
            category = get_language_specific_data(metadata, "category")
            if category and isinstance(category, str):
                categories.add(category)

        # Nothing usable in the metadata file: fall back to the index.
        if not categories:
            for proposal in get_all_proposals():
                category = proposal.get("category")
                if category and isinstance(category, str):
                    categories.add(category)

        return jsonify({
            "categories": sorted(categories)
        })

    except Exception as e:
        logger.error(f"API error: {e}")
        return jsonify({"error": str(e)}), 500


@app.route('/api/health', methods=['GET'])
def health_check():
    """Health check endpoint.

    Returns:
        200 with status ``"healthy"`` and the index statistics when the
        index handle is available and ``describe_index_stats()`` succeeds;
        500 with status ``"unhealthy"`` otherwise.
    """
    try:
        pc_index = get_pinecone_index()
        if not pc_index:
            return jsonify({
                "status": "unhealthy",
                "message": "Cannot connect to Pinecone index"
            }), 500

        stats = pc_index.describe_index_stats()
        return jsonify({
            "status": "healthy",
            "message": "Budget proposals semantic search API is running",
            "index_stats": {
                "total_vector_count": stats.total_vector_count,
                "dimension": stats.dimension,
                "index_fullness": stats.index_fullness
            }
        })

    except Exception as e:
        # Log the failure like the other endpoints do, so an unhealthy
        # response can be traced in the server logs.
        logger.error(f"API error: {e}")
        return jsonify({
            "status": "unhealthy",
            "message": f"Error: {str(e)}"
        }), 500


@app.route('/api/stats', methods=['GET'])
def get_stats():
    """Get index statistics.

    Returns:
        200 with ``total_vector_count``, ``dimension`` and ``index_fullness``
        taken from ``describe_index_stats()``; 500 with an ``error`` message
        when the index handle is unavailable or the call fails.
    """
    try:
        pc_index = get_pinecone_index()
        if not pc_index:
            return jsonify({"error": "Cannot connect to Pinecone index"}), 500

        stats = pc_index.describe_index_stats()
        return jsonify({
            "total_vector_count": stats.total_vector_count,
            "dimension": stats.dimension,
            "index_fullness": stats.index_fullness
        })

    except Exception as e:
        # Log the failure like the other endpoints do instead of only
        # returning it to the client.
        logger.error(f"API error: {e}")
        return jsonify({"error": str(e)}), 500


@app.route('/', methods=['GET'])
def home():
    """Describe the API: name, version, available endpoints and status."""
    endpoints = {
        "POST /api/search": "Search proposals with JSON body",
        "GET /api/search?query=<search_term>": "Search proposals with query parameter",
        "GET /api/proposals": "Get all proposals",
        "GET /api/categories": "Get all categories",
        "GET /api/health": "Health check",
        "GET /api/stats": "Index statistics",
    }
    payload = {
        "message": "Budget Proposals Semantic Search API",
        "version": "1.0.0",
        "endpoints": endpoints,
        "status": "running",
    }
    return jsonify(payload)


if __name__ == '__main__':
    # Start Flask's built-in server on every interface, port 7860, with
    # debug mode off.
    app.run(host='0.0.0.0', port=7860, debug=False)