|
|
| import base64 |
| import traceback |
| import faiss |
| from fastapi import FastAPI, HTTPException |
| import requests |
| from pydantic import BaseModel |
| import numpy as np |
| import pandas as pd |
| import os |
|
|
| |
app = FastAPI()

# Hugging Face Inference API token, read from the environment (may be None,
# in which case the API will reject requests with 401).
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")

# Hosted feature-extraction endpoint for the all-MiniLM-L6-v2 sentence
# transformer, which produces 384-dimensional embeddings.
API_URL = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2"
HEADERS = {
    "Authorization": f"Bearer {HUGGINGFACE_API_KEY}",
    "Content-Type": "application/json; charset=UTF-8",
}

# Most recent embedding, served by /last-embedding. NOTE(review): nothing in
# this file ever assigns it after this initialisation, so that endpoint
# currently always returns 404 -- confirm which endpoint should update it.
global_embedding = None

# Prebuilt FAISS index over the news articles. The original filename
# "news_index.faissFF" appears to be a typo; corrected to "news_index.faiss"
# -- verify against the artifact actually shipped with the service.
index = faiss.read_index("news_index.faiss")
|
|
|
|
@app.get('/')
def home():
    """Simple health-check endpoint for the service root."""
    payload = {"Message": "Hello"}
    return payload
|
|
| |
|
|
| |
| |
|
|
| |
| |
|
|
| |
| |
| |
|
|
| |
| |
|
|
|
|
| |
| |
|
|
| |
class EmbeddingRequest(BaseModel):
    """Request body shared by the embedding endpoints."""

    # Raw text to embed (Arabic news text, judging by clean_arabic_text below).
    text: str
|
|
| |
def get_embedding(text: str):
    """Fetch a sentence embedding for *text* from the Hugging Face API.

    Returns the decoded JSON response (a list of 384 floats for a single
    input string). Raises HTTPException on a non-200 response or on any
    transport-level failure.
    """
    try:
        response = requests.post(
            API_URL,
            headers=HEADERS,
            json={"inputs": text},
            timeout=30,  # never hang the endpoint on a stalled upstream API
        )
    except requests.RequestException as e:
        # Transport failure: DNS, connection reset, timeout, ...
        raise HTTPException(status_code=500, detail=str(e))

    if response.status_code != 200:
        # Error bodies are not guaranteed to be JSON (e.g. HTML gateway
        # errors); fall back to the raw text rather than raising a second,
        # unrelated decode error.
        try:
            detail = response.json()
        except ValueError:
            detail = response.text
        raise HTTPException(status_code=response.status_code, detail=detail)

    return response.json()
|
|
|
|
|
|
# Startup sanity log: how many vectors the loaded FAISS index contains.
print(f"FAISS index size: {index.ntotal}")

# Article metadata; row positions are used as FAISS ids by the endpoints
# below -- assumes CSV row order matches the order vectors were added to the
# index. TODO confirm against the index-building script.
news_df = pd.read_csv("news_dataset.csv")
|
|
|
|
|
|
|
|
|
|
@app.post("/get_Emd_Corrected")
async def generate_embedding(request: EmbeddingRequest):
    """Embed the request text and return the 10 nearest news articles.

    Response contains the raw embedding, FAISS distances/indices, and the
    matched dataframe rows; failures are returned as an ``{"error": ...}``
    payload rather than an HTTP error.
    """
    try:
        embedding = np.array(get_embedding(request.text), dtype="float32")

        # all-MiniLM-L6-v2 produces 384-dimensional vectors.
        if embedding.shape[0] != 384:
            return {"error": f"Expected embedding of size 384, got {embedding.shape[0]}"}

        # FAISS expects a (n_queries, dim) matrix.
        embedding_query = embedding.reshape(1, -1)

        if index is None:
            return {"error": "FAISS index not loaded"}

        k = 10
        distances, indices = index.search(embedding_query, k)

        results = []
        for i, idx in enumerate(indices[0]):
            # FAISS pads the result with -1 when fewer than k neighbours
            # exist; the original `idx < len(news_df)` check let -1 through
            # and silently returned the *last* dataframe row, so the lower
            # bound must be guarded too.
            if 0 <= idx < len(news_df):
                article = news_df.iloc[idx].to_dict()
                article["distance"] = float(distances[0][i])
                results.append(article)

        return {
            "embedding": embedding.tolist(),
            "Distances": distances.tolist(),
            "Indices": indices.tolist(),
            "results": results,
        }

    except Exception as e:
        # Debug-friendly: surface the traceback to the caller.
        return {"error": str(e), "traceback": traceback.format_exc()}
|
|
| import re |
|
|
def clean_arabic_text(text):
    """Strip characters that break JSON decoding.

    Removes ASCII control characters (0x00-0x1F), DEL (0x7F), the
    pop-directional-formatting mark (U+202C) and the BOM (U+FEFF), then
    trims surrounding whitespace.
    """
    pattern = re.compile(r"[\x00-\x1F\x7F\u202c\ufeff]")
    return pattern.sub("", text).strip()
|
|
|
|
@app.post("/get_Emd_Data")
async def generate_embedding_data(request: EmbeddingRequest):
    """Clean and embed the request text, returning the 10 nearest articles.

    Renamed from ``generate_embedding``: the module previously defined two
    functions with that name and the second silently shadowed the first at
    module level (routes still registered, but a redefinition lint error).
    The route path is unchanged.
    """
    try:
        request.text = clean_arabic_text(request.text)

        # NOTE(review): the text is base64-encoded before being sent to the
        # embedding API, so the model embeds the base64 string rather than
        # the original text -- presumably a workaround for encoding errors,
        # but clean_arabic_text() may already be sufficient. Confirm this is
        # intentional before relying on these search results.
        encoded_text = base64.b64encode(request.text.encode()).decode()

        embedding = np.array(get_embedding(encoded_text), dtype="float32")

        # all-MiniLM-L6-v2 produces 384-dimensional vectors.
        if embedding.shape[0] != 384:
            return {"error": f"Expected embedding of size 384, got {embedding.shape[0]}"}

        embedding_query = embedding.reshape(1, -1)

        if index is None:
            return {"error": "FAISS index not loaded"}

        k = 10
        distances, indices = index.search(embedding_query, k)

        results = []
        for i, idx in enumerate(indices[0]):
            # Guard against FAISS's -1 padding for missing neighbours, which
            # a bare `idx < len(news_df)` would wrongly accept.
            if 0 <= idx < len(news_df):
                article = news_df.iloc[idx].to_dict()
                article["distance"] = float(distances[0][i])
                results.append(article)

        return {"results": results}

    except Exception as e:
        return {"error": str(e), "traceback": traceback.format_exc()}
|
|
|
|
|
|
| |
@app.get("/last-embedding")
async def get_last_embedding():
    """Return the most recently stored embedding, or 404 if none exists.

    NOTE(review): nothing in this file ever assigns ``global_embedding``
    after its ``None`` initialisation, so as written this endpoint always
    responds 404 -- confirm which endpoint is supposed to update it.
    """
    if global_embedding is None:
        raise HTTPException(status_code=404, detail="No embedding stored yet")
    return {"last_embedding": global_embedding}
|
|
|
|
|
|