import os
import faiss
import numpy as np
from rank_bm25 import BM25Okapi
import torch
import pandas as pd
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, AutoTokenizer, AutoModel
# Set Hugging Face cache directory
os.environ["HUGGINGFACE_HUB_CACHE"] = "/tmp/huggingface"

app = FastAPI()

# Ensure the correct file path
DATASET_PATH = os.path.join(os.getcwd(), "springer_papers_DL.json")
if not os.path.exists(DATASET_PATH):
    raise FileNotFoundError(f"Dataset file not found at {DATASET_PATH}")

# Load dataset
df = pd.read_json(DATASET_PATH)
# Clean text function
def clean_text(text):
    return text.strip().lower()

df['cleaned_abstract'] = df['abstract'].apply(clean_text)

# Precompute BM25 index over the cleaned abstracts
tokenized_corpus = [paper.split() for paper in df["cleaned_abstract"]]
bm25 = BM25Okapi(tokenized_corpus)
# Load SciBERT tokenizer and model for computing embeddings
embedding_model = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(embedding_model)
model = AutoModel.from_pretrained(embedding_model)
# Generate embeddings using SciBERT
def generate_embeddings_sci_bert(texts, batch_size=32):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        # Mean-pool the token embeddings to get one fixed-size vector per text
        embeddings = outputs.last_hidden_state.mean(dim=1)
        all_embeddings.append(embeddings.cpu().numpy())
    return np.concatenate(all_embeddings, axis=0)
# Compute document embeddings
abstracts = df["cleaned_abstract"].tolist()
embeddings = generate_embeddings_sci_bert(abstracts, batch_size=32)

# Initialize FAISS index
dimension = embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(embeddings.astype(np.float32))
# API Request Model
class InputText(BaseModel):
    query: str
    top_k: int = 5
# Prediction endpoint (route path assumed from the handler name)
@app.post("/predict")
async def predict(data: InputText):
    query = data.query
    top_k = data.top_k
    if not query.strip():
        return {"error": "Query is empty. Please enter a valid search query."}

    # 1️⃣ Generate embedding for the query
    query_embedding = generate_embeddings_sci_bert([query], batch_size=1)

    # 2️⃣ Perform FAISS similarity search
    distances, indices = faiss_index.search(query_embedding.astype(np.float32), top_k)

    # 3️⃣ Perform BM25 keyword search
    tokenized_query = query.split()
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k]

    # 4️⃣ Combine FAISS and BM25 results (union of candidates, re-ranked by BM25 score)
    combined_indices = list(set(indices[0]) | set(bm25_top_indices))
    ranked_results = sorted(combined_indices, key=lambda idx: -bm25_scores[idx])

    # 5️⃣ Retrieve research papers
    relevant_papers = []
    for i, index in enumerate(ranked_results[:top_k]):
        paper = df.iloc[index]
        relevant_papers.append({
            "rank": i + 1,
            "title": paper["title"],
            "authors": paper["authors"],
            "abstract": paper["cleaned_abstract"]
        })

    return {"results": relevant_papers}
# Run FastAPI
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0")
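For reference, a minimal client-side sketch of how the hybrid-search endpoint could be called once the server is running. It assumes the handler is exposed at /predict (as registered above) and that uvicorn is listening on its default port 8000; the URL and the sample query are placeholders to adjust for the actual deployment.

# Hypothetical client call against the /predict endpoint
import requests

response = requests.post(
    "http://localhost:8000/predict",
    json={"query": "graph neural networks for drug discovery", "top_k": 3},
)
for paper in response.json().get("results", []):
    print(paper["rank"], paper["title"])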