# Novel similarity search API (FastAPI app deployed on Hugging Face Spaces).
import os
import random

import faiss
import pandas as pd
from fastapi import FastAPI, HTTPException
from sentence_transformers import SentenceTransformer
app = FastAPI()

# Sentence embedding model used to turn novel keywords into vectors; loaded
# once at import so every request reuses it.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Prebuilt FAISS similarity index over the novel keyword embeddings.
faiss_index_path = "novel_index.index"
if os.path.exists(faiss_index_path):
    faiss_index = faiss.read_index(faiss_index_path)
else:
    # Startup failure: HTTPException is only meaningful inside request
    # handlers — at import time it would surface as a confusing unhandled
    # exception. RuntimeError makes the crash reason explicit.
    raise RuntimeError("Failed to load FAISS index.")

# Novel metadata; later code reads the columns name, keyword, genre, tag,
# image_url and source.
csv_path = "novel_df4.csv"
if os.path.exists(csv_path):
    df = pd.read_csv(csv_path)
else:
    raise RuntimeError("Failed to load CSV file.")
def find_novel_keywords(novel_name, df):
    """Look up the stored keyword string for a novel.

    Matches ``novel_name`` as a case-insensitive substring of the ``name``
    column and returns the ``keyword`` value of the first matching row, or
    ``None`` when no row matches.
    """
    mask = df['name'].str.contains(novel_name, case=False, na=False)
    matches = df.loc[mask, 'keyword']
    return matches.iloc[0] if not matches.empty else None
def encode_and_search(keywords, index, model, k=10):
    """Embed ``keywords`` and return the indices of the k nearest vectors.

    ``None`` becomes an empty query and non-string inputs are coerced with
    ``str()`` so the encoder always receives text. Returns the raw FAISS
    index array of shape (1, k).
    """
    if keywords is None:
        text = ""
    else:
        text = keywords if isinstance(keywords, str) else str(keywords)
    embedding = model.encode([text])[0].reshape(1, -1)
    _, neighbor_ids = index.search(embedding, k)
    return neighbor_ids
def process_search_results(indices, df, novel_name):
    """Convert raw FAISS indices into a shuffled list of unique novel records.

    Drops the queried novel itself (case-insensitive name match), removes
    duplicate names, splits the comma-separated ``genre`` and ``tag`` columns
    into lists, and shuffles the final records.

    Returns a list of dicts with keys: name, genre, tag, image_url, source.
    """
    # FAISS pads its result with -1 when the index holds fewer than k
    # vectors; keep only valid positions (df.iloc[-1] would otherwise
    # silently select the LAST row).
    positions = [i for i in indices.ravel() if 0 <= i < len(df)]
    if not positions:
        return []
    # Work on a copy so the column rewrites below don't warn about (or write
    # through) a view of `df`.
    result_df = df.iloc[positions].copy()
    result_df = result_df[result_df['name'].str.lower() != novel_name.lower()]
    if result_df.empty:
        return []
    result_df['genre'] = result_df['genre'].apply(lambda x: x.split(',') if isinstance(x, str) else [])
    result_df['tag'] = result_df['tag'].apply(lambda x: x.split(',') if isinstance(x, str) else [])
    results = result_df[['name', 'genre', 'tag', 'image_url', 'source']].to_dict(orient='records')
    # Keep only the first record seen per name.
    unique_results = []
    seen_names = set()
    for record in results:
        if record['name'] not in seen_names:
            seen_names.add(record['name'])
            unique_results.append(record)
    random.shuffle(unique_results)
    return unique_results
def search_similar_by_novel_name(novel_name, index, df, model, k=10):
    """Full similarity pipeline for one novel.

    Looks up the novel's stored keywords, searches the FAISS index with
    their embedding, and formats the matching rows into result records.
    """
    novel_keywords = find_novel_keywords(novel_name, df)
    neighbor_ids = encode_and_search(novel_keywords, index, model, k)
    return process_search_results(neighbor_ids, df, novel_name)
# NOTE(review): no @app.get route decorator is visible on this handler —
# presumably lost in extraction or registered elsewhere; confirm the endpoint
# is actually wired into `app`.
async def search_similar_novels(novel_name: str):
    """Return up to 30 novels similar to ``novel_name``.

    Raises:
        HTTPException(404): novel unknown or no similar novels found.
        HTTPException(500): any unexpected failure in the search pipeline.
    """
    try:
        similar_novels = search_similar_by_novel_name(novel_name, faiss_index, df, model, 30)
    except HTTPException:
        raise
    except Exception as e:
        # Chain the original error so the real cause isn't swallowed.
        raise HTTPException(status_code=500, detail="Failed to search for similar novels.") from e
    if not similar_novels:
        raise HTTPException(status_code=404, detail="Novel not found or no similar novels found.")
    return {"similar_novels": similar_novels}