Spaces:
Running
Running
# TAB: VIEW PPT.py
import requests | |
from sentence_transformers import SentenceTransformer, CrossEncoder | |
from sklearn.metrics.pairwise import cosine_similarity | |
import os | |
import shutil | |
import gradio as gr | |
# Directory (relative to the current working directory) that holds files
# downloaded from OneDrive, so later requests can reuse them.
LOCAL_CACHE_DIR = "local_cache"
# Created at import time; exist_ok=True makes re-imports and restarts harmless.
os.makedirs(LOCAL_CACHE_DIR, exist_ok=True)
def download_file_from_onedrive(file_path, file_id, headers, timeout=60):
    """Return a local cached copy of a OneDrive file, downloading it if needed.

    The cache entry is ``LOCAL_CACHE_DIR/<basename of file_path>``. If that file
    already exists it is returned as-is and no request is made.

    Args:
        file_path: Path of the file; only its base name is used to name the
            cache entry, and the full value appears in log/error messages.
        file_id: Drive item id inserted into the Microsoft Graph content URL.
        headers: HTTP headers passed to ``requests.get`` (e.g. authorization).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        Path of the cached file on local disk.

    Raises:
        ValueError: If the server answers with a status code other than 200.
    """
    # NOTE: the cache key is the base name only, so two remote files that share
    # a base name map to the same cache entry.
    local_file_path = os.path.join(LOCAL_CACHE_DIR, os.path.basename(file_path))
    if os.path.exists(local_file_path):  # Avoid re-downloading
        return local_file_path

    download_url = f"https://graph.microsoft.com/v1.0/me/drive/items/{file_id}/content"
    response = requests.get(download_url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise ValueError(f"Failed to download file {file_path}. Error: {response.text}")

    # The cache directory may have been removed since import time.
    os.makedirs(LOCAL_CACHE_DIR, exist_ok=True)

    # Write to a temporary sibling and rename it into place: an interrupted
    # write must never leave a truncated file under the final name, because the
    # existence check above would then treat it as a valid cache hit.
    partial_path = local_file_path + ".part"
    try:
        with open(partial_path, "wb") as f:
            f.write(response.content)
        os.replace(partial_path, local_file_path)
    finally:
        # Only present here if the write or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print(f"β Downloaded: {file_path} -> {local_file_path}")
    return local_file_path
# Number of result rows this function fills; each row consumes five updates
# (row container, image, details, summary, download link).
_MAX_RESULT_ROWS = 20
# How many of the best embedding matches are handed to the cross-encoder.
_RERANK_CANDIDATES = 50


def search_ppts(query, num_results):
    """Search the slide index for presentations matching *query*.

    Steps: embed the query, score every row whose ``Unique_Slide_ID`` ends with
    ``"slide_1"`` by cosine similarity against its ``Short_Summary_Embedding``,
    rerank the best ``_RERANK_CANDIDATES`` with the cross-encoder, then download
    the thumbnail and the full PPT for the top ``num_results`` hits.

    Relies on module-level ``df``, ``embedding_model``, ``cross_encoder`` and
    ``headers``, which are expected to be defined elsewhere in this file.

    Args:
        query: Search text.
        num_results: How many results to show. Coerced to ``int`` and clamped
            to ``0.._MAX_RESULT_ROWS``; ``None`` means "as many as fit".

    Returns:
        A flat list of ``_MAX_RESULT_ROWS * 5`` ``gr.update`` objects: five per
        row, visible and populated for each result, hidden for unused rows.
    """
    import html  # local import: only needed to escape metadata for the HTML cell

    # UI widgets may deliver floats, which DataFrame.head() rejects; a negative
    # value would make head() return "all but the last n" rows. Results beyond
    # _MAX_RESULT_ROWS could never be displayed, so downloading them is waste.
    if num_results is None:
        limit = _MAX_RESULT_ROWS
    else:
        limit = max(0, min(int(num_results), _MAX_RESULT_ROWS))

    gr.Info("Searching the relevant PPTs .")

    # Keep only rows whose Unique_Slide_ID ends with "slide_1". Copy so that
    # adding columns below does not write into a slice of the shared frame.
    first_slides = df[df['Unique_Slide_ID'].str.endswith("slide_1", na=False)].copy()

    results = []
    if not first_slides.empty:
        # Generate query embedding
        query_embedding = embedding_model.encode(query).tolist()

        # NOTE(review): eval() executes whatever text is stored in the
        # embedding column, so this is only safe while that data is fully
        # trusted. If the values are plain list literals, ast.literal_eval or
        # json.loads would be a safer parser - confirm the stored format
        # before switching.
        first_slides['similarity'] = first_slides['Short_Summary_Embedding'].apply(
            lambda x: cosine_similarity([query_embedding], [eval(x)])[0][0]
        )

        # Best embedding matches first; keep a bounded pool for reranking.
        ranked = first_slides.sort_values(by='similarity', ascending=False)
        top_results = ranked.head(_RERANK_CANDIDATES).copy()

        # Prepare (query, summary) pairs for cross-encoder reranking
        pairs = [(query, summary) for summary in top_results['Short_Summary']]

        gr.Info("Doing Semantic Reranking for most appropriate results ")
        top_results['rerank_score'] = cross_encoder.predict(pairs)
        top_results = top_results.sort_values(by='rerank_score', ascending=False)
        print(top_results)

        gr.Info('Downloading PPT images and ppt')
        print('Downloading PPT images and ppt')
        for _, row in top_results.head(limit).iterrows():
            # Download slide image locally
            slide_image_path = download_file_from_onedrive(
                row['Thumbnail_File_Path'], row['Thumbnail_File_ID'], headers
            )
            # Download full PPT locally
            ppt_download_link = download_file_from_onedrive(
                row['Full_PPT_File_Path'], row['Full_PPT_File_ID'], headers
            )
            results.append({
                "image": slide_image_path,
                "title": row['Suitable_Title'],
                "owner": row['PPT_Owner'],
                "category": row['Slide_Category'],
                "summary": row['Short_Summary'],
                "download_link": ppt_download_link,
            })
        print("downloading complete ")

    # Build the component updates: populated rows first, hidden rows after.
    row_updates = []
    for result in results:
        # Escape metadata so characters such as "<" or "&" in a title cannot
        # break (or inject into) the HTML built here.
        details = (
            f"<b>Title:</b> {html.escape(str(result['title']))}<br>"
            f"<b>Owner:</b> {html.escape(str(result['owner']))}<br>"
            f"<b>Category:</b> {html.escape(str(result['category']))}"
        )
        row_updates.extend([
            gr.update(visible=True),  # make the row itself visible
            gr.update(value=result["image"], visible=True),
            gr.update(value=details, visible=True),
            gr.update(value=result["summary"], visible=True),
            gr.update(value=result["download_link"], visible=True),
        ])
    for _ in range(_MAX_RESULT_ROWS - len(results)):
        row_updates.extend([gr.update(visible=False)] * 5)  # row + 4 components
    return row_updates