# NCTC_SlideFinder / view_ppt.py
# TAB: View PPT
import requests
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity
import os
import shutil
import gradio as gr
# Local cache directory for downloaded files
LOCAL_CACHE_DIR = "local_cache"
os.makedirs(LOCAL_CACHE_DIR, exist_ok=True)
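
# Assumed globals: this tab uses embedding_model, cross_encoder, df and headers,
# which the rest of the app is expected to provide. The block below is a minimal
# sketch of that setup so the module reads as self-contained; the model names,
# the metadata file name ("slides_metadata.csv") and the GRAPH_ACCESS_TOKEN
# environment variable are assumptions, not confirmed app configuration.
import pandas as pd

# Bi-encoder for query/summary embeddings and cross-encoder for reranking
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# Slide metadata: one row per slide, with Unique_Slide_ID, Short_Summary,
# Short_Summary_Embedding (stringified list), thumbnail and PPT file paths/IDs
df = pd.read_csv("slides_metadata.csv")

# Microsoft Graph authorization header used for all OneDrive downloads
headers = {"Authorization": f"Bearer {os.environ.get('GRAPH_ACCESS_TOKEN', '')}"}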
# Function to download a file from OneDrive to the local cache
def download_file_from_onedrive(file_path, file_id, headers):
    local_file_path = os.path.join(LOCAL_CACHE_DIR, os.path.basename(file_path))
    if not os.path.exists(local_file_path):  # Avoid re-downloading
        download_url = f"https://graph.microsoft.com/v1.0/me/drive/items/{file_id}/content"
        response = requests.get(download_url, headers=headers)
        if response.status_code != 200:
            raise ValueError(f"Failed to download file {file_path}. Error: {response.text}")
        with open(local_file_path, "wb") as f:
            f.write(response.content)
        print(f"✅ Downloaded: {file_path} -> {local_file_path}")
    return local_file_path
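
# Usage sketch for the downloader above. The item path and ID are hypothetical
# placeholders, and _example_download itself is not part of the app; it only
# illustrates the expected call shape with a Microsoft Graph bearer-token header
# and is never invoked.
def _example_download(access_token):
    demo_headers = {"Authorization": f"Bearer {access_token}"}
    # Returns the local cache path of the downloaded thumbnail
    return download_file_from_onedrive(
        "Thumbnails/deck_01_slide_1.png",  # display path (only the basename is used for caching)
        "01PLACEHOLDERITEMID",             # OneDrive item ID (placeholder)
        demo_headers,
    )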
# Function to search PPTs
def search_ppts(query, num_results):
    global df
    gr.Info("Searching for relevant PPTs...")
    # Generate the query embedding
    query_embedding = embedding_model.encode(query).tolist()
    # Keep only the first slide of each deck (Unique_Slide_ID ending in "slide_1")
    df1 = df[df['Unique_Slide_ID'].str.endswith("slide_1", na=False)].copy()
    # Compute cosine similarity between the query and each stored summary embedding
    # (embeddings are stored as stringified lists, hence the eval)
    df1['similarity'] = df1['Short_Summary_Embedding'].apply(
        lambda x: cosine_similarity([query_embedding], [eval(x)])[0][0]
    )
    # Sort by cosine similarity score
    df1 = df1.sort_values(by='similarity', ascending=False)
    # Take the top candidates for cross-encoder reranking
    top_n = min(50, len(df1))  # at most 50 candidates are reranked
    top_results = df1.head(top_n)
    # Prepare (query, summary) input pairs for cross-encoder reranking
    pairs = [(query, row['Short_Summary']) for _, row in top_results.iterrows()]
    # Rerank using the cross-encoder
    gr.Info("Semantic reranking for the most relevant results...")
    rerank_scores = cross_encoder.predict(pairs)
    top_results = top_results.copy()  # Avoid SettingWithCopyWarning
    top_results['rerank_score'] = rerank_scores
    # Sort by rerank score
    top_results = top_results.sort_values(by='rerank_score', ascending=False)
    print(top_results)
    # Prepare results
    results = []
    gr.Info("Downloading slide thumbnails and full PPTs...")
    print("Downloading slide thumbnails and full PPTs...")
    for _, row in top_results.head(num_results).iterrows():
        # Download the slide thumbnail locally
        slide_image_path = download_file_from_onedrive(
            row['Thumbnail_File_Path'], row['Thumbnail_File_ID'], headers
        )
        # Download the full PPT locally
        ppt_download_link = download_file_from_onedrive(
            row['Full_PPT_File_Path'], row['Full_PPT_File_ID'], headers
        )
        title = row['Suitable_Title']
        owner = row['PPT_Owner']
        category = row['Slide_Category']
        summary = row['Short_Summary']
        results.append({
            "image": slide_image_path,
            "title": title,
            "owner": owner,
            "category": category,
            "summary": summary,
            "download_link": ppt_download_link
        })
    print("Downloading complete.")
    # Update visibility of the fixed grid of 20 result rows in the UI
    row_updates = []
    for i in range(20):
        if i < len(results):
            result = results[i]
            row_updates.extend([
                gr.update(visible=True),  # ✅ Make the row visible
                gr.update(value=result["image"], visible=True),
                gr.update(value=f"<b>Title:</b> {result['title']}<br><b>Owner:</b> {result['owner']}<br><b>Category:</b> {result['category']}", visible=True),
                gr.update(value=result["summary"], visible=True),
                gr.update(value=result["download_link"], visible=True),
            ])
        else:
            row_updates.extend([gr.update(visible=False)] * 5)  # row + 4 components
    return row_updates
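
# UI wiring sketch: search_ppts returns 5 gr.update() values per result row
# (row container, image, info HTML, summary, download link) for a fixed grid of
# 20 rows. Below is a minimal Blocks layout matching that ordering, assuming
# component choices (gr.Image / gr.HTML / gr.Textbox / gr.File), labels and the
# function name build_view_ppt_tab, none of which are confirmed by this file.
def build_view_ppt_tab():
    with gr.Blocks() as demo:
        query_box = gr.Textbox(label="Search query")
        num_results = gr.Slider(1, 20, value=5, step=1, label="Number of results")
        search_btn = gr.Button("Search")

        output_components = []
        for _ in range(20):
            with gr.Row(visible=False) as result_row:
                image = gr.Image(visible=False)
                info = gr.HTML(visible=False)
                summary = gr.Textbox(visible=False, label="Summary")
                download = gr.File(visible=False, label="Download PPT")
            # Order must match the per-row updates emitted by search_ppts:
            # row, image, info HTML, summary, download link.
            output_components.extend([result_row, image, info, summary, download])

        search_btn.click(search_ppts, inputs=[query_box, num_results],
                         outputs=output_components)
    return demo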