# NCTC_SlideFinder / view_ppt.py
# TAB: View PPT
import requests
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity
import os
import shutil
import gradio as gr
# Local cache directory for downloaded files
LOCAL_CACHE_DIR = "local_cache"
os.makedirs(LOCAL_CACHE_DIR, exist_ok=True)
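
# Assumed globals: this tab uses embedding_model, cross_encoder, df and headers,
# which the rest of the app is expected to provide. The block below is a minimal
# sketch of that setup so the module reads as self-contained; the model names,
# the metadata file name ("slides_metadata.csv") and the GRAPH_ACCESS_TOKEN
# environment variable are assumptions, not confirmed app configuration.
import pandas as pd

# Bi-encoder for query/summary embeddings and cross-encoder for reranking
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# Slide metadata: one row per slide, with Unique_Slide_ID, Short_Summary,
# Short_Summary_Embedding (stringified list), thumbnail and PPT file paths/IDs
df = pd.read_csv("slides_metadata.csv")

# Microsoft Graph authorization header used for all OneDrive downloads
headers = {"Authorization": f"Bearer {os.environ.get('GRAPH_ACCESS_TOKEN', '')}"}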
# Function to download a file from OneDrive to the local cache
def download_file_from_onedrive(file_path, file_id, headers):
    local_file_path = os.path.join(LOCAL_CACHE_DIR, os.path.basename(file_path))
    if not os.path.exists(local_file_path):  # Avoid re-downloading
        download_url = f"https://graph.microsoft.com/v1.0/me/drive/items/{file_id}/content"
        response = requests.get(download_url, headers=headers)
        if response.status_code != 200:
            raise ValueError(f"Failed to download file {file_path}. Error: {response.text}")
        with open(local_file_path, "wb") as f:
            f.write(response.content)
        print(f"✅ Downloaded: {file_path} -> {local_file_path}")
    return local_file_path
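
# Usage sketch for the downloader above. The item path and ID are hypothetical
# placeholders, and _example_download itself is not part of the app; it only
# illustrates the expected call shape with a Microsoft Graph bearer-token header
# and is never invoked.
def _example_download(access_token):
    demo_headers = {"Authorization": f"Bearer {access_token}"}
    # Returns the local cache path of the downloaded thumbnail
    return download_file_from_onedrive(
        "Thumbnails/deck_01_slide_1.png",  # display path (only the basename is used for caching)
        "01PLACEHOLDERITEMID",             # OneDrive item ID (placeholder)
        demo_headers,
    )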
# Function to search PPTs
def search_ppts(query, num_results):
    global df
    gr.Info("Searching for relevant PPTs...")
    # Generate the query embedding
    query_embedding = embedding_model.encode(query).tolist()
    # Keep only the first slide of each deck (Unique_Slide_ID ending in "slide_1")
    df1 = df[df['Unique_Slide_ID'].str.endswith("slide_1", na=False)].copy()
    # Compute cosine similarity between the query and each stored summary embedding
    # (embeddings are stored as stringified lists, hence the eval)
    df1['similarity'] = df1['Short_Summary_Embedding'].apply(
        lambda x: cosine_similarity([query_embedding], [eval(x)])[0][0]
    )
    # Sort by cosine similarity score
    df1 = df1.sort_values(by='similarity', ascending=False)
    # Take the top candidates for cross-encoder reranking
    top_n = min(50, len(df1))  # at most 50 candidates are reranked
    top_results = df1.head(top_n)
    # Prepare (query, summary) input pairs for cross-encoder reranking
    pairs = [(query, row['Short_Summary']) for _, row in top_results.iterrows()]
    # Rerank using the cross-encoder
    gr.Info("Semantic reranking for the most relevant results...")
    rerank_scores = cross_encoder.predict(pairs)
    top_results = top_results.copy()  # Avoid SettingWithCopyWarning
    top_results['rerank_score'] = rerank_scores
    # Sort by rerank score
    top_results = top_results.sort_values(by='rerank_score', ascending=False)
    print(top_results)
    # Prepare results
    results = []
    gr.Info("Downloading slide thumbnails and full PPTs...")
    print("Downloading slide thumbnails and full PPTs...")
    for _, row in top_results.head(num_results).iterrows():
        # Download the slide thumbnail locally
        slide_image_path = download_file_from_onedrive(
            row['Thumbnail_File_Path'], row['Thumbnail_File_ID'], headers
        )
        # Download the full PPT locally
        ppt_download_link = download_file_from_onedrive(
            row['Full_PPT_File_Path'], row['Full_PPT_File_ID'], headers
        )
        title = row['Suitable_Title']
        owner = row['PPT_Owner']
        category = row['Slide_Category']
        summary = row['Short_Summary']
        results.append({
            "image": slide_image_path,
            "title": title,
            "owner": owner,
            "category": category,
            "summary": summary,
            "download_link": ppt_download_link
        })
    print("Downloading complete.")
    # Update visibility of the fixed grid of 20 result rows in the UI
    row_updates = []
    for i in range(20):
        if i < len(results):
            result = results[i]
            row_updates.extend([
                gr.update(visible=True),  # ✅ Make the row visible
                gr.update(value=result["image"], visible=True),
                gr.update(value=f"<b>Title:</b> {result['title']}<br><b>Owner:</b> {result['owner']}<br><b>Category:</b> {result['category']}", visible=True),
                gr.update(value=result["summary"], visible=True),
                gr.update(value=result["download_link"], visible=True),
            ])
        else:
            row_updates.extend([gr.update(visible=False)] * 5)  # row + 4 components
    return row_updates
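
# UI wiring sketch: search_ppts returns 5 gr.update() values per result row
# (row container, image, info HTML, summary, download link) for a fixed grid of
# 20 rows. Below is a minimal Blocks layout matching that ordering, assuming
# component choices (gr.Image / gr.HTML / gr.Textbox / gr.File), labels and the
# function name build_view_ppt_tab, none of which are confirmed by this file.
def build_view_ppt_tab():
    with gr.Blocks() as demo:
        query_box = gr.Textbox(label="Search query")
        num_results = gr.Slider(1, 20, value=5, step=1, label="Number of results")
        search_btn = gr.Button("Search")

        output_components = []
        for _ in range(20):
            with gr.Row(visible=False) as result_row:
                image = gr.Image(visible=False)
                info = gr.HTML(visible=False)
                summary = gr.Textbox(visible=False, label="Summary")
                download = gr.File(visible=False, label="Download PPT")
            # Order must match the per-row updates emitted by search_ppts:
            # row, image, info HTML, summary, download link.
            output_components.extend([result_row, image, info, summary, download])

        search_btn.click(search_ppts, inputs=[query_box, num_results],
                         outputs=output_components)
    return demo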