Spaces:

NCTCMumbai
/

NCTC_SlideFinder

Running

File size: 4,190 Bytes

9198e15

#TAB : VIEW PPT.py
import requests
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity
import os
import shutil
import gradio as gr
# Local cache directory for downloaded files.
# The path is relative, so it is resolved against the process's current
# working directory at the time this module is loaded.
LOCAL_CACHE_DIR = "local_cache"
# Create the directory as a module-load side effect; exist_ok=True makes this
# a no-op when the directory is already present (e.g. on restart).
os.makedirs(LOCAL_CACHE_DIR, exist_ok=True)

# Function to download a file from OneDrive to the local cache
def download_file_from_onedrive(file_path, file_id, headers, timeout=60):
    """Download a OneDrive item into the local cache and return its local path.

    The file is fetched from the Microsoft Graph ``/content`` endpoint of the
    item identified by ``file_id``. If a cached copy already exists, no
    request is made and the cached path is returned.

    Cached files are stored as ``LOCAL_CACHE_DIR/<sanitised file_id>/<basename>``.
    Keying the cache on the item ID (not only on the file name) prevents two
    different OneDrive items that share a file name from overwriting, or being
    mistaken for, each other, while the returned path still ends in the
    original file name.

    Args:
        file_path: Remote path of the item; only its base name is used.
        file_id: OneDrive item ID, used in the request URL and as cache key.
        headers: HTTP headers sent with the request (e.g. authorisation).
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the caller indefinitely.

    Returns:
        Path of the cached file on the local filesystem.

    Raises:
        ValueError: If ``file_path`` has no base name, or the server answers
            with a status code other than 200.
        requests.RequestException: On network errors or timeout.
    """
    file_name = os.path.basename(file_path)
    if not file_name:
        # An empty base name would resolve to the cache directory itself,
        # which always exists and would be returned as if it were the file.
        raise ValueError(f"Cannot derive a file name from path {file_path!r}.")

    # Restrict the ID to characters that are safe as a single path component.
    id_component = "".join(
        ch if ch.isalnum() or ch in "-_" else "_" for ch in str(file_id)
    )
    cache_dir = os.path.join(LOCAL_CACHE_DIR, id_component)
    local_file_path = os.path.join(cache_dir, file_name)

    if os.path.exists(local_file_path):  # Avoid re-downloading
        return local_file_path

    download_url = f"https://graph.microsoft.com/v1.0/me/drive/items/{file_id}/content"
    os.makedirs(cache_dir, exist_ok=True)

    # Write to a temporary name first and rename on success, so an interrupted
    # download never leaves a truncated file that later looks like a cache hit.
    partial_path = local_file_path + ".part"
    try:
        with requests.get(
            download_url, headers=headers, stream=True, timeout=timeout
        ) as response:
            if response.status_code != 200:
                raise ValueError(
                    f"Failed to download file {file_path}. Error: {response.text}"
                )

            # Stream in chunks instead of holding the whole file in memory.
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)

        os.replace(partial_path, local_file_path)
    finally:
        # After a successful rename the partial file no longer exists; this
        # only removes leftovers from a failed or interrupted download.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print(f"✅ Downloaded: {file_path} -> {local_file_path}")
    return local_file_path

# Number of top cosine-similarity candidates passed to the cross-encoder.
_RERANK_POOL_SIZE = 50
# Number of result rows the returned update list covers.
_MAX_RESULT_ROWS = 20
# Updates emitted per row: row container, image, info text, summary, download link.
_COMPONENTS_PER_ROW = 5


def _build_row_updates(results):
    """Build the flat list of Gradio updates for all result rows.

    Rows with a result are made visible and filled; remaining rows are hidden.
    The list always holds ``_MAX_RESULT_ROWS * _COMPONENTS_PER_ROW`` entries.
    """
    import html  # local import so the file's import block stays untouched

    def esc(value):
        # Values come from the DataFrame and are embedded in markup; escape
        # them so characters such as "<" or "&" cannot break the rendering.
        return html.escape(str(value), quote=False)

    updates = []
    for index in range(_MAX_RESULT_ROWS):
        if index >= len(results):
            updates.extend([gr.update(visible=False)] * _COMPONENTS_PER_ROW)  # row + 4 components
            continue

        result = results[index]
        updates.extend([
            gr.update(visible=True),  # ✅ Make the row visible
            gr.update(value=result["image"], visible=True),
            gr.update(
                value=(
                    f"<b>Title:</b> {esc(result['title'])}<br>"
                    f"<b>Owner:</b> {esc(result['owner'])}<br>"
                    f"<b>Category:</b> {esc(result['category'])}"
                ),
                visible=True,
            ),
            gr.update(value=result["summary"], visible=True),
            gr.update(value=result["download_link"], visible=True),
        ])
    return updates


# Function to search PPTs
def search_ppts(query, num_results):
    """Search the slide index for PPTs matching ``query``.

    Pipeline:
      1. Keep only rows of the module-level ``df`` whose ``Unique_Slide_ID``
         ends with ``"slide_1"``.
      2. Rank them by cosine similarity between the query embedding and the
         stored ``Short_Summary_Embedding``.
      3. Rerank the best ``_RERANK_POOL_SIZE`` rows with ``cross_encoder`` on
         ``(query, Short_Summary)`` pairs.
      4. Download the thumbnail and full PPT for the top results and build
         the Gradio updates.

    This function reads ``df``, ``embedding_model``, ``cross_encoder`` and
    ``headers`` from module scope; they are not defined in this block.

    Args:
        query: Search text.
        num_results: Requested number of results. Coerced to ``int`` and
            clamped to ``0 .. _MAX_RESULT_ROWS``, because only that many rows
            are covered by the returned updates.

    Returns:
        A flat list of ``gr.update`` objects, ``_COMPONENTS_PER_ROW`` per row
        for ``_MAX_RESULT_ROWS`` rows; rows without a result are hidden.

    Raises:
        ValueError: Propagated from ``download_file_from_onedrive`` when a
            download fails.
    """
    gr.Info("Searching the relevant PPTs .")

    # UI inputs may arrive as floats; a float or negative value would break
    # or change the meaning of DataFrame.head(), and anything above the row
    # count would trigger downloads that are never displayed.
    limit = max(0, min(int(num_results), _MAX_RESULT_ROWS))

    # Filter the DataFrame to include only rows where Unique_Slide_ID ends
    # with "slide_1". Copy so adding columns does not write into a slice of df.
    candidates = df[df['Unique_Slide_ID'].str.endswith("slide_1", na=False)].copy()

    if limit == 0 or candidates.empty:
        # Nothing to rank or show: hide every row instead of calling the
        # models and sklearn on empty input.
        return _build_row_updates([])

    # Generate query embedding
    query_embedding = embedding_model.encode(query).tolist()

    # NOTE(review): eval() executes arbitrary code stored in the column. If
    # the embeddings are plain list literals, ast.literal_eval or json.loads
    # would be a safe replacement — confirm the stored format before changing.
    stored_embeddings = [eval(raw) for raw in candidates['Short_Summary_Embedding']]

    # Compute all cosine similarity scores in one call rather than per row.
    candidates['similarity'] = cosine_similarity([query_embedding], stored_embeddings)[0]

    # Sort by cosine similarity score and keep the pool for reranking
    candidates = candidates.sort_values(by='similarity', ascending=False)
    top_results = candidates.head(min(_RERANK_POOL_SIZE, len(candidates))).copy()

    # Prepare input pairs for cross-encoder reranking
    pairs = [(query, summary) for summary in top_results['Short_Summary']]

    # Rerank using cross-encoder
    gr.Info("Doing Semantic Reranking for most appropriate results ")
    top_results['rerank_score'] = cross_encoder.predict(pairs)

    # Sort by rerank score
    top_results = top_results.sort_values(by='rerank_score', ascending=False)
    print(top_results)

    # Prepare results
    results = []
    gr.Info('Downloading PPT images and ppt')
    print('Downloading PPT images and ppt')
    for _, row in top_results.head(limit).iterrows():
        # Download slide image locally
        slide_image_path = download_file_from_onedrive(
            row['Thumbnail_File_Path'], row['Thumbnail_File_ID'], headers
        )

        # Download full PPT locally
        ppt_download_link = download_file_from_onedrive(
            row['Full_PPT_File_Path'], row['Full_PPT_File_ID'], headers
        )

        results.append({
            "image": slide_image_path,
            "title": row['Suitable_Title'],
            "owner": row['PPT_Owner'],
            "category": row['Slide_Category'],
            "summary": row['Short_Summary'],
            "download_link": ppt_download_link,
        })
    print("downloading complete ")

    # Update visibility of rows
    return _build_row_updates(results)