# Re-build 250325
"""Gradio app: embed uploaded PDFs with the NASA bi-encoder and store them in Pinecone.

Each PDF is read with PyPDF2, split into fixed-size character chunks, encoded
with the ``nasa-impact/nasa-smd-ibm-st-v2`` model, and upserted into the
``scdd-index`` Pinecone index together with the chunk text as metadata.
"""
import hashlib
import os
import time

import gradio as gr
import numpy as np
import torch
from pinecone import AwsRegion, CloudProvider, Pinecone, ServerlessSpec, VectorType
from PyPDF2 import PdfReader
from transformers import AutoModel, AutoTokenizer

# Number of characters per text chunk.
CHUNK_SIZE = 500
# Number of vectors sent per upsert request; keeps each request small so large
# PDFs do not run into Pinecone's per-request size/record limits.
UPSERT_BATCH_SIZE = 100

# Load NASA-specific bi-encoder model
tokenizer = AutoTokenizer.from_pretrained("nasa-impact/nasa-smd-ibm-st-v2")
model = AutoModel.from_pretrained("nasa-impact/nasa-smd-ibm-st-v2")

# Initialize Pinecone client
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pc = Pinecone(api_key=pinecone_api_key)

# Create Pinecone index if it doesn't exist
index_name = "scdd-index"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,
        spec=ServerlessSpec(
            cloud=CloudProvider.AWS,
            region=AwsRegion.US_EAST_1
        ),
        vector_type=VectorType.DENSE,
        metric="cosine"
    )

# Connect to the Pinecone index
index = pc.Index(index_name)


# Function to encode text using bi-encoder in batches
def encode_chunks_batch(chunks, batch_size=8, max_length=128):
    """Encode text chunks into L2-normalised embeddings.

    Args:
        chunks: List of strings to embed.
        batch_size: Number of chunks passed through the model at once.
        max_length: Token limit per chunk; longer chunks are truncated.

    Returns:
        A list with one 1-D numpy array per chunk, in input order. Empty when
        ``chunks`` is empty.
    """
    embeddings = []
    for start in range(0, len(chunks), batch_size):
        batch_chunks = chunks[start:start + batch_size]
        inputs = tokenizer(
            batch_chunks,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        with torch.no_grad():
            output = model(**inputs)

        # Mean-pool over real tokens only. A plain mean over dim=1 also averages
        # the padding positions, which makes a chunk's embedding depend on the
        # longest chunk that happens to share its batch.
        hidden = output.last_hidden_state
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        batch_embeddings = (hidden * mask).sum(dim=1) / token_counts

        # Clamp the norm so an all-zero vector cannot cause a division by zero.
        norms = batch_embeddings.norm(dim=1, keepdim=True).clamp(min=1e-12)
        batch_embeddings = batch_embeddings / norms
        embeddings.extend(batch_embeddings.cpu().numpy())
    return embeddings


def _file_path(pdf_file):
    """Return the filesystem path for an uploaded file object or plain path."""
    return getattr(pdf_file, "name", pdf_file)


# Function to generate a unique chunk ID based on file content
def generate_chunk_id(pdf_file, chunk_text, chunk_idx):
    """Build a deterministic record ID for one chunk of one PDF.

    The ID combines the file's base name, the MD5 hex digest of the chunk text
    and the chunk's position, so re-uploading the same file produces the same
    IDs and overwrites the earlier records instead of duplicating them.

    Args:
        pdf_file: Uploaded file object exposing ``.name``, or a path string.
        chunk_text: Text of the chunk.
        chunk_idx: Zero-based position of the chunk within the file.

    Returns:
        A string of the form ``<basename>-<md5>-chunk-<idx>``.
    """
    # MD5 is used only as a content fingerprint here, not for security.
    chunk_hash = hashlib.md5(chunk_text.encode('utf-8')).hexdigest()
    return f"{os.path.basename(_file_path(pdf_file))}-{chunk_hash}-chunk-{chunk_idx}"


def _extract_pdf_text(path):
    """Return the concatenated text of every page that yields any text."""
    reader = PdfReader(path)
    # Extract each page once; extraction is the expensive part of reading a PDF.
    page_texts = (page.extract_text() for page in reader.pages)
    return "".join(text for text in page_texts if text)


def _upsert_in_batches(vectors, batch_size=UPSERT_BATCH_SIZE):
    """Upsert ``vectors`` into the index in slices of ``batch_size`` records."""
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors[start:start + batch_size])


# Function to process PDFs and upsert embeddings to Pinecone
def process_pdfs(pdf_files):
    """Embed the uploaded PDFs and store the embeddings in Pinecone.

    This is a generator so the Gradio UI can show progress: it yields a status
    string while each file is being processed and a final summary with the
    elapsed time and the index statistics.

    Args:
        pdf_files: List of uploaded files (objects exposing ``.name`` or path
            strings), or ``None``/empty when nothing was uploaded.

    Yields:
        Human-readable status messages.
    """
    if not pdf_files:
        yield "No PDF files were uploaded."
        return

    start_time = time.time()
    for pdf_file in pdf_files:
        path = _file_path(pdf_file)
        pdf_text = _extract_pdf_text(path)

        # Split text into smaller chunks
        chunks = [pdf_text[i:i + CHUNK_SIZE] for i in range(0, len(pdf_text), CHUNK_SIZE)]
        if not chunks:
            # Image-only or empty PDFs produce no text; there is nothing to embed
            # and an empty upsert request would be rejected.
            yield f"No extractable text found in {os.path.basename(path)}; skipping."
            continue

        yield "Processing file, generating Embeddings and pushing to Pinecone...Please wait..."

        # Generate embeddings in batches
        embeddings = encode_chunks_batch(chunks, batch_size=8)

        # Prepare data for Pinecone with unique IDs
        vectors = [
            (generate_chunk_id(pdf_file, chunk, idx), embedding.tolist(), {"text": chunk})
            for idx, (embedding, chunk) in enumerate(zip(embeddings, chunks))
        ]

        # Upsert embeddings into Pinecone
        _upsert_in_batches(vectors)

    # Fetch index stats
    stats = index.describe_index_stats()
    elapsed_time = time.time() - start_time
    yield f"Processed PDF and embeddings stored in Pinecone successfully in {elapsed_time:.2f} seconds. Current Index Stats: {stats}"


# Gradio Interface
demo = gr.Interface(
    fn=process_pdfs,
    inputs=gr.Files(label="Upload PDF", file_types=[".pdf"]),
    outputs="text",
    title="NASA Bi-encoder PDF Embedding & Pinecone Storage",
    description="Upload PDF files to generate embeddings with NASA Bi-encoder and store in Pinecone."
)

if __name__ == "__main__":
    demo.launch()