# ghana-helper / app.py
# PDF -> text -> chunks -> OpenAI embeddings -> Pinecone upsert (Gradio UI).
import gradio as gr
import PyPDF2
import io
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI
import uuid
import re
# Load environment variables from .env file
load_dotenv()
# Initialize OpenAI client (reads OPENAI_API_KEY from the environment).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Initialize Pinecone
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
INDEX_NAME = "ghana"
EMBEDDING_MODEL = "text-embedding-3-large"
# Output size of text-embedding-3-large; must match the Pinecone index dimension.
EMBEDDING_DIMENSION = 3072

# Initialize Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

# Check if the index exists
if INDEX_NAME not in pc.list_indexes().names():
    # Create the index with updated dimensions
    pc.create_index(
        name=INDEX_NAME,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(
            # NOTE(review): assumes PINECONE_ENVIRONMENT is "<cloud>-<region>"
            # (e.g. "gcp-starter"). A region such as "us-east-1" itself contains
            # dashes, so split('-')[1] would truncate it — confirm the expected
            # format of this environment variable before relying on it.
            cloud=PINECONE_ENVIRONMENT.split('-')[0],  # Assuming environment is in format 'gcp-starter'
            region=PINECONE_ENVIRONMENT.split('-')[1]
        )
    )
else:
    # Optionally, verify the existing index's dimension matches
    existing_index = pc.describe_index(INDEX_NAME)
    if existing_index.dimension != EMBEDDING_DIMENSION:
        raise ValueError(f"Existing index '{INDEX_NAME}' has dimension {existing_index.dimension}, expected {EMBEDDING_DIMENSION}. Please choose a different index name or adjust accordingly.")

# Connect to the Pinecone index
index = pc.Index(INDEX_NAME)
def transcribe_pdf(pdf_file):
    """Extract text from an uploaded PDF, chunk it, embed each chunk, and
    upsert the vectors into the Pinecone index.

    Args:
        pdf_file: Raw PDF bytes (Gradio ``type="binary"`` file input).

    Returns:
        A human-readable status string describing what was upserted.
    """
    # Read PDF and extract text page by page; pages with no extractable text
    # (e.g. scanned images) contribute nothing.
    pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
    page_texts = []
    for page in pdf_reader.pages:
        page_text = page.extract_text()
        if page_text:
            page_texts.append(page_text)
    text = "\n".join(page_texts)

    # Guard: an image-only PDF yields no text. Without this, the embeddings
    # API would be called with an empty input list and raise an error.
    if not text.strip():
        return "No extractable text found in the PDF; nothing was upserted."

    # Dynamic chunking with overlap so context spans chunk boundaries.
    chunks = dynamic_chunking(text, max_tokens=500, overlap=50)

    # One embedding per chunk, in chunk order.
    embeddings = get_embeddings(chunks)

    # Each vector gets a random UUID id and carries its source text as metadata.
    upsert_data = [
        (str(uuid.uuid4()), emb, {"text": chunk})
        for chunk, emb in zip(chunks, embeddings)
    ]

    # Upsert to Pinecone
    index.upsert(vectors=upsert_data)
    return f"Successfully upserted {len(chunks)} chunks to Pinecone index '{INDEX_NAME}'."
def dynamic_chunking(text, max_tokens=500, overlap=50):
    """Split *text* into chunks of at most *max_tokens* whitespace tokens,
    with consecutive chunks sharing *overlap* tokens of context.

    Args:
        text: The input text to split.
        max_tokens: Maximum number of whitespace-delimited tokens per chunk.
        overlap: Tokens repeated at the start of each subsequent chunk.
            Must be smaller than ``max_tokens``.

    Returns:
        A list of chunk strings; empty for empty/whitespace-only input.

    Raises:
        ValueError: If ``max_tokens <= 0`` or ``overlap >= max_tokens``
            (the window could not advance — previously an infinite loop).
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be positive")
    if overlap >= max_tokens:
        raise ValueError("overlap must be smaller than max_tokens")

    # Simple tokenization based on whitespace
    tokens = re.findall(r'\S+', text)
    chunks = []
    start = 0
    while start < len(tokens):
        end = start + max_tokens
        chunks.append(' '.join(tokens[start:end]))
        # Stop once the window reaches the end of the text; otherwise a
        # redundant trailing chunk (consisting only of the previous chunk's
        # overlap) would be emitted and upserted as duplicate data.
        if end >= len(tokens):
            break
        start += max_tokens - overlap
    return chunks
def get_embeddings(chunks):
    """Generate embeddings for each chunk using OpenAI's embedding API.

    Args:
        chunks: List of text strings to embed.

    Returns:
        A list of embedding vectors (one list of floats per chunk), in the
        same order as the input.
    """
    # Guard: the embeddings endpoint rejects an empty input list, so short-
    # circuit instead of making a doomed API call.
    if not chunks:
        return []
    response = client.embeddings.create(
        input=chunks,
        model=EMBEDDING_MODEL
    )
    # response.data preserves input order, so this stays aligned with chunks.
    return [item.embedding for item in response.data]
# Gradio UI: one file-upload input, one text status output.
iface = gr.Interface(
    fn=transcribe_pdf,
    inputs=gr.File(label="Upload PDF", type="binary"),  # passes raw bytes to fn
    outputs=gr.Textbox(label="Transcription"),
    title="PDF Transcription and Upsert to Pinecone",
    description="Upload a PDF file to extract its text content, chunk it dynamically, and upsert the chunks to a Pinecone index named 'ghana'."
)

# Launch the web app only when run as a script (not on import).
if __name__ == "__main__":
    iface.launch()