# NOTE: "Spaces: Sleeping" was Hugging Face Spaces web-page residue captured
# alongside this app's source; preserved here as a comment so the file parses.
import gradio as gr
import PyPDF2
import io
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI
import uuid
import re

# Load environment variables (OPENAI_API_KEY, PINECONE_API_KEY,
# PINECONE_ENVIRONMENT) from a local .env file.
load_dotenv()

# Initialize OpenAI client
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Pinecone configuration.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
INDEX_NAME = "ghana"
EMBEDDING_MODEL = "text-embedding-3-large"
EMBEDDING_DIMENSION = 3072  # output dimension of text-embedding-3-large

# Initialize Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index if it does not exist yet; otherwise verify its dimension.
if INDEX_NAME not in pc.list_indexes().names():
    if not PINECONE_ENVIRONMENT:
        raise ValueError(
            "PINECONE_ENVIRONMENT is not set; expected '<cloud>-<region>', "
            "e.g. 'gcp-starter' or 'aws-us-east-1'."
        )
    # Environment is assumed to be '<cloud>-<region>'. partition (rather than
    # split('-')[1]) keeps the full region for multi-part names such as
    # 'aws-us-east-1' -> cloud='aws', region='us-east-1'.
    cloud, _, region = PINECONE_ENVIRONMENT.partition('-')
    pc.create_index(
        name=INDEX_NAME,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud=cloud, region=region),
    )
else:
    # Fail fast if the existing index was built for a different embedding size —
    # upserting mismatched vectors would otherwise fail at query/upsert time.
    existing_index = pc.describe_index(INDEX_NAME)
    if existing_index.dimension != EMBEDDING_DIMENSION:
        raise ValueError(f"Existing index '{INDEX_NAME}' has dimension {existing_index.dimension}, expected {EMBEDDING_DIMENSION}. Please choose a different index name or adjust accordingly.")

# Connect to the Pinecone index
index = pc.Index(INDEX_NAME)
def transcribe_pdf(pdf_file):
    """Extract text from an uploaded PDF, chunk it, embed it, and upsert to Pinecone.

    Args:
        pdf_file: Raw PDF bytes (the Gradio File input uses type="binary").

    Returns:
        A human-readable status message describing what was stored.
    """
    # Extract text page by page; extract_text() can return None/'' for
    # image-only pages, so only non-empty pages are appended.
    pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
    text = ""
    for page in pdf_reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"

    # Split into overlapping word-based chunks sized for the embedding model.
    chunks = dynamic_chunking(text, max_tokens=500, overlap=50)
    if not chunks:
        # Scanned/image-only PDF: avoid calling the embeddings API with an
        # empty input list, which would raise an error.
        return "No extractable text found in the PDF; nothing was upserted."

    # Embed each chunk, then upsert (id, vector, metadata) triples to Pinecone.
    embeddings = get_embeddings(chunks)
    upsert_data = [
        (str(uuid.uuid4()), emb, {"text": chunk})
        for chunk, emb in zip(chunks, embeddings)
    ]
    index.upsert(vectors=upsert_data)

    return f"Successfully upserted {len(chunks)} chunks to Pinecone index '{INDEX_NAME}'."
def dynamic_chunking(text, max_tokens=500, overlap=50):
    """Split text into overlapping chunks of at most ``max_tokens`` words.

    "Tokens" here are whitespace-delimited words, not model tokens.

    Args:
        text: Input text to split.
        max_tokens: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        List of chunk strings; empty list for empty/whitespace-only text.

    Raises:
        ValueError: If overlap >= max_tokens — the window would never advance,
            which previously caused an infinite loop.
    """
    if overlap >= max_tokens:
        raise ValueError("overlap must be smaller than max_tokens")

    tokens = re.findall(r'\S+', text)
    step = max_tokens - overlap  # guaranteed positive by the guard above
    return [
        ' '.join(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), step)
    ]
def get_embeddings(chunks):
    """Embed each text chunk using OpenAI's embeddings API.

    Args:
        chunks: List of text strings to embed.

    Returns:
        A list of embedding vectors (one per chunk, in input order).
    """
    if not chunks:
        # The API rejects an empty input list; short-circuit instead.
        return []
    response = client.embeddings.create(
        input=chunks,
        model=EMBEDDING_MODEL,
    )
    # response.data items each carry an .embedding vector; assumed to be in
    # input order per the OpenAI embeddings API.
    return [item.embedding for item in response.data]
# Gradio UI: a single binary file-upload input wired to transcribe_pdf, with
# the status message shown in a textbox.
iface = gr.Interface(
    fn=transcribe_pdf,
    inputs=gr.File(label="Upload PDF", type="binary"),
    outputs=gr.Textbox(label="Transcription"),
    title="PDF Transcription and Upsert to Pinecone",
    description="Upload a PDF file to extract its text content, chunk it dynamically, and upsert the chunks to a Pinecone index named 'ghana'.",
)

# Launch the web app only when run as a script (not when imported).
if __name__ == "__main__":
    iface.launch()