# store_index.py: load a PDF, split it into chunks, embed them, and upsert the vectors into a Pinecone index.
import os
import time

from dotenv import load_dotenv
from langchain.text_splitter import CharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec
from PyPDF2 import PdfReader

from src.helper import download_hugging_face_embeddings
# Extract the raw text from a single PDF file.
def load_pdf(file_path):
    all_text = ""
    with open(file_path, 'rb') as file:
        reader = PdfReader(file)
        for page in reader.pages:
            page_text = page.extract_text()  # extract_text() may return None for image-only pages
            if page_text:
                all_text += page_text + "\n"
    return all_text if all_text else None
# Split the extracted text into fixed-size chunks.
def text_split(text):
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    return text_splitter.split_text(text)
# Load environment variables if not already set
load_dotenv()
# Load and process data
pdf_file_path = "data/Okelloetal.2008TourismanalysisManka.pdf" # Update this path to your single PDF file
extracted_data = load_pdf(pdf_file_path)
if extracted_data is None:
    raise ValueError("No text could be extracted from the PDF. Please check the load_pdf function and the file path.")
print(f"Extracted {len(extracted_data)} characters of text.")
# Split the extracted text into chunks
text_chunks = text_split(extracted_data)
if not text_chunks:
    raise ValueError("text_split returned no chunks. Please check the text_split function.")
print(f"Split the text into {len(text_chunks)} chunks.")
embeddings = download_hugging_face_embeddings()
if embeddings is None:
    raise ValueError("download_hugging_face_embeddings returned None. Please check the helper function.")
print(f"Loaded embedding model: {embeddings}")
# Ensure Pinecone API key is available
api_key = os.environ.get("PINECONE_API_KEY")
if not api_key:
    raise ValueError("PINECONE_API_KEY environment variable not set.")
# Initialize Pinecone client
pc = Pinecone(api_key=api_key)
# Specify cloud and region for the serverless index
cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'
spec = ServerlessSpec(cloud=cloud, region=region)
# Define the index name
index_name = "healthbot"
# Create the index if it does not exist
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must match the embedding model's output dimension
        metric="cosine",
        spec=spec
    )

# Wait for the index to be ready
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)
# Connect to the created index
index = pc.Index(index_name)
time.sleep(1)
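# Optional convenience check before upserting: describe_index_stats() reports the
# vector counts per namespace, which should be zero for a freshly created index.
print(index.describe_index_stats())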
# Example: add data to the index with reduced metadata.
# A dictionary simulates external storage of the text chunks, so only a small
# reference ID needs to be stored as Pinecone metadata.
text_chunk_store = {}

# Store a text chunk and return a reference ID for it.
def store_text_chunk(text_chunk):
    chunk_id = f"chunk_{len(text_chunk_store)}"
    text_chunk_store[chunk_id] = text_chunk
    return chunk_id
# Add text chunks to Pinecone with reference IDs
for i, text_chunk in enumerate(text_chunks):
    chunk_id = store_text_chunk(text_chunk)
    embedding = embeddings.embed_query(text_chunk)  # Embed the text chunk
    index.upsert(
        vectors=[
            {
                "id": f"vec_{i}",
                "values": embedding,
                "metadata": {"chunk_id": chunk_id}  # Only store the reference ID as metadata
            }
        ],
        namespace="ns1"
    )
print("Indexing completed successfully.")