# medicalchatbot / store_index.py
import os
import time
import random

from dotenv import load_dotenv
from langchain.vectorstores import Pinecone as LangchainPinecone  # used by the (disabled) ingestion step at the bottom
from pinecone import Pinecone, ServerlessSpec

from src.helper import load_pdf, text_split, download_hugging_face_embeddings

# Read PINECONE_API_KEY (and optional PINECONE_CLOUD / PINECONE_REGION) from .env
load_dotenv()
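# Load and chunk the source PDFs, then load the embedding model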
extracted_data = load_pdf("data/")
text_chunks = text_split(extracted_data)
embeddings = download_hugging_face_embeddings()
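# NOTE (assumption): download_hugging_face_embeddings() is expected to return a
# 384-dimensional model such as sentence-transformers/all-MiniLM-L6-v2,
# matching the index dimension created below.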
# Fall back to interactive authentication (e.g. in Colab) if no API key is set
if not os.environ.get("PINECONE_API_KEY"):
    from pinecone_notebooks.colab import Authenticate
    Authenticate()
api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=api_key)
# Serverless index placement (defaults: AWS, us-east-1)
cloud = os.environ.get("PINECONE_CLOUD") or "aws"
region = os.environ.get("PINECONE_REGION") or "us-east-1"
spec = ServerlessSpec(cloud=cloud, region=region)
index_name = "medical-chatbot"
# Create the index if it doesn't exist yet
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must match the embedding model's output dimension
        metric="cosine",
        spec=spec,
    )
    # Wait for the index to be ready before connecting
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)
time.sleep(1)
# Generate random 384-dimensional vectors as placeholder data for smoke-testing the index
def generate_random_vector(dim):
    return [random.uniform(-1, 1) for _ in range(dim)]
# Upsert three random test vectors into namespace "ns1"
upsert1 = index.upsert(
    vectors=[
        {"id": "vec1", "values": generate_random_vector(384)},
        {"id": "vec2", "values": generate_random_vector(384)},
        {"id": "vec3", "values": generate_random_vector(384)},
    ],
    namespace="ns1",
)
print(upsert1)
# Upsert the same IDs into a second namespace; namespaces are fully isolated
upsert2 = index.upsert(
    vectors=[
        {"id": "vec1", "values": generate_random_vector(384)},
        {"id": "vec2", "values": generate_random_vector(384)},
        {"id": "vec3", "values": generate_random_vector(384)},
    ],
    namespace="ns2",
)
print(upsert2)
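# The printed upsert responses report the number of vectors written,
# e.g. {'upserted_count': 3}.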
print(index.describe_index_stats())
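# The stats output breaks vector counts down per namespace, so both "ns1" and
# "ns2" should now report three vectors each.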
# Query namespace "ns1" with a random vector and return the 3 nearest matches
query_vector_ns1 = generate_random_vector(384)
query_results1 = index.query(
    namespace="ns1",
    vector=query_vector_ns1,
    top_k=3,
    include_values=True,
)
print(query_results1)
# Repeat the query against namespace "ns2"
query_vector_ns2 = generate_random_vector(384)
query_results2 = index.query(
    namespace="ns2",
    vector=query_vector_ns2,
    top_k=3,
    include_values=True,
)
print(query_results2)
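# Each query response contains a "matches" list with each match's id, its
# cosine similarity score, and (because include_values=True) the stored values.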
# The actual ingestion step (currently disabled): embed every PDF chunk and
# upsert it into the index via the LangChain wrapper imported above.
# docsearch = LangchainPinecone.from_texts([t.page_content for t in text_chunks], embeddings, index_name=index_name)
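# A minimal sketch of the same ingestion with the newer langchain-pinecone
# package (an assumption: it must be installed separately), shown commented out
# to keep this script's behavior unchanged:
# from langchain_pinecone import PineconeVectorStore
# docsearch = PineconeVectorStore.from_texts(
#     [t.page_content for t in text_chunks],
#     embedding=embeddings,
#     index_name=index_name,
# )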