ishaan-mital's picture
requirement.txt
c8e6347
import gradio as gr
import os
import pinecone
import time
# from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
# import PyPDF2
# import re
from langchain.vectorstores import Pinecone
from sentence_transformers import SentenceTransformer
# Hugging Face model id of the sentence-transformers checkpoint used for
# embedding.
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

# Explicit device selection is currently disabled; to re-enable it, pass
#   model_kwargs={'device': device}
#   encode_kwargs={'device': device, 'batch_size': 32}
# with device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
embed_model = HuggingFaceEmbeddings(model_name=embed_model_id)
# Pinecone credentials come from the process environment: the API key is
# issued at app.pinecone.io and the environment name is shown in the console.
# A missing variable is passed through as None, exactly as before.
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pinecone_environment = os.getenv('PINECONE_ENVIRONMENT')
pinecone.init(api_key=pinecone_api_key, environment=pinecone_environment)
# Placeholder documents, embedded only so that the vector length produced by
# the embedding model can be read off the result when creating the index.
docs = [
    "this is one document",
    "and another document",
]
embeddings = embed_model.embed_documents(docs)

index_name = 'llama-rag'

# Upper bound, in seconds, on how long start-up waits for the index to report
# ready. Without a bound, an index that never becomes ready would hang the
# application forever instead of failing with a clear error.
INDEX_READY_TIMEOUT_SECONDS = 300

# Create the index on first run, sized to match the embedding vectors.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        dimension=len(embeddings[0]),
        metric='cosine',
    )

# Wait for the index to finish initialization. The poll also runs when the
# index already existed, in case it is still initializing.
_index_ready_deadline = time.monotonic() + INDEX_READY_TIMEOUT_SECONDS
while not pinecone.describe_index(index_name).status['ready']:
    if time.monotonic() >= _index_ready_deadline:
        raise TimeoutError(
            f"Pinecone index {index_name!r} was not ready after "
            f"{INDEX_READY_TIMEOUT_SECONDS} seconds"
        )
    time.sleep(1)

index = pinecone.Index(index_name)
# The returned stats are discarded here.
# NOTE(review): presumably left over from a notebook cell that displayed
# them; confirm whether this call is still wanted.
index.describe_index_stats()
# def extract_text_from_pdf(pdf_path):
# pdf_file = open(pdf_path, 'rb')
# pdf_reader = PyPDF2.PdfReader(pdf_file)
# text = ""
# for page_number in range(len(pdf_reader.pages)):
# page = pdf_reader.pages[page_number]
# text += page.extract_text()
# pdf_file.close()
# return text
# def identify_sections(text):
# # Assuming sections start with "Chapter" headings
# sections = re.split(r'\n1+', text)
# sections = [section.strip() for section in sections if section.strip()]
# return sections
# pdf_files = ['leph101.pdf', 'leph102.pdf','leph103.pdf','leph104.pdf','leph105.pdf','leph106.pdf','leph107.pdf','leph108.pdf'] # Add more file names as needed
# book_sections=[]
# for pdf_file in pdf_files:
# pdf_path = f'/content/{pdf_file}'
# book_text = extract_text_from_pdf(pdf_path)
# book_sections.append(identify_sections(book_text))
# print(len(book_sections))
# # Now you can organize and store the data as needed
# import pandas as pd
# data = pd.DataFrame({
# 'ID': range(len(book_sections)), # Sequential IDs
# 'Text': book_sections
# })
# print(data)
# batch_size = 4
# for i in range(0, len(data), batch_size):
# i_end = min(len(data), i+batch_size)
# batch = data.iloc[i:i_end]
# ids = [f"{x['ID']}" for i, x in batch.iterrows()]
# texts = [x['Text'] for i, x in batch.iterrows()]
# embeds = embed_model.embed_documents(texts)
# # get metadata to store in Pinecone
# metadata = [
# {'text': x['Text'],
# 'ID': x['ID']} for i, x in batch.iterrows()
# ]
# # add to Pinecone
# index.upsert(vectors=zip(ids, embeds,metadata))
# Metadata field of each stored vector that contains the text content.
text_field = 'text'

# Vector store wrapper over the Pinecone index; queries are embedded with
# the same model via embed_model.embed_query.
vectorstore = Pinecone(index, embed_model.embed_query, text_field)
def question(query):
    """Return the stored chunks most similar to *query*.

    Runs a similarity search on the module-level ``vectorstore`` and returns
    its result unchanged.

    Args:
        query: The search query text.

    Returns:
        Whatever ``vectorstore.similarity_search`` returns for the top 3
        most relevant chunks of text.
    """
    top_k = 3  # number of most relevant chunks to retrieve
    matches = vectorstore.similarity_search(query, k=top_k)
    return matches
# Gradio UI: a single text input routed through `question`, with the result
# shown in a text output.
demo = gr.Interface(
    fn=question,
    inputs="text",
    outputs="text",
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()