DeyPoulomi's picture
Update utils.py
a890f0d verified
raw
history blame
3.48 kB
from langchain.vectorstores import Pinecone
#from langchain.llms import OpenAI
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
import pinecone
import os
from pypdf import PdfReader
#from langchain.llms.openai import OpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.llms import HuggingFaceHub
from langchain.llms import CTransformers
import time
#embedding_model_name = os.environ.get('sentence-transformers/all-MiniLM-L6-v2')
#Extract Information from PDF file
def get_pdf_text(pdf_doc):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_doc: The PDF source, handed unchanged to ``PdfReader``.

    Returns:
        The result of ``extract_text()`` for each page, concatenated in
        page order with no separator between pages.
    """
    reader = PdfReader(pdf_doc)
    return "".join(page.extract_text() for page in reader.pages)
# Iterate over the PDF files that the user uploaded, one by one
def create_docs(user_pdf_list, unique_id):
    """Build one ``Document`` per uploaded PDF, carrying its text and metadata.

    Args:
        user_pdf_list: Iterable of uploaded-file objects. Each one is passed
            to ``get_pdf_text`` and must expose ``name``, ``file_id``,
            ``type`` and ``size`` attributes.
            NOTE(review): presumably Streamlit ``UploadedFile`` objects --
            confirm against the caller.
        unique_id: Value stored under the ``"unique_id"`` metadata key of
            every document produced by this call.

    Returns:
        A list of ``Document`` objects, one per input file, in input order.
        ``page_content`` holds the full extracted text of the file (the text
        is not split into smaller chunks here).
    """
    return [
        Document(
            page_content=get_pdf_text(uploaded),
            metadata={
                "name": uploaded.name,
                "id": uploaded.file_id,
                # Plain "type" key; the stray "=" in the key name was a typo.
                "type": uploaded.type,
                "size": uploaded.size,
                "unique_id": unique_id,
            },
        )
        for uploaded in user_pdf_list
    ]
#Create embeddings instance
def create_embeddings_load_data():
    """Create the embeddings object used for indexing and querying.

    Returns:
        A ``SentenceTransformerEmbeddings`` instance configured with the
        ``all-MiniLM-L6-v2`` model.
    """
    return SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
#Function to push data to Vector Store - Pinecone here
def push_to_pinecone(PINECONE_PROJECT_ID, PINECONE_REGION, pinecone_index_name, embeddings, docs):
    """Embed ``docs`` and store them in a Pinecone index.

    Args:
        PINECONE_PROJECT_ID: Passed to ``pinecone.init`` as ``api_key``
            (despite the parameter's name).
        PINECONE_REGION: Passed to ``pinecone.init`` as ``environment``.
        pinecone_index_name: Name of the index the documents are written to.
        embeddings: Embeddings object used to vectorise the documents.
        docs: The documents to upload.

    Returns:
        None. The vector store created by ``from_documents`` is discarded.
    """
    connection = {
        "api_key": PINECONE_PROJECT_ID,
        "environment": PINECONE_REGION,
    }
    pinecone.init(**connection)
    Pinecone.from_documents(docs, embeddings, index_name=pinecone_index_name)
#Function to pull information from Vector Store - Pinecone here
def pull_from_pinecone(PINECONE_PROJECT_ID, PINECONE_REGION, pinecone_index_name, embeddings, delay_seconds=20):
    """Connect to an existing Pinecone index and return it as a vector store.

    Args:
        PINECONE_PROJECT_ID: Passed to ``pinecone.init`` as ``api_key``
            (despite the parameter's name).
        PINECONE_REGION: Passed to ``pinecone.init`` as ``environment``.
        pinecone_index_name: Name of the existing index to open.
        embeddings: Embeddings object the returned store uses for queries.
        delay_seconds: How long to wait before connecting. Defaults to 20,
            the delay that used to be hard-coded; pass 0 to skip the wait.

    Returns:
        The object returned by ``Pinecone.from_existing_index``.
    """
    # On some free-tier Pinecone regions freshly written data can take up to
    # ~10 seconds to become available for filtering, so wait before querying.
    # https://docs.pinecone.io/docs/starter-environment
    if delay_seconds > 0:
        print(f"{delay_seconds}secs delay...")
        time.sleep(delay_seconds)

    pinecone.init(
        api_key=PINECONE_PROJECT_ID,
        environment=PINECONE_REGION,
    )
    return Pinecone.from_existing_index(pinecone_index_name, embeddings)
#Function to help us get relevant documents from vector store - based on user input
def similar_docs(query, k, PINECONE_PROJECT_ID, PINECONE_REGION, pinecone_index_name, embeddings, unique_id):
    """Return the stored documents most similar to ``query`` for one upload batch.

    Args:
        query: The text to search for.
        k: Number of results to request; converted with ``int()`` before use.
        PINECONE_PROJECT_ID: Forwarded to ``pull_from_pinecone`` (used there
            as the Pinecone API key).
        PINECONE_REGION: Forwarded to ``pull_from_pinecone`` (used there as
            the Pinecone environment).
        pinecone_index_name: Name of the index to search.
        embeddings: Embeddings object used to vectorise the query.
        unique_id: Only documents whose ``"unique_id"`` metadata equals this
            value are considered.

    Returns:
        Whatever ``similarity_search_with_score`` returns for the query.
    """
    # pull_from_pinecone() already calls pinecone.init() with these same
    # credentials, so no separate initialisation is needed here.
    index = pull_from_pinecone(
        PINECONE_PROJECT_ID, PINECONE_REGION, pinecone_index_name, embeddings
    )
    return index.similarity_search_with_score(
        query,
        k=int(k),
        filter={"unique_id": unique_id},
    )
# Helps us get the summary of a document
def get_summary(current_doc):
    """Summarise a single document with a local Llama-2 chat model.

    Args:
        current_doc: The document to summarise; it is wrapped in a
            one-element list and handed to the summarisation chain.

    Returns:
        The value returned by the ``map_reduce`` summarize chain's ``run``.
    """
    # NOTE(review): max_new_tokens (4000) is larger than context_length
    # (2048) -- confirm this is intended.
    generation_config = {
        'max_new_tokens': 4000,
        'context_length': 2048,
        'temperature': 0.01,
    }
    # A new model object is constructed on every call.
    model = CTransformers(
        model="TheBloke/Llama-2-7B-Chat-GGML",
        model_type='llama',
        config=generation_config,
    )
    summarizer = load_summarize_chain(model, chain_type="map_reduce")
    return summarizer.run([current_doc])