"""Helpers for extracting PDF text, embedding it, storing and searching it
in Pinecone, and summarizing documents with a local Llama 2 model."""

import os
import time

import pinecone
from langchain.chains.summarize import load_summarize_chain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.llms import CTransformers
from langchain.llms import HuggingFaceHub
# from langchain.llms import OpenAI
# from langchain.llms.openai import OpenAI
from langchain.schema import Document
from langchain.vectorstores import Pinecone
from pypdf import PdfReader

# embedding_model_name = os.environ.get('sentence-transformers/all-MiniLM-L6-v2')


def get_pdf_text(pdf_doc):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_doc: Anything ``PdfReader`` accepts (a path or a binary file-like
            object).

    Returns:
        A single string with the extracted text of all pages.
    """
    pdf_reader = PdfReader(pdf_doc)
    # join() avoids repeated string re-allocation; ``or ""`` keeps a page
    # with no extractable text from breaking the concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)


def create_docs(user_pdf_list, unique_id):
    """Build one ``Document`` per uploaded PDF file.

    Args:
        user_pdf_list: Iterable of uploaded file objects exposing ``name``,
            ``file_id``, ``type`` and ``size`` attributes (presumably
            Streamlit ``UploadedFile`` objects; verify against the caller).
        unique_id: Identifier stored in every document's metadata; it is the
            value ``similar_docs`` later filters on.

    Returns:
        A list of ``Document`` objects holding each file's full text and
        metadata.
    """
    docs = []
    for uploaded_file in user_pdf_list:
        text = get_pdf_text(uploaded_file)

        # Store the data together with its metadata.
        # NOTE(review): the "type=" key looks like a typo for "type"; it is
        # kept as-is because already-indexed vectors and downstream readers
        # may depend on it.
        docs.append(Document(
            page_content=text,
            metadata={
                "name": uploaded_file.name,
                "id": uploaded_file.file_id,
                "type=": uploaded_file.type,
                "size": uploaded_file.size,
                "unique_id": unique_id,
            },
        ))

    return docs


def create_embeddings_load_data():
    """Create the sentence-transformer embeddings instance.

    Returns:
        A ``SentenceTransformerEmbeddings`` using the "all-MiniLM-L6-v2" model.
    """
    # embeddings = OpenAIEmbeddings()
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    return embeddings


def push_to_pinecone(PINECONE_PROJECT_ID, PINECONE_REGION, pinecone_index_name, embeddings, docs):
    """Embed ``docs`` and upload them to a Pinecone index.

    Args:
        PINECONE_PROJECT_ID: Passed to ``pinecone.init`` as ``api_key``
            (despite the parameter name).
        PINECONE_REGION: Passed to ``pinecone.init`` as ``environment``.
        pinecone_index_name: Name of the target index.
        embeddings: Embeddings instance used to vectorize the documents.
        docs: List of ``Document`` objects to store.
    """
    pinecone.init(
        api_key=PINECONE_PROJECT_ID,
        environment=PINECONE_REGION,
    )
    Pinecone.from_documents(docs, embeddings, index_name=pinecone_index_name)


def pull_from_pinecone(PINECONE_PROJECT_ID, PINECONE_REGION, pinecone_index_name, embeddings,
                       delay_seconds=20):
    """Return a vector store wrapping an existing Pinecone index.

    Args:
        PINECONE_PROJECT_ID: Passed to ``pinecone.init`` as ``api_key``
            (despite the parameter name).
        PINECONE_REGION: Passed to ``pinecone.init`` as ``environment``.
        pinecone_index_name: Name of the existing index.
        embeddings: Embeddings instance used to embed queries.
        delay_seconds: Seconds to wait before connecting. Defaults to 20;
            pass 0 to skip the wait.

    Returns:
        The ``Pinecone`` vector store created by ``from_existing_index``.
    """
    # In some free-tier Pinecone regions freshly pushed data takes up to
    # ~10 seconds to become available for filtering, hence the default wait.
    # If it works for you without the delay, pass delay_seconds=0.
    # https://docs.pinecone.io/docs/starter-environment
    if delay_seconds > 0:
        print(f"{delay_seconds}secs delay...")
        time.sleep(delay_seconds)

    pinecone.init(
        api_key=PINECONE_PROJECT_ID,
        environment=PINECONE_REGION,
    )

    return Pinecone.from_existing_index(pinecone_index_name, embeddings)


def similar_docs(query, k, PINECONE_PROJECT_ID, PINECONE_REGION, pinecone_index_name, embeddings, unique_id):
    """Fetch the documents most similar to ``query`` for one upload batch.

    Args:
        query: The user's search text.
        k: Number of results to return; converted with ``int()``.
        PINECONE_PROJECT_ID: Passed to ``pinecone.init`` as ``api_key``
            (despite the parameter name).
        PINECONE_REGION: Passed to ``pinecone.init`` as ``environment``.
        pinecone_index_name: Name of the index to search.
        embeddings: Embeddings instance used to embed the query.
        unique_id: Only documents whose ``unique_id`` metadata equals this
            value are considered.

    Returns:
        Whatever ``similarity_search_with_score`` returns (per LangChain, a
        list of ``(Document, score)`` tuples).
    """
    # pull_from_pinecone() already calls pinecone.init() with these same
    # credentials, so no separate init is needed here.
    index = pull_from_pinecone(PINECONE_PROJECT_ID, PINECONE_REGION, pinecone_index_name, embeddings)
    matches = index.similarity_search_with_score(
        query,
        k=int(k),
        filter={"unique_id": unique_id},
    )
    return matches


def get_summary(current_doc):
    """Summarize a single document with a local Llama 2 chat model.

    Args:
        current_doc: The ``Document`` to summarize.

    Returns:
        The summary produced by a "map_reduce" summarize chain.
    """
    # llm = OpenAI(temperature=0)
    # NOTE(review): the model is constructed on every call, and
    # max_new_tokens (4000) exceeds context_length (2048); confirm both are
    # intended.
    llm = CTransformers(
        model="TheBloke/Llama-2-7B-Chat-GGML",
        model_type='llama',
        config={'max_new_tokens': 4000, 'context_length': 2048, 'temperature': 0.01},
    )
    chain = load_summarize_chain(llm, chain_type="map_reduce")
    summary = chain.run([current_doc])

    return summary