"""Helpers for turning uploaded PDFs into LangChain documents.

Covers text extraction, document creation, embedding setup, similarity
search over the documents, and per-document summarisation.
"""

import openai
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.schema import Document
import pinecone
from langchain.vectorstores import FAISS
from pypdf import PdfReader
from langchain.llms.openai import OpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain import HuggingFaceHub
from langchain.document_loaders import DirectoryLoader


def get_pdf_text(pdf_doc):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_doc: The PDF source; it is handed unchanged to ``PdfReader``.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    pdf_reader = PdfReader(pdf_doc)
    # Join once instead of growing a string with ``+=`` for every page.
    return "".join(page.extract_text() for page in pdf_reader.pages)


def create_docs(user_pdf_list, unique_id):
    """Build one ``Document`` per uploaded PDF, in upload order.

    Args:
        user_pdf_list: Iterable of uploaded PDF objects. Each one is passed
            to ``get_pdf_text`` and must expose ``name``, ``id``, ``type``
            and ``size`` attributes.
            NOTE(review): presumably Streamlit ``UploadedFile`` objects -
            confirm against the caller.
        unique_id: Identifier stored in every document's metadata under
            ``"unique_id"``.

    Returns:
        A list of ``Document`` objects whose ``page_content`` is the full
        extracted text of the corresponding PDF.
    """
    docs = []
    for pdf_file in user_pdf_list:
        # The whole PDF becomes a single document; no chunking happens here.
        full_text = get_pdf_text(pdf_file)

        # Store the text together with its metadata.
        docs.append(
            Document(
                page_content=full_text,
                metadata={
                    "name": pdf_file.name,
                    "id": pdf_file.id,
                    "type": pdf_file.type,
                    # Legacy misspelled key, kept so that existing readers of
                    # this metadata keep working.
                    "type=": pdf_file.type,
                    "size": pdf_file.size,
                    "unique_id": unique_id,
                },
            )
        )

    # Load files from a directory (local version):
    # loader = DirectoryLoader('./Repository', glob='**/*')
    # docs1 = loader.load()
    # final_docs = docs + docs1

    return docs


def create_embeddings_load_data():
    """Create and return the embeddings instance used for similarity search.

    Returns:
        A new ``OpenAIEmbeddings`` instance built with default arguments.
    """
    embeddings = OpenAIEmbeddings()
    # Alternative embedding model:
    # embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    return embeddings


def close_matches(query, k, docs, embeddings):
    """Return the documents most similar to ``query``, with their scores.

    Args:
        query: Query passed to the similarity search.
        k: Number of results to request; converted with ``int(k)``, so a
            numeric string is accepted.
        docs: Documents to index.
        embeddings: Embeddings instance used to build the FAISS index.

    Returns:
        The result of ``FAISS.similarity_search_with_score``; see
        https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.faiss.FAISS.html#langchain.vectorstores.faiss.FAISS.similarity_search_with_score
    """
    # A fresh FAISS index is built from ``docs`` on every call.
    db = FAISS.from_documents(docs, embeddings)
    similar_docs = db.similarity_search_with_score(query, int(k))
    return similar_docs


def get_summary(current_doc):
    """Summarise a single document with a map-reduce summarisation chain.

    Args:
        current_doc: The document to summarise; it is wrapped in a
            one-element list before being handed to the chain.

    Returns:
        The value returned by ``chain.run``.
    """
    llm = OpenAI(temperature=0)
    # Alternative LLM:
    # llm = HuggingFaceHub(repo_id="bigscience/bloom", model_kwargs={"temperature":1e-10})
    chain = load_summarize_chain(llm, chain_type="map_reduce")
    summary = chain.run([current_doc])
    return summary