Spaces:
Running
Running
File size: 3,590 Bytes
6b33c23 13d8e3b 6b33c23 13d8e3b 6b33c23 13d8e3b 9a29948 13d8e3b 83aaa54 cb56324 13d8e3b 2b2f2f5 de2c24b 24a6666 4e48e34 cb56324 13d8e3b fe46ac0 13d8e3b 79a6421 13d8e3b cb56324 13d8e3b 79a6421 13d8e3b 79a6421 13d8e3b 6b33c23 13d8e3b 6b33c23 13d8e3b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
import streamlit as st
from langchain_community.document_loaders import PyPDFDirectoryLoader
from pypdf import PdfReader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from pinecone import Pinecone as PineconeClient
from langchain.chains.question_answering import load_qa_chain
from datetime import datetime
from langchain_community.vectorstores import Pinecone
import os
import time
def get_pdf_text(pdf_doc):
    """Extract and concatenate the text of every page of a PDF.

    Args:
        pdf_doc: A path or binary file-like object readable by pypdf
            (here: a Streamlit uploaded file).

    Returns:
        str: All page texts joined together ("" for a PDF with no
        extractable text).
    """
    pdf_reader = PdfReader(pdf_doc)
    # extract_text() may return None for pages with no extractable text
    # (the original `text +=` would crash on that); `or ""` guards it,
    # and join() avoids quadratic string concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
def create_docs(user_pdf_list, unique_id):
    """Wrap each uploaded PDF's full text in a langchain Document.

    Args:
        user_pdf_list: Iterable of Streamlit uploaded files (each exposes
            .name, .type and .size).
        unique_id: Identifier stamped into every document's metadata so a
            batch of uploads can be traced back to one session.

    Returns:
        list[Document]: One Document per PDF, page_content = full text.
    """
    docs = []
    for pdf_file in user_pdf_list:
        text = get_pdf_text(pdf_file)
        docs.append(Document(
            page_content=text,
            metadata={
                "name": pdf_file.name,
                # BUG FIX: the key was misspelled "type=" (trailing '='),
                # making the MIME type unreachable under the intended key.
                "type": pdf_file.type,
                "size": pdf_file.size,
                "unique_id": unique_id,
                "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            },
        ))
    return docs
# transform documents
def split_docs(documents, chunk_size=400, chunk_overlap=20):
    """Split documents into overlapping chunks sized for embedding.

    Args:
        documents: Iterable of langchain Documents.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between adjacent chunks.

    Returns:
        list[Document]: The chunked documents.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)
def get_embeddings():
    """Return a fresh OpenAIEmbeddings instance with default settings."""
    return OpenAIEmbeddings()
def push_to_pinecone(docs, embedding, namespace):
    """Replace the contents of a Pinecone namespace with freshly embedded docs.

    Wipes the target namespace (if it exists), then embeds and upserts
    `docs` via langchain's Pinecone wrapper.

    Args:
        docs: Chunked langchain Documents to index.
        embedding: Embedding model used by Pinecone.from_documents.
        namespace: Pinecone namespace to (re)populate.

    Returns:
        The langchain Pinecone vector store wrapping the populated index.
    """
    pc = PineconeClient(api_key=os.environ.get("PINECONE_API_KEY"))
    index_name = os.environ.get("PINECONE_INDEX_NAME")
    index = pc.Index(index_name)

    # BUG FIX: the `namespace` parameter was accepted but ignored —
    # 'rag_bot' was hard-coded in both the delete and the upsert.
    stats = index.describe_index_stats()
    if namespace in stats["namespaces"]:
        # Clear stale vectors from a previous upload before re-indexing.
        index.delete(delete_all=True, namespace=namespace)

    # Debug aid: show the documents about to be indexed in the UI.
    with st.expander("click here:"):
        for doc in docs:
            st.write(doc)
        st.write(docs)

    # from_documents embeds and upserts in one call. The previous version
    # also embedded every chunk manually with embed_query into a `vector`
    # list that was never upserted — doubling embedding API cost for
    # nothing; that dead code is removed.
    index = Pinecone.from_documents(docs, embedding, index_name=index_name, namespace=namespace)
    st.sidebar.write("This 35 seconds delay was added Manually... \n(because I'm using some free resources)")
    time.sleep(35)
    return index
# Function to pull index data from Pinecone
def pull_from_pinecone(embeddings):
    """Open the existing Pinecone index as a langchain vector store.

    Args:
        embeddings: Embedding model used for query-time encoding.

    Returns:
        A langchain Pinecone vector store bound to the 'rag_bot' namespace.
    """
    api_key = os.environ.get("PINECONE_API_KEY")
    index_name = os.environ.get("PINECONE_INDEX_NAME")
    PineconeClient(api_key=api_key)
    # `Pinecone` here is langchain's vector-store class (see the imports
    # section at the top), not the raw pinecone client.
    return Pinecone.from_existing_index(index_name, embeddings, namespace='rag_bot')
def get_similar_doc(query, embedding, k=2):
    """Return the k documents most similar to `query` from the vector store.

    Args:
        query: The user's question text.
        embedding: Embedding model used to encode the query.
        k: Number of documents to retrieve.

    Returns:
        list[Document]: The top-k matches, scores stripped.
    """
    # DEAD CODE REMOVED: the previous version first opened a raw
    # pc.Index(index_name) handle and then immediately rebound `index`
    # to pull_from_pinecone()'s result — a wasted client round trip.
    index = pull_from_pinecone(embeddings=embedding)
    scored = index.similarity_search_with_score(query, int(k))
    return [doc for doc, _score in scored]
def get_answer(query, embedding, k=2):
    """Answer `query` with a stuff-chain over the k most relevant documents.

    Args:
        query: The user's question.
        embedding: Embedding model used for retrieval.
        k: Number of documents to stuff into the prompt.

    Returns:
        str: The LLM's answer.
    """
    llm = ChatOpenAI(temperature=0.5)
    chain = load_qa_chain(llm, chain_type="stuff")
    # BUG FIX: k was hard-coded to 2 here, silently ignoring the caller's k.
    relevant_docs = get_similar_doc(query, embedding, k=k)
    return chain.run(input_documents=relevant_docs, question=query)
|