Spaces:
Sleeping
Sleeping
File size: 3,289 Bytes
6b33c23 13d8e3b 6b33c23 13d8e3b 6b33c23 13d8e3b 6b33c23 13d8e3b 6b33c23 13d8e3b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
import streamlit as st
from langchain_community.document_loaders import PyPDFDirectoryLoader
from pypdf import PdfReader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from pinecone import Pinecone as PineconeClient
from langchain.chains.question_answering import load_qa_chain
from datetime import datetime
from langchain_community.vectorstores import Pinecone
import os
import time
def get_pdf_text(pdf_doc):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_doc: The PDF source handed straight to ``pypdf.PdfReader``
            (presumably a path or a binary file-like object such as an
            uploaded file -- confirm against the caller).

    Returns:
        A single string holding the extracted text of all pages with no
        separator inserted between pages.
    """
    pdf_reader = PdfReader(pdf_doc)
    # Join once instead of growing a string with ``+=`` per page. The
    # ``or ""`` keeps a page that yields no text from breaking the join.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
def create_docs(user_pdf_list, unique_id):
    """Build one ``Document`` per uploaded PDF.

    Args:
        user_pdf_list: Iterable of uploaded-file objects. Each one is passed
            to ``get_pdf_text`` and must expose ``name``, ``type`` and
            ``size`` attributes.
        unique_id: Identifier stored in every document's metadata.

    Returns:
        A list of ``Document`` objects whose ``page_content`` is the full
        text of the corresponding PDF and whose metadata records the file
        name, type, size, ``unique_id`` and the creation timestamp.
    """
    docs = []
    for pdf_file in user_pdf_list:
        docs.append(
            Document(
                page_content=get_pdf_text(pdf_file),
                metadata={
                    "name": pdf_file.name,
                    # Key was previously misspelled as "type=".
                    "type": pdf_file.type,
                    "size": pdf_file.size,
                    "unique_id": unique_id,
                    'time': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                },
            )
        )
    return docs
# Split documents into smaller, overlapping chunks
def split_docs(documents, chunk_size=400, chunk_overlap=20):
    """Split documents into smaller chunks.

    Args:
        documents: Documents to split.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        The result of ``split_documents`` on the configured splitter.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)
def get_embeddings():
    """Create and return a default-configured ``OpenAIEmbeddings`` instance."""
    return OpenAIEmbeddings()
def push_to_pinecone(docs, embedding):
    """Replace the contents of the ``rag_bot`` namespace with ``docs``.

    The Pinecone API key and index name are read from the
    ``PINECONE_API_KEY`` and ``PINECONE_INDEX_NAME`` environment variables.

    Args:
        docs: Documents to embed and store.
        embedding: Embedding model passed to ``Pinecone.from_documents``.

    Returns:
        The vector store returned by ``Pinecone.from_documents``.
    """
    pc = PineconeClient(api_key=os.environ.get("PINECONE_API_KEY"))
    index_name = os.environ.get("PINECONE_INDEX_NAME")

    # Clear whatever an earlier upload left in this namespace.
    pc.Index(index_name).delete(delete_all=True, namespace='rag_bot')

    # ``from_documents`` is handed the embedding model and the docs, so no
    # manual per-document ``embed_query`` pass is needed; the earlier one
    # built a list that was never used and embedded every chunk twice.
    vector_store = Pinecone.from_documents(
        docs, embedding, index_name=index_name, namespace='rag_bot'
    )

    # Deliberate pause announced to the user in the sidebar.
    # NOTE(review): presumably gives the index time to become queryable -- confirm.
    st.sidebar.write("This 30 seconds delay is added Manually... \n(because I'm using some free resources)")
    time.sleep(30)
    return vector_store
# Open the existing Pinecone index as a vector store
def pull_from_pinecone(embeddings):
    """Open the existing Pinecone index as a vector store.

    The index name is read from the ``PINECONE_INDEX_NAME`` environment
    variable and the ``rag_bot`` namespace is used.

    Args:
        embeddings: Embedding model passed to ``Pinecone.from_existing_index``.

    Returns:
        The vector store returned by ``Pinecone.from_existing_index``.
    """
    pinecone_index_name = os.environ.get("PINECONE_INDEX_NAME")
    # ``Pinecone`` here is the vector-store class imported from
    # ``langchain_community.vectorstores``, not the raw Pinecone client.
    # No client is constructed here: the previous one was built and
    # discarded without being used.
    # NOTE(review): assumes the vector store picks up PINECONE_API_KEY from
    # the environment on its own -- confirm for the installed version.
    return Pinecone.from_existing_index(
        pinecone_index_name, embeddings, namespace='rag_bot'
    )
def get_similar_doc(query, embedding, k=2):
    """Return the ``k`` stored documents most similar to ``query``.

    Args:
        query: Text to search for.
        embedding: Embedding model forwarded to ``pull_from_pinecone``.
        k: Number of documents to retrieve; coerced with ``int``.

    Returns:
        A list of the matching documents with the similarity scores dropped.
    """
    # The vector store from ``pull_from_pinecone`` is the only handle used;
    # the raw client/index that used to be built first was overwritten
    # before any use.
    vector_store = pull_from_pinecone(embeddings=embedding)
    scored_docs = vector_store.similarity_search_with_score(query, int(k))
    return [doc for doc, _score in scored_docs]
def get_answer(query, embedding, k=2):
    """Answer ``query`` using the most similar stored documents as context.

    Args:
        query: The user's question.
        embedding: Embedding model forwarded to ``get_similar_doc``.
        k: Number of documents to retrieve as context.

    Returns:
        The response produced by running a "stuff" question-answering chain
        over the retrieved documents.
    """
    llm = ChatOpenAI(temperature=0.5)
    chain = load_qa_chain(llm, chain_type="stuff")
    # Forward the caller's ``k``; it was previously hard-coded to 2, so the
    # parameter had no effect.
    relevant_docs = get_similar_doc(query, embedding, k=k)
    return chain.run(input_documents=relevant_docs, question=query)
|