Spaces:

johnmuchiri
/

anspro1

Sleeping

App Files Files Community

anspro1 / app.py

johnmuchiri

Update app.py

ad29e3b about 1 year ago

raw

history blame contribute delete

No virus

4.43 kB

	import streamlit as st
	import os
	from langchain.chains import RetrievalQA
	from langchain.vectorstores import Chroma, Pinecone
	from langchain.llms import OpenAI
	from langchain.document_loaders import TextLoader
	from langchain.document_loaders import PyPDFLoader
	from langchain.indexes import VectorstoreIndexCreator
	from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
	from langchain.embeddings import OpenAIEmbeddings
	from langchain.vectorstores import Chroma
	from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
	import pinecone

	# Directory (relative to the working directory) where uploaded PDFs are saved.
	SAVE_DIR = "pdf"

	# Page title and a short introduction shown above the app's controls.
	st.header("Question Answering with your PDF file")
	st.write(
	    "Are you interested in chatting with your own documents, whether it is "
	    "a text file, a PDF, or a website? LangChain makes it easy for you to do "
	    "question answering with your documents."
	)
	def qa(file, query, chain_type, k, api_key_pinecode, index_name, environment_pinecode):
	    """Answer ``query`` using the contents of the PDF stored at ``file``.

	    The PDF is loaded, split into chunks of at most 1000 characters with no
	    overlap, embedded with OpenAI embeddings and written to a Pinecone index.
	    The ``k`` most similar chunks are then handed to a ``RetrievalQA`` chain.

	    Args:
	        file: Path of the PDF file to load.
	        query: The question to answer.
	        chain_type: Chain type forwarded to ``RetrievalQA.from_chain_type``
	            (the UI offers 'stuff', 'map_reduce', 'refine', 'map_rerank').
	        k: Number of chunks the retriever returns for the query.
	        api_key_pinecode: Pinecone API key.
	        index_name: Name of the Pinecone index that receives the chunks.
	        environment_pinecode: Pinecone environment name.

	    Returns:
	        The chain output. Because ``return_source_documents=True`` is set,
	        callers read the answer from ``result['result']`` and the retrieved
	        chunks from ``result['source_documents']``.
	    """
	    # Load the PDF and cut it into retrieval-sized chunks.
	    documents = PyPDFLoader(file).load()
	    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
	    chunks = text_splitter.split_documents(documents)

	    # NOTE(review): presumably picks up the OpenAI key from OPENAI_API_KEY,
	    # which the sidebar writes into os.environ -- confirm.
	    embeddings = OpenAIEmbeddings()

	    pinecone.init(
	        api_key=api_key_pinecode,  # find at app.pinecone.io
	        environment=environment_pinecode,  # next to the API key in the console
	    )

	    # from_documents (rather than from_texts on the bare page_content) keeps
	    # each chunk's metadata, so the returned source documents stay traceable.
	    # NOTE(review): every call writes the chunks to the index again; asking
	    # several questions about one PDF likely stores duplicates -- confirm.
	    db = Pinecone.from_documents(chunks, embeddings, index_name=index_name)

	    # Expose the index through a retriever interface.
	    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k})

	    # Build the question-answering chain. The local is deliberately not named
	    # `qa`, which would shadow this function inside its own body.
	    chain = RetrievalQA.from_chain_type(
	        llm=OpenAI(),
	        chain_type=chain_type,
	        retriever=retriever,
	        return_source_documents=True,
	    )
	    result = chain({"query": query})
	    print(result['result'])
	    return result









	# Sidebar: collects the credentials and Pinecone settings used by the rest of
	# the script (apikey2, enviroment_pinecode and index_name are read later on).
	with st.sidebar:
	    st.header('Configurations')
	    st.write("Enter OpenAI API key. This costs $. Set up billing at [OpenAI](https://platform.openai.com/account).")
	    # type="password" masks the secret instead of echoing it on screen.
	    apikey = st.text_input("Enter your OpenAI API Key here", type="password")
	    # Only export a key that was actually entered; writing the empty default
	    # on every rerun would wipe out a key already present in the environment.
	    if apikey:
	        os.environ["OPENAI_API_KEY"] = apikey

	    st.write("Enter Pinecode API key. [Pinecode](https://www.pinecone.io/).")

	    apikey2 = st.text_input("Enter your Pinecone Key here", type="password")

	    enviroment_pinecode = st.text_input("Enter your Pinecone your environment Key")

	    index_name = st.text_input("enter index-name")





	# Main area: the left column collects the PDF and the question, the right
	# column shows the answer.
	left_column, right_column = st.columns(2)

	# Streamlit reruns the whole script on every interaction, so both names need
	# a defined value even when no file was uploaded or the button was not
	# pressed during this run.
	file_path = None
	result = None

	with left_column:
	    # Add a file uploader to the app.
	    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

	    if uploaded_file is not None:
	        # The target directory may not exist yet on a fresh deployment.
	        os.makedirs(SAVE_DIR, exist_ok=True)
	        # The file name comes from the client: keep only its final component
	        # so it cannot point outside SAVE_DIR.
	        file_path = os.path.join(SAVE_DIR, os.path.basename(uploaded_file.name))
	        with open(file_path, "wb") as f:
	            f.write(uploaded_file.getbuffer())
	        st.success(f"File path {file_path}")

	    query = st.text_input("enter your question")
	    chain_type = st.selectbox(
	        'chain type',
	        ('stuff', 'map_reduce', "refine", "map_rerank"))
	    k = st.slider('Number of relevant chunks', 1, 5)

	    if st.button('Loading'):
	        if file_path is None:
	            st.warning("Please upload a PDF file first.")
	        elif not query:
	            st.warning("Please enter a question first.")
	        else:
	            result = qa(file_path, query, chain_type, k, apikey2, index_name, enviroment_pinecode)

	with right_column:
	    st.write("Output of your question")

	    # Only render the answer once a query has actually been run.
	    if result is not None:
	        st.subheader("Result")
	        st.write(result['result'])

	        st.subheader("source_documents")
	        sources = result['source_documents']
	        if sources:
	            st.write(sources[0])