Spaces:

keshavan
/

articleiq-smart_news_research_assistant

Runtime error

App Files Files Community

articleiq-smart_news_research_assistant / app.py

keshavan

Update app.py

a772baa verified over 1 year ago

raw

history blame contribute delete

3.7 kB

	import os
	import streamlit as st
	import pickle
	from langchain.llms import OpenAI
	from langchain.document_loaders import UnstructuredURLLoader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.vectorstores import FAISS
	from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
	from langchain.chains import RetrievalQAWithSourcesChain
	from dotenv import load_dotenv

	# Load data from URLs using the UnstructuredURLLoader
	def load_data(urls):
	    """Load the documents found at ``urls``.

	    Args:
	        urls: URLs handed to ``UnstructuredURLLoader``.

	    Returns:
	        Whatever ``UnstructuredURLLoader.load()`` produces for those URLs.
	    """
	    return UnstructuredURLLoader(urls=urls).load()

	# Split data into manageable chunks for processing
	def split_data(data):
	    """Split loaded documents into overlapping chunks.

	    Args:
	        data: Documents to split (as returned by ``load_data``).

	    Returns:
	        The result of ``RecursiveCharacterTextSplitter.split_documents``
	        using 1000-character chunks with a 100-character overlap.
	    """
	    # Separators are tried in this order: paragraph, line, sentence, clause.
	    splitter_options = {
	        "separators": ['\n\n', '\n', '.', ','],
	        "chunk_size": 1000,
	        "chunk_overlap": 100,
	    }
	    chunker = RecursiveCharacterTextSplitter(**splitter_options)
	    return chunker.split_documents(data)

	# Generate embeddings for the individual data chunks
	def embed_data(individual_chunks):
	    """Embed the chunks with OpenAI embeddings and index them in FAISS.

	    Args:
	        individual_chunks: Document chunks to embed.

	    Returns:
	        The vector store built by ``FAISS.from_documents``.
	    """
	    # NOTE: an alternative embedding model considered here was
	    # HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl").
	    return FAISS.from_documents(individual_chunks, OpenAIEmbeddings())

	# Save the FAISS index to a file for later retrieval
	def save_faiss_index(file_path, vector_data):
	    """Pickle ``vector_data`` into ``file_path``.

	    The file is opened in binary write mode, so existing content at that
	    path is replaced.

	    Args:
	        file_path: Destination path of the pickle file.
	        vector_data: Object to serialize (the FAISS vector store in this app).
	    """
	    with open(file_path, "wb") as index_file:
	        pickle.Pickler(index_file).dump(vector_data)

	# Load the FAISS index from the file
	def load_faiss_index(file_path):
	    """Read back the object pickled at ``file_path``.

	    Unpickling can execute arbitrary code, so only point this at files
	    written by this application.

	    Args:
	        file_path: Path of the pickle file to read.

	    Returns:
	        The unpickled object.
	    """
	    with open(file_path, 'rb') as index_file:
	        restored = pickle.Unpickler(index_file).load()
	    return restored

	# Create a retrieval chain for question-answering using the vector store
	def retrieval_chain(llm, vector_store):
	    """Build a question-answering chain that also reports sources.

	    Args:
	        llm: Language model passed to the chain.
	        vector_store: Store whose ``as_retriever()`` supplies the retriever.

	    Returns:
	        The chain created by ``RetrievalQAWithSourcesChain.from_llm``.
	    """
	    retriever = vector_store.as_retriever()
	    return RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=retriever)

	# Use the retrieval chain to find and return an answer to a question, along with sources
	def find_answer(retrieval_chain, question):
	    """Ask ``question`` through the given chain.

	    Args:
	        retrieval_chain: Callable chain; invoked with a single dict argument.
	        question: The question text, sent under the ``"question"`` key.

	    Returns:
	        Whatever the chain returns for that input.
	    """
	    query = {"question": question}
	    # return_only_outputs is deliberately not passed to the chain.
	    return retrieval_chain(query)

	def main():
	    """Run the ArticleIQ Streamlit page.

	    Renders the sidebar URL inputs, rebuilds the FAISS index when the
	    "Activate ArticleIQ" button is pressed, and answers a question against
	    the saved index, listing the sources returned by the chain.
	    """
	    load_dotenv()

	    # Set up the Streamlit interface
	    st.markdown("## ArticleIQ - Smart News Research Assistant 🔍")

	    # Collect URLs from the sidebar; increase the range if more are required.
	    st.sidebar.title("Articles URLs 👇")
	    url_inputs = [st.sidebar.text_input(f"URL {i+1}") for i in range(3)]
	    # Untouched inputs come back as empty strings; keep only real entries.
	    urls = [url.strip() for url in url_inputs if url.strip()]

	    activate_articleiq = st.sidebar.button("Activate ArticleIQ")
	    status_display = st.empty()

	    file_path = 'FAISS_Vector_Data.pkl'

	    # If the button is clicked, start processing the URLs
	    if activate_articleiq:
	        _build_article_index(urls, file_path, status_display)

	    # Allow the user to enter a question and get an answer
	    question = status_display.text_input('Question: ')
	    if not question:
	        return

	    if not os.path.exists(file_path):
	        # Tell the user why nothing happens instead of staying silent.
	        st.warning(
	            "No article index found yet. Enter article URLs in the sidebar "
	            "and click 'Activate ArticleIQ' first."
	        )
	        return

	    # Streamlit reruns this script on every interaction, so the LLM client
	    # is only created once there is actually a question to answer.
	    llm = OpenAI(model='gpt-3.5-turbo-instruct', temperature=0.5, max_tokens=500)
	    vector_store = load_faiss_index(file_path)
	    retrieval_chain_obj = retrieval_chain(llm, vector_store)
	    final_output = find_answer(retrieval_chain_obj, question)
	    st.header("IQ's Answer")
	    st.write(final_output["answer"])

	    # Display the sources for further reading
	    sources = final_output.get("sources", '')
	    if sources:
	        st.subheader("Further reading:")
	        for source in sources.split("\n"):
	            st.write(source)


	def _build_article_index(urls, file_path, status_display):
	    """Load, split, embed and save the articles at ``urls``.

	    Each status message is shown before the step it describes, so the user
	    sees what is currently running rather than what has just finished.
	    """
	    if not urls:
	        st.sidebar.warning("Please enter at least one article URL.")
	        return

	    status_display.text('Loading Data ⏳')
	    data = load_data(urls)

	    status_display.text('Splitting Data ✂️')
	    individual_chunks = split_data(data)
	    if not individual_chunks:
	        # Nothing to embed; do not build or overwrite an index from no text.
	        st.sidebar.error("No text could be extracted from the given URLs.")
	        return

	    status_display.text('Embedding Vectors 📥📤')
	    vector_data = embed_data(individual_chunks)

	    save_faiss_index(file_path, vector_data)

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    main()