Spaces:
Runtime error
Runtime error
File size: 3,697 Bytes
0ed03bb 765e236 0ed03bb b21809b 0ed03bb d05386b 0ed03bb d05386b 0ed03bb d05386b b21809b d05386b 0ed03bb d05386b 0ed03bb d05386b 0ed03bb d05386b b21809b d05386b a772baa d05386b b21809b d05386b 765e236 b21809b d05386b 765e236 b21809b d05386b 765e236 b21809b d05386b b21809b d05386b 0ed03bb d05386b b21809b 0ed03bb b21809b d05386b eab485b b21809b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
import os
import streamlit as st
import pickle
from langchain.llms import OpenAI
from langchain.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain
from dotenv import load_dotenv
# Load data from URLs using the UnstructuredURLLoader
def load_data(urls):
    """Fetch and parse the documents found at the given URLs."""
    url_loader = UnstructuredURLLoader(urls=urls)
    documents = url_loader.load()
    return documents
# Split data into manageable chunks for processing
def split_data(data):
    """Break loaded documents into overlapping chunks (~1000 chars, 100 overlap)."""
    splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000,
        chunk_overlap=100,
    )
    chunks = splitter.split_documents(data)
    return chunks
# Generate embeddings for the individual data chunks
def embed_data(individual_chunks):
    """Build a FAISS vector store over the chunks using OpenAI embeddings."""
    # Alternative model: HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    embedding_model = OpenAIEmbeddings()
    vector_store = FAISS.from_documents(individual_chunks, embedding_model)
    return vector_store
# Save the FAISS index to a file for later retrieval
def save_faiss_index(file_path, vector_data):
    """Serialize the vector store to file_path with pickle (binary mode)."""
    serialized = pickle.dumps(vector_data)
    with open(file_path, "wb") as out_file:
        out_file.write(serialized)
# Load the FAISS index from the file
def load_faiss_index(file_path):
    """Deserialize and return the pickled vector store stored at file_path."""
    # NOTE(review): pickle is only safe if this file was produced by this app.
    with open(file_path, 'rb') as in_file:
        raw = in_file.read()
    return pickle.loads(raw)
# Create a retrieval chain for question-answering using the vector store
def retrieval_chain(llm, vector_store):
    """Wire an LLM and a vector store into a sources-aware QA chain."""
    retriever = vector_store.as_retriever()
    return RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=retriever)
# Use the retrieval chain to find and return an answer to a question, along with sources
def find_answer(retrieval_chain, question):
    """Invoke the QA chain on a question; the full result (answer + sources) is returned."""
    query = {"question": question}
    result = retrieval_chain(query)  # return_only_outputs intentionally omitted
    return result
def main():
    """Streamlit entry point: collect article URLs, index them, answer questions.

    Flow: the sidebar gathers up to three URLs; clicking the button loads,
    splits, and embeds the articles into a FAISS index saved to disk; a
    question box then runs retrieval-QA against the saved index.
    """
    load_dotenv()
    # Set up the Streamlit interface
    st.markdown("## ArticleIQ - Smart News Research Assistant π")
    # To collect URLs from user input, increase the range as needed if more are required.
    st.sidebar.title("Articles URLs π")
    urls = [st.sidebar.text_input(f"URL {i+1}") for i in range(3)]
    activate_articleiq = st.sidebar.button("Activate ArticleIQ")
    status_display = st.empty()
    file_path = 'FAISS_Vector_Data.pkl'
    llm = OpenAI(model='gpt-3.5-turbo-instruct', temperature=0.5, max_tokens=500)
    # If the button is clicked, start processing the URLs
    if activate_articleiq:
        # Drop blank inputs so the loader never receives empty URL strings.
        valid_urls = [url for url in urls if url.strip()]
        # Show each status BEFORE its step runs; the original code updated the
        # status only after the step had already finished, so the user never
        # saw progress for the step actually in flight.
        status_display.text('Loading Data β³')
        data = load_data(valid_urls)
        status_display.text('Splitting Data βοΈ')
        individual_chunks = split_data(data)
        status_display.text('Embedding Vectors π₯π€')
        vector_data = embed_data(individual_chunks)
        save_faiss_index(file_path, vector_data)
    # Allow the user to enter a question and get an answer
    question = status_display.text_input('Question: ')
    if question:
        # Only answer once an index has been built and saved.
        if os.path.exists(file_path):
            # NOTE(review): pickle-loading a local cache file — safe only while
            # the file is produced exclusively by this app.
            vector_store = load_faiss_index(file_path)
            retrieval_chain_obj = retrieval_chain(llm, vector_store)
            final_output = find_answer(retrieval_chain_obj, question)
            st.header("IQ's Answer")
            st.write(final_output["answer"])
            # Display the sources for further reading
            sources = final_output.get("sources", '')
            if sources:
                st.subheader("Further reading:")
                for source in sources.split("\n"):
                    st.write(source)
if __name__ == "__main__":
    main()