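"""ArticleIQ - Smart News Research Assistant.

A Streamlit app that loads news articles from user-supplied URLs, splits them
into chunks, embeds the chunks into a FAISS vector store, and answers
questions about the articles (with sources) through a LangChain
RetrievalQAWithSourcesChain.

Launch with: streamlit run <path-to-this-script>
"""
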
import os
import streamlit as st
import pickle
from langchain.llms import OpenAI
from langchain.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain
from dotenv import load_dotenv
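
# Note: OpenAIEmbeddings and the OpenAI LLM below read OPENAI_API_KEY from the
# environment; load_dotenv() in main() is expected to load it from a local
# .env file (an assumption about the project setup).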

# Load data from URLs using the UnstructuredURLLoader
def load_data(urls):
    loader = UnstructuredURLLoader(urls=urls)
    return loader.load()

# Split data into manageable chunks for processing
def split_data(data):
    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000,
        chunk_overlap=100)
    return text_splitter.split_documents(data)
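
# How the splitter behaves: the separators are tried in order ('\n\n' first),
# falling back to finer ones only when a piece is still longer than
# chunk_size=1000 characters; consecutive chunks share a 100-character overlap.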

# Generate embeddings for the individual data chunks
def embed_data(individual_chunks):
    embeddings = OpenAIEmbeddings()
    # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    return FAISS.from_documents(individual_chunks, embeddings)
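
# Design note: OpenAIEmbeddings calls the OpenAI API (and needs OPENAI_API_KEY),
# whereas the commented-out HuggingFaceInstructEmbeddings alternative runs the
# "hkunlp/instructor-xl" model locally without an API key, typically at the
# cost of a large model download and slower inference.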

# Save the FAISS index to a file for later retrieval
def save_faiss_index(file_path, vector_data):
    with open(file_path, "wb") as fp:
        pickle.dump(vector_data, fp)

# Load the FAISS index from the file
def load_faiss_index(file_path):
    with open(file_path, 'rb') as fp:
        return pickle.load(fp)
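
# Alternative sketch (not used above): recent langchain releases of FAISS also
# offer native persistence, which avoids pickling the whole object:
#   vector_data.save_local("faiss_index")
#   vector_store = FAISS.load_local("faiss_index", OpenAIEmbeddings())
# Depending on the installed version, load_local may additionally require
# allow_dangerous_deserialization=True.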

# Create a retrieval chain for question-answering using the vector store
def retrieval_chain(llm, vector_store):
    return RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vector_store.as_retriever())

# Use the retrieval chain to find and return an answer to a question, along with its sources
def find_answer(chain, question):
    return chain({"question": question})
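# (find_answer returns a dict that includes "answer" and "sources" keys;
#  main() reads both below.)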

def main():
    load_dotenv()
    
    # Set up the Streamlit interface
    st.markdown("## ArticleIQ - Smart News Research Assistant πŸ”")

    # Collect article URLs from the sidebar; increase the range if more inputs are needed.
    st.sidebar.title("Articles URLs 👇")
    urls = [st.sidebar.text_input(f"URL {i+1}") for i in range(3)]
    
    activate_articleiq = st.sidebar.button("Activate ArticleIQ")
    status_display = st.empty()
    
    file_path = 'FAISS_Vector_Data.pkl'
    llm = OpenAI(model='gpt-3.5-turbo-instruct', temperature=0.5, max_tokens=500)
    
    # If the button is clicked, start processing the URLs
    if activate_articleiq:
        status_display.text('Loading Data ⏳')
        data = load_data(urls)

        status_display.text('Splitting Data ✂️')
        individual_chunks = split_data(data)

        status_display.text('Embedding Vectors 📥📤')
        vector_data = embed_data(individual_chunks)

        save_faiss_index(file_path, vector_data)
        
    # Allow the user to enter a question and get an answer
    question = status_display.text_input('Question: ')
    if question:
        if os.path.exists(file_path):
            vector_store = load_faiss_index(file_path)
            retrieval_chain_obj = retrieval_chain(llm, vector_store)
            final_output = find_answer(retrieval_chain_obj, question)
            st.header("IQ's Answer")
            st.write(final_output["answer"])
            
            # Display the sources for further reading
            sources = final_output.get("sources", '')
            if sources:
                st.subheader("Further reading:")
                sources_list = sources.split("\n")
                for source in sources_list:
                    st.write(source)
        else:
            st.warning("No saved FAISS index found. Click 'Activate ArticleIQ' to process the URLs first.")

if __name__ == "__main__":
    main()