import os
import pickle
import time

import requests
import streamlit as st
from bs4 import BeautifulSoup
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq


st.title("RockyBot: News Research Tool πŸ“ˆ")
st.sidebar.title("News Article URLs")

# Collect URLs from user input
urls = [st.sidebar.text_input(f"URL {i+1}") for i in range(3)]
process_url_clicked = st.sidebar.button("Process URLs")
file_path = "faiss_store_openai.pkl"
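# Note: despite the "openai" in the filename, this store is built with HuggingFace embeddings below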

main_placeholder = st.empty()
llm = ChatGroq(
    api_key=os.environ["GROQ_API_KEY"],  # This will raise an error if unset
    model_name="llama3-70b-8192"
)

def fetch_web_content(url):
    """Fetches text content from a given URL using BeautifulSoup."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return soup.get_text()
    except Exception as e:
        return f"Error fetching {url}: {str(e)}"

if process_url_clicked:
    main_placeholder.text("Data Loading...Started...βœ…βœ…βœ…")
    
    # Fetch content from URLs
    data = [(url, fetch_web_content(url)) for url in urls if url.strip()]
    
    main_placeholder.text("Data Loading...Completed...βœ…βœ…βœ…")
    
    # Split data into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000
    )
    main_placeholder.text("Text Splitting...Started...βœ…βœ…βœ…")

    docs = []
    for url, text in data:
        split_docs = text_splitter.split_text(text)
        docs.extend([Document(page_content=chunk, metadata={"source": url}) for chunk in split_docs])

    main_placeholder.text("Text Splitting...Completed...βœ…βœ…βœ…")

    # Create embeddings and build the FAISS vector store
    main_placeholder.text("Embedding Vector Started Building...βœ…βœ…βœ…")
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore_huggingface = FAISS.from_documents(docs, embedding_model)
    main_placeholder.text("Embedding Vector Build Completed...βœ…βœ…βœ…")
    time.sleep(2)
    
    # Save the vector store to a pickle file
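    # Depending on the installed langchain/faiss versions, pickling the store directly may fail;
    # FAISS's save_local()/load_local() methods are a common alternative.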
    with open(file_path, "wb") as f:
        pickle.dump(vectorstore_huggingface, f)

# User query input
query = st.text_input("Question: ")
if query:
    if os.path.exists(file_path):
        with open(file_path, "rb") as f:
            vectorstore = pickle.load(f)
            retriever = vectorstore.as_retriever()
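            # QA-with-sources chain: retrieve relevant chunks, then have the LLM answer and cite sources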
            chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=retriever)
            result = chain({"question": query}, return_only_outputs=True)
            
            # Display answer
            st.header("Answer")
            st.write(result["answer"])
            
            # Display sources, if available
            sources = result.get("sources", "").strip()
            if sources:
                st.subheader("Sources:")
                sources_list = sources.split("\n")
                for source in sources_list:
                    st.write(source)
            else:
                st.write("No sources found.")