import os
import streamlit as st
import pickle
import time
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
# from langchain.vectorstores import FAISS
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEndpoint
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceHub  # HuggingFaceHub lives in langchain_community in recent LangChain releases
from dotenv import load_dotenv

load_dotenv()
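# The Hugging Face API token is read from the environment (a local .env file or a Space
# secret). Illustrative .env entry -- the value below is a placeholder, not a real token:
#   HF_TOKEN_FOR_WEBSEARCH=hf_xxxxxxxxxxxxxxxx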
repo_id = "mistralai/Mistral-7B-Instruct-v0.3"#"mistralai/Mistral-7B-Instruct-v0.3" | |
llm = HuggingFaceHub( | |
repo_id=repo_id, | |
task="text-generation", | |
huggingfacehub_api_token=os.getenv("HF_TOKEN_FOR_WEBSEARCH"), | |
model_kwargs={"temperature": 0.6, | |
"max_tokens":1000} | |
) | |
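# Illustrative sanity check (assumes the token above is valid and the model is reachable):
# calling the LLM directly, e.g. llm("What is retrieval-augmented generation?"),
# returns the raw generated string.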
st.title("LinkWise π") | |
st.sidebar.title("Article URLs") | |
# Initialize session state to store the number of URL inputs
if 'url_count' not in st.session_state:
    st.session_state.url_count = 1  # Start with a single URL field
# Function to add a new URL input
def add_url():
    st.session_state.url_count += 1
# List to store the URLs
urls = []
# Create the URL input fields dynamically
for i in range(st.session_state.url_count):
    url = st.sidebar.text_input(f"URL {i+1}")
    urls.append(url)
# Add a button to increase the number of URLs
st.sidebar.button("Add another URL", on_click=add_url)
process_url_clicked = st.sidebar.button("Submit URLs")
file_path = "faiss_store_db.pkl"
# A single st.empty() slot, reused first for status messages and later for the question box
placeholder = st.empty()
if process_url_clicked:
    # Loading the data
    loader = UnstructuredURLLoader(urls=urls)
    placeholder.text("Data Loading started...")
    data = loader.load()

    # Splitting the data
    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=600,
        chunk_overlap=100
    )
    placeholder.text("Splitting of Data Started...")
    docs = text_splitter.split_documents(data)

    # Creating embeddings and building the FAISS index
    model_name = "sentence-transformers/all-mpnet-base-v2"  # alternative: "sentence-transformers/all-MiniLM-L6-v2"
    hf_embeddings = HuggingFaceEmbeddings(model_name=model_name)
    placeholder.text("Started Building Embedded Vector...")
    vector_index = FAISS.from_documents(docs, hf_embeddings)

    # Saving the index in the FAISS store on disk
    with open(file_path, 'wb') as f:
        pickle.dump(vector_index, f)
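    # Note on persistence (general FAISS wrapper API, not specific to this app): LangChain's
    # FAISS class also provides vector_index.save_local(<dir>) and FAISS.load_local(<dir>, embeddings)
    # as an alternative to pickling the whole object.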
query = placeholder.text_input("Question :")
submit = st.button("Submit")
if query:
    if os.path.exists(file_path):
        with open(file_path, 'rb') as f:
            vector_index = pickle.load(f)

        retrieval_qa = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",  # 'stuff', 'map_reduce', or 'refine' depending on your use case
            retriever=vector_index.as_retriever()
        )
        result = retrieval_qa({'query': query})
        text = result['result']

        # Extract everything after "\nHelpful Answer:" if it exists
        start_index = text.find("\nHelpful Answer:")
        if start_index != -1:
            parsed_text = text[start_index + len("\nHelpful Answer:"):].strip()
        else:
            parsed_text = text.strip()  # fall back to the full model output

        if query or submit:
            st.header("Answer :")
            st.write(parsed_text)
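# To run this app locally (assuming the file is saved as app.py):
#   streamlit run app.py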