rianders committed on
Commit
9831243
1 Parent(s): 4b8c5b3

Functioning pages added

app.py CHANGED
@@ -1,39 +1,44 @@
 import streamlit as st
-from start_page import main as start_page
-
-# Import other pages. Assume each has a main function to run the page.
-from pages.data_source_config import main as data_source_config
-from pages.data_loading import main as data_loading
-# Add imports for other pages similarly...
-
-# Initialize session state for page navigation if not already set
-if 'page' not in st.session_state:
-    st.session_state.page = 'start_page'
-
-# Define a function to change the page
-def change_page(page_name):
-    st.session_state.page = page_name
-
-# Page selection (could also use st.sidebar for these)
-st.sidebar.title("Navigation")
-st.sidebar.button("Start Page", on_click=change_page, args=('start_page',))
-st.sidebar.button("Web and File Resource Configuration", on_click=change_page, args=('file_web_source_collection',))
-st.sidebar.button("Data Source Configuration", on_click=change_page, args=('data_source_config',))
-st.sidebar.button("Data Loading", on_click=change_page, args=('data_loading',))
-# Add buttons for other pages similarly...
-
-# Page dispatch
-if st.session_state.page == 'start_page':
-    start_page()
-elif st.session_state.page == 'data_source_config':
-    data_source_config()
-elif st.session_state.page == 'data_loading':
-    data_loading()
-elif st.session_state.page == 'model_selection':
-    model_selection()
-elif st.session_state.page == 'processing_embedding':
-    processing_embedding()
-
-
-
-# The above could be optimized by mapping page names to functions
+import os
+
+st.set_page_config(page_title='Knowledge Navigator', layout='wide')
+
+def main():
+    st.title('Knowledge Navigator')
+
+    # Button to go back to Data Collection Page
+    if st.button('Go to Data Collection'):
+        st.switch_page('pages/01_data_collection.py')
+
+    # Button to navigate to Data Organization Page and pass data
+    if st.button('Go to Data Organization with Data'):
+        # Navigating to Data Organization Page
+        st.switch_page('pages/02_data_organization.py')
+
+    if st.button('Proceed to Model Selection'):
+        st.switch_page('pages/03_model_selection.py')
+
+    if st.button('Proceed to encoding vector storage'):
+        st.switch_page('pages/04_encoding_storage.py')
+
+    if st.button('Proceed to Q&A Testing'):
+        st.switch_page('pages/05_testing_qa.py')
+
+    # Check if 'data' state variable is defined
+    if 'data' in st.session_state:
+        st.write("Data Available")
+        st.write("Data (URL dataframe) is defined.")
+    else:
+        st.write("Data (URL dataframe) is not defined.")
+
+    # Check if 'docs' state variable is defined
+    if 'docs' in st.session_state:
+        st.write("Docs (fetched and stored data collection) is defined.")
+    else:
+        st.write("Docs (fetched and stored data collection) is not defined.")
+
+    # Render the navigation menu
+    # menu()
+
+if __name__ == '__main__':
+    main()
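The removed block's closing comment suggests mapping page names to functions instead of the if/elif chain. A minimal sketch of that dict-based dispatch, assuming the same page modules and their main entry points as in the removed code:

import streamlit as st
from start_page import main as start_page
from pages.data_source_config import main as data_source_config
from pages.data_loading import main as data_loading

# Map page names to their entry points; adding a page becomes one dict entry
PAGES = {
    'start_page': start_page,
    'data_source_config': data_source_config,
    'data_loading': data_loading,
}

# Look up the current page and call it, falling back to the start page
PAGES.get(st.session_state.get('page', 'start_page'), start_page)()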
pages/01_data_collection.py ADDED
@@ -0,0 +1,97 @@
+import streamlit as st
+
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin, urlparse
+from datetime import datetime
+
+def find_linked_urls_and_title(url):
+    try:
+        response = requests.get(url)
+        if response.status_code == 200:
+            soup = BeautifulSoup(response.text, 'html.parser')
+            links = soup.find_all('a')
+            urls = {link.get('href') for link in links if link.get('href') is not None}
+            title_tag = soup.find('title')
+            page_title = title_tag.text if title_tag else 'No Title Found'
+            return urls, page_title
+        else:
+            st.write(f"Failed to retrieve {url}")
+            return set(), 'No Title Found'
+    except Exception as e:
+        st.write(f"An error occurred with {url}: {e}")
+        return set(), 'No Title Found'
+
+def convert_to_absolute_urls(base_url, links):
+    return {urljoin(base_url, link) if not link.startswith('http') else link for link in links}
+
+def categorize_links(base_url, links):
+    internal_links, external_links = set(), set()
+    for link in links:
+        if urlparse(link).netloc == urlparse(base_url).netloc:
+            internal_links.add(link)
+        else:
+            external_links.add(link)
+    return internal_links, external_links
+
+def display_editable_table(df):
+    edited_df = st.data_editor(data=df, key="data_editor_key", num_rows="dynamic")  # num_rows="dynamic" allows adding/deleting rows
+    return edited_df
+
+def prepare_dataframe(df):
+    if "Ignore" not in df.columns:
+        df["Ignore"] = False  # Initialize all values as False
+    return df
+
+def store_data(df):
+    st.session_state['data'] = df
+
+def main():
+    # menu()
+
+    st.title("Data Source Configuration")
+
+    # Initialize 'scanned_urls' with all columns, including 'Ignore'
+    if 'scanned_urls' not in st.session_state:
+        st.session_state['scanned_urls'] = pd.DataFrame(columns=['URL', 'Type', 'Page Name', 'Scanned DateTime', 'Ignore'])
+
+    st.subheader("Scan Websites for URLs")
+    url_input = st.text_area("Enter URLs to scan, separated by new lines:", "https://fubarlabs.org")
+    url_list = [url.strip() for url in url_input.strip().split('\n') if url.strip()]
+    scan_button_clicked = st.button("Scan URLs")
+
+    if scan_button_clicked:
+        for url in url_list:
+            unique_urls, page_title = find_linked_urls_and_title(url)
+            scan_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+            absolute_urls = convert_to_absolute_urls(url, unique_urls)
+            internal_links, external_links = categorize_links(url, absolute_urls)
+
+            new_entries = pd.DataFrame([(url, 'Internal', page_title, scan_datetime, False) for url in internal_links] +
+                                       [(url, 'External', page_title, scan_datetime, False) for url in external_links],
+                                       columns=['URL', 'Type', 'Page Name', 'Scanned DateTime', 'Ignore'])  # Include 'Ignore' column
+            st.session_state['scanned_urls'] = pd.concat([st.session_state['scanned_urls'], new_entries]).drop_duplicates().reset_index(drop=True)
+        store_data(st.session_state['scanned_urls'])
+
+    if not st.session_state['scanned_urls'].empty:
+        # Prepare the dataframe; this now includes the 'Ignore' column from the start
+        prepared_df = prepare_dataframe(st.session_state['scanned_urls'])
+
+        # Display the editable table with an "Ignore" column
+        edited_df = display_editable_table(prepared_df)
+
+        if edited_df is not None:
+            st.session_state['scanned_urls'] = edited_df
+
+        # Access the edits made to the table
+        if "data_editor_key" in st.session_state:
+            edits = st.session_state["data_editor_key"]
+            st.write("Edits made to the table:")
+            st.write(edits)
+
+    if st.button('Proceed to Data Organization'):
+        st.switch_page('pages/02_data_organization.py')
+
+if __name__ == "__main__":
+    main()
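For reference, the absolute-URL conversion and internal/external split above can be exercised standalone; a minimal sketch with a hypothetical link set:

from urllib.parse import urljoin, urlparse

base_url = "https://fubarlabs.org"
links = {"/events", "https://fubarlabs.org/about", "https://example.com/post"}

# Resolve relative links against the base, as convert_to_absolute_urls does
absolute = {urljoin(base_url, link) if not link.startswith('http') else link for link in links}

# Split on netloc, as categorize_links does
internal = {link for link in absolute if urlparse(link).netloc == urlparse(base_url).netloc}
external = absolute - internal

print(internal)  # {'https://fubarlabs.org/events', 'https://fubarlabs.org/about'}
print(external)  # {'https://example.com/post'}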
pages/02_data_organization.py ADDED
@@ -0,0 +1,75 @@
+# 02_data_organization.py
+import streamlit as st
+from langchain_community.document_loaders import AsyncHtmlLoader
+from langchain.schema import Document
+import json
+from typing import Iterable
+import asyncio
+from urllib.parse import urlparse
+
+# Async fetch function
+async def fetch_documents(urls):
+    loader = AsyncHtmlLoader(urls)
+    docs = await loader.aload()
+    return docs
+
+def save_docs_to_jsonl(array: Iterable[Document], file_path: str) -> None:
+    with open(file_path, 'w') as jsonl_file:
+        for doc in array:
+            if hasattr(doc, 'to_dict'):
+                jsonl_file.write(json.dumps(doc.to_dict()) + '\n')
+            else:
+                jsonl_file.write(json.dumps(doc.__dict__) + '\n')
+
+def load_docs_from_jsonl(file_path) -> Iterable[Document]:
+    array = []
+    with open(file_path, 'r') as jsonl_file:
+        for line in jsonl_file:
+            data = json.loads(line)
+            obj = Document(**data)
+            array.append(obj)
+    return array
+
+def is_valid_url(url):
+    try:
+        result = urlparse(url)
+        return all([result.scheme, result.netloc])
+    except ValueError:
+        return False
+
+def fetch_clean_organize_page():
+    st.title("Fetch, Clean, and Organize Documents")
+
+    # Check if 'data' exists in the session state
+    if 'data' not in st.session_state:
+        st.warning("No data found. Please go back to the previous page and scan URLs first.")
+        return
+
+    data = st.session_state['data']
+    st.write("URLs to fetch and clean:")
+    st.write(data)
+
+    # Filter out URLs marked as "Ignore" and invalid URLs
+    valid_urls = data[(data['Ignore'] == False) & (data['URL'].apply(is_valid_url))]['URL'].tolist()
+
+    if st.button("Fetch Documents"):
+        docs = asyncio.run(fetch_documents(valid_urls))
+        st.session_state['docs'] = docs
+        st.write(f"Fetched {len(st.session_state['docs'])} documents.")
+
+    if 'docs' in st.session_state:
+        if st.button("Save Documents as JSON"):
+            save_docs_to_jsonl(st.session_state['docs'], "documents.jsonl")
+            st.success("Documents saved as JSON.")
+
+            # Provide download link (streamlit >= 0.88.0)
+            with open("documents.jsonl", "rb") as file:
+                btn = st.download_button(
+                    label="Download JSON",
+                    data=file,
+                    file_name="documents.jsonl",
+                    mime="application/octet-stream"
+                )
+
+# Assuming this function is called in your app
+fetch_clean_organize_page()
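As a sanity check, the JSONL round trip above can be exercised outside the app; a minimal sketch, assuming a pydantic-v1-style Document whose .dict() fields serialize cleanly (the page code probes to_dict/__dict__ instead):

from langchain.schema import Document
import json

docs = [Document(page_content="hello world", metadata={"source": "https://fubarlabs.org"})]

# Write one JSON object per line, mirroring save_docs_to_jsonl
with open("documents.jsonl", "w") as f:
    for doc in docs:
        f.write(json.dumps(doc.dict()) + "\n")

# Read back, mirroring load_docs_from_jsonl
with open("documents.jsonl") as f:
    loaded = [Document(**json.loads(line)) for line in f]

assert loaded[0].page_content == "hello world"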
pages/03_model_selection.py ADDED
@@ -0,0 +1,50 @@
+import streamlit as st
+
+st.title('Model Selection')
+
+# Introduction
+st.write("Select the embedding model and the large language model (LLM) for processing.")
+
+# Embedding Model Selection
+embedding_models = ["thenlper/gte-small", "sentence-transformers/all-MiniLM-L6-v2", "other"]
+selected_embedding_model = st.selectbox("Select Embedding Model", options=embedding_models)
+
+# LLM Model Selection
+llm_models = ["mistralai/Mistral-7B-Instruct-v0.2", "gpt-3.5-turbo", "other"]
+selected_llm_model = st.selectbox("Select LLM Model", options=llm_models)
+
+# Display selections (for demonstration)
+st.write("Selected Embedding Model:", selected_embedding_model)
+st.write("Selected LLM Model:", selected_llm_model)
+
+# Configuration options for the selected models
+st.header("Model Configuration")
+
+# Embedding Model Configuration (example)
+if selected_embedding_model == "thenlper/gte-small":
+    # Placeholder for model-specific configuration options
+    st.write("No additional configuration required for this model.")
+else:
+    # Configuration for other models
+    st.write("Configuration options for other models will appear here.")
+
+# LLM Model Configuration (example)
+if selected_llm_model == "mistralai/Mistral-7B-Instruct-v0.2":
+    max_tokens = st.slider("Max Tokens", min_value=100, max_value=1000, value=250)
+    temperature = st.slider("Temperature", min_value=0.0, max_value=1.0, value=0.7, step=0.01)
+else:
+    # Configuration for other models; fall back to defaults so saving below
+    # never hits a NameError when no sliders were rendered
+    max_tokens, temperature = 250, 0.7
+    st.write("Configuration options for other models will appear here.")
+
+# Save model selections and configurations
+if st.button("Save Model Configuration"):
+    st.session_state['selected_embedding_model'] = selected_embedding_model
+    st.session_state['selected_llm_model'] = selected_llm_model
+
+    # Assuming configurations are more complex and vary per model, you might want to store them differently
+    st.session_state['llm_model_config'] = {"max_tokens": max_tokens, "temperature": temperature}
+
+    st.success("Model configurations saved.")
+
+if st.button('Proceed to encoding vector storage'):
+    st.switch_page('pages/04_encoding_storage.py')
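The comment about storing configurations differently per model could look like the following; a minimal sketch, where `llm_model_configs` is a hypothetical session key, not one the pages above use:

import streamlit as st

selected_llm_model = "mistralai/Mistral-7B-Instruct-v0.2"  # would come from the selectbox

# Keep one settings dict per model, so switching models never
# clobbers another model's saved configuration
if 'llm_model_configs' not in st.session_state:
    st.session_state['llm_model_configs'] = {}

st.session_state['llm_model_configs'][selected_llm_model] = {
    "max_tokens": 250,
    "temperature": 0.7,
}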
pages/04_encoding_storage.py ADDED
@@ -0,0 +1,122 @@
+import streamlit as st
+from langchain_community.vectorstores import FAISS
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.llms import HuggingFaceEndpoint
+from langchain.schema import Document
+import json
+from typing import Iterable
+import os
+from datetime import datetime
+import zipfile
+import tempfile
+
+def save_docs_to_jsonl(array: Iterable[Document], file_path: str) -> None:
+    with open(file_path, 'w') as jsonl_file:
+        for doc in array:
+            jsonl_file.write(doc.json() + '\n')
+
+def load_docs_from_jsonl(file) -> Iterable[Document]:
+    array = []
+    for line in file:
+        data = json.loads(line.decode('utf-8'))
+        obj = Document(**data)
+        array.append(obj)
+    return array
+
+st.title('Encoding and Storage')
+
+# Create output directory
+start_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+OUTPUT_DIR = "./out"
+
+# Check if the directory exists, and if not, create it
+if not os.path.exists(OUTPUT_DIR):
+    os.makedirs(OUTPUT_DIR)
+    st.write(f"Directory '{OUTPUT_DIR}' was created.")
+else:
+    st.write(f"Directory '{OUTPUT_DIR}' already exists.")
+
+# Allow the user to upload the JSONL file if missing
+if 'docs' not in st.session_state:
+    st.write("Document collection not found in session state.")
+    uploaded_file = st.file_uploader("Upload JSONL file", type=["jsonl"])
+    if uploaded_file is not None:
+        try:
+            docs = load_docs_from_jsonl(uploaded_file)
+            st.session_state['docs'] = docs
+            st.write(f"Loaded {len(docs)} documents from the uploaded file.")
+        except Exception as e:
+            st.error(f"Error loading JSONL file: {str(e)}")
+else:
+    docs = st.session_state['docs']
+    st.write(f"Loaded {len(docs)} documents from the session state.")
+
+# Show the embedding model
+EMBEDDING_MODEL_NAME = st.session_state.get('selected_embedding_model', "thenlper/gte-small")
+st.write(f"Selected Embedding Model: {EMBEDDING_MODEL_NAME}")
+
+# Allow the user to select the device (GPU or CPU)
+device_form = st.form(key='device_form')
+device = device_form.radio("Select Device", ("CUDA", "CPU"))
+submit_device = device_form.form_submit_button(label='Submit Device')
+
+if submit_device:
+    # Set up the embedding model
+    embedding_model = HuggingFaceEmbeddings(
+        model_name=EMBEDDING_MODEL_NAME,
+        multi_process=True,
+        model_kwargs={"device": device.lower()},
+        encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
+    )
+
+    # Show the configuration
+    st.write("Embedding Model Configuration:")
+    st.write(embedding_model)
+
+    # Start the encoding
+    if 'docs' in st.session_state:
+        progress_bar = st.progress(0)
+        total_docs = len(docs)
+
+        collection_vectorstore = FAISS.from_documents(docs, embedding=embedding_model)
+        st.session_state['collection_vectorstore'] = collection_vectorstore
+
+        for i in range(total_docs):
+            progress_bar.progress((i + 1) / total_docs)
+
+        st.write("Encoding completed.")
+    else:
+        st.write("No documents found in the session state.")
+
+# Allow saving and downloading the configuration
+if st.button("Save and Download Configuration"):
+    if 'collection_vectorstore' in st.session_state:
+        collection_vectorstore = st.session_state['collection_vectorstore']
+        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+        zip_filename = f"docs_vectors_{timestamp}.zip"
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            collection_vectorstore.save_local(f"{temp_dir}/docs_vectors")
+
+            with zipfile.ZipFile(zip_filename, "w") as zip_file:
+                for root, _, files in os.walk(temp_dir):
+                    for file in files:
+                        file_path = os.path.join(root, file)
+                        zip_file.write(file_path, os.path.relpath(file_path, temp_dir))
+
+        with open(zip_filename, "rb") as zip_file:
+            zip_bytes = zip_file.read()
+
+        st.download_button(
+            label="Download Configuration",
+            data=zip_bytes,
+            file_name=zip_filename,
+            mime="application/zip",
+        )
+
+        st.success("Configuration saved and downloaded.")
+    else:
+        st.warning("No vector store found. Please make sure the encoding is completed.")
+
+if st.button('Proceed to Q&A Testing'):
+    st.switch_page('pages/05_testing_qa.py')
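For reference, the saved index can be reloaded the same way the Q&A page does; a minimal sketch, assuming the downloaded zip has been extracted so that the docs_vectors folder sits on disk:

from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(model_name="thenlper/gte-small")

# load_local mirrors save_local; the flag opts in to pickle deserialization
vectorstore = FAISS.load_local(
    "docs_vectors",
    embeddings=embedding_model,
    allow_dangerous_deserialization=True,
)

retriever = vectorstore.as_retriever()
print(retriever.invoke("What is the Knowledge Navigator?"))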
pages/05_testing_qa.py ADDED
@@ -0,0 +1,111 @@
+import streamlit as st
+from langchain_community.vectorstores import FAISS
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.llms import HuggingFaceEndpoint
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import RunnablePassthrough
+from langchain_core.output_parsers import StrOutputParser
+import tempfile
+import zipfile
+import os
+
+st.title('Testing and QA')
+
+# Dynamically load the selected models from the session state
+EMBEDDING_MODEL_NAME = st.session_state.get('selected_embedding_model', "thenlper/gte-small")
+LLM_MODEL_NAME = st.session_state.get('selected_llm_model', "mistralai/Mistral-7B-Instruct-v0.2")
+
+# Initialization block for embedding_model, with a debug message
+if 'embedding_model' not in st.session_state:
+    st.session_state['embedding_model'] = HuggingFaceEmbeddings(
+        model_name=EMBEDDING_MODEL_NAME,
+        multi_process=True,
+        model_kwargs={"device": "cpu"},
+        encode_kwargs={"normalize_embeddings": True},
+    )
+    st.info("embedding_model has been initialized.")  # Debug message for initialization
+else:
+    st.info("embedding_model was already initialized.")  # Debug message if already initialized
+
+# Now that we've ensured embedding_model is initialized, we can safely access it
+embedding_model = st.session_state['embedding_model']
+st.write("Accessing embedding_model...")  # Debug message for accessing
+
+# Form for LLM settings, allowing dynamic model selection
+with st.form("llm_settings_form"):
+    st.subheader("LLM Settings")
+    repo_id = st.text_input("Repo ID", value=LLM_MODEL_NAME, key="repo_id")
+    max_new_tokens = st.number_input("Max New Tokens", value=250, key="max_new_tokens")
+    top_k = st.number_input("Top K", value=3, key="top_k")
+    top_p = st.number_input("Top P", value=0.95, key="top_p")
+    typical_p = st.number_input("Typical P", value=0.95, key="typical_p")
+    temperature = st.number_input("Temperature", value=0.01, key="temperature")
+    repetition_penalty = st.number_input("Repetition Penalty", value=1.035, key="repetition_penalty")
+
+    submitted = st.form_submit_button("Update LLM Settings")
+    if submitted:
+        st.session_state['llm'] = HuggingFaceEndpoint(
+            repo_id=repo_id,
+            max_new_tokens=max_new_tokens,
+            top_k=top_k,
+            top_p=top_p,
+            typical_p=typical_p,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+        )
+        st.success("LLM settings updated.")
+
+# Vector store upload and setup
+if 'collection_vectorstore' not in st.session_state:
+    uploaded_file = st.file_uploader("Upload Vector Store ZIP", type=["zip"])
+    if uploaded_file is not None:
+        with tempfile.TemporaryDirectory() as temp_dir:
+            with zipfile.ZipFile(uploaded_file, 'r') as zip_ref:
+                zip_ref.extractall(temp_dir)
+            docs_vectors_path = os.path.join(temp_dir, "docs_vectors")
+            st.session_state['collection_vectorstore'] = FAISS.load_local(docs_vectors_path, embeddings=embedding_model, allow_dangerous_deserialization=True)
+            st.success("Vector store uploaded and loaded successfully.")
+
+            # Create the retriever as soon as the vector store is created
+            st.session_state['retriever'] = st.session_state['collection_vectorstore'].as_retriever()
+            st.info("Retriever has been created.")  # Debug message to confirm the retriever's creation
+
+# Check if LLM and vector store are ready
+if 'llm' in st.session_state and 'collection_vectorstore' in st.session_state:
+    # Ensure there's a default prompt template
+    if 'prompt_template' not in st.session_state:
+        st.session_state['prompt_template'] = "You are a knowledgeable assistant answering the following question based on the provided documents: {context} Question: {question}"
+
+    # Display the current template for editing
+    current_template = st.text_area("Edit Prompt Template", value=st.session_state['prompt_template'], key="current_prompt_template")
+
+    # Persist the edits only when the button is pressed; rendering the text
+    # area first ensures the button never overwrites the template with an
+    # empty value
+    if st.button("Update Prompt Template"):
+        st.session_state['prompt_template'] = current_template
+        st.success("Prompt template updated.")
+
+    # Question input and processing
+    question = st.text_input("Enter your question", key="question_input")
+
+    if question:
+        llm = st.session_state['llm']
+        prompt = ChatPromptTemplate.from_template(current_template)
+        retriever = st.session_state['retriever']
+        chain = (
+            {"context": retriever, "question": RunnablePassthrough()}
+            | prompt
+            | llm
+            | StrOutputParser()
+        )
+
+        if st.button("Ask"):
+            result = chain.invoke(question)
+            st.subheader("Answer:")
+            st.write(result)
+else:
+    st.warning("Please configure and submit the LLM settings and ensure the vector store is loaded to ask questions.")
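The chain above follows the standard LCEL retrieval pattern: the leading dict fans the question out to the retriever (filling {context}) and passes it through unchanged (filling {question}) before it reaches the prompt. A minimal sketch of that wiring with a stand-in retriever, so it runs without an endpoint or index:

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

def fake_retriever(question: str) -> str:
    # Stand-in for the FAISS retriever: returns canned "documents"
    return "Doc 1: The Knowledge Navigator scans sites, fetches pages, and indexes them."

prompt = ChatPromptTemplate.from_template(
    "Answer based on the provided documents: {context} Question: {question}"
)

# The dict runs both branches on the same input, then fills the prompt
chain = {"context": RunnableLambda(fake_retriever), "question": RunnablePassthrough()} | prompt

print(chain.invoke("What does the app do?").to_string())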