Spaces:

rianders
/

mpi_data_store

Sleeping

App Files Files Community

rianders committed on Mar 24, 2024

Commit

eceebf5

1 Parent(s): 911d489

added pages

Browse files

Files changed (5) hide show

main_page.py +46 -0
pages/data_loading.py +39 -0
pages/data_source_config.py +41 -0
pages/model_selection.py +52 -0
pages/processing_embedding.py +53 -0

main_page.py ADDED Viewed

	@@ -0,0 +1,46 @@

+import streamlit as st
+def main():
+    st.title("Data Processing Interface")
+    # Introduction or project description
+    st.write("""
+    Welcome to the Data Processing Interface! This application facilitates the mining, processing,
+    and embedding of data from public GitHub repositories. Navigate through the sidebar to access
+    different stages of the process.
+    """)
+    # Display the steps and their status
+    st.header("Process Overview")
+    steps = [
+        "Data Source Configuration",
+        "Data Loading",
+        "Model Selection and Configuration",
+        "Processing and Embedding"
+    ]
+    # Placeholder for checking the completion of each step
+    # This part can be updated to reflect the actual status dynamically
+    completion_status = {
+        "Data Source Configuration": False,
+        "Data Loading": False,
+        "Model Selection and Configuration": False,
+        "Processing and Embedding": False
+    }
+    # Display each step and its completion status
+    for step in steps:
+        if completion_status[step]:
+            st.success(f"{step}: Completed ✔️")
+        else:
+            st.warning(f"{step}: Not Completed ❌")
+    st.write("""
+    ### Instructions
+    - Use the **sidebar** to navigate to each step.
+    - Complete each step in sequence to ensure data is correctly processed and embedded.
+    - You can revisit and modify previous steps as needed.
+    """)
+if __name__ == "__main__":
+    main()

pages/data_loading.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import streamlit as st
+def main():
+    st.title("Data Loading")
+    # Introduction or instruction
+    st.write("Select directories and file types to process from the configured data source.")
+    # Assume we have a function `list_repo_directories(repo_url)` that returns a list of directories in the repo
+    # This is a placeholder list to demonstrate UI elements
+    directories = ["src", "docs", "examples", "tests"]  # Example directory names
+    # Directory selection
+    selected_directories = st.multiselect("Select Directories", options=directories, default=directories)
+    if selected_directories:
+        # Save the selected directories for later processing
+        st.session_state['selected_directories'] = selected_directories
+        st.success(f"Selected directories: {', '.join(selected_directories)}")
+    else:
+        st.error("Please select at least one directory.")
+    # File type filtering
+    file_types = ["pdf", "txt", "md"]  # Example file types
+    selected_file_types = st.multiselect("Select File Types to Include", options=file_types, default=file_types)
+    if selected_file_types:
+        # Save the selected file types for later processing
+        st.session_state['selected_file_types'] = selected_file_types
+        st.success(f"Selected file types: {', '.join(selected_file_types)}")
+    else:
+        st.error("Please select at least one file type.")
+    # Optional: Navigation or action buttons
+    # Example: Button to proceed to the next step if this page's task is completed
+    if st.button("Proceed to Model Selection and Configuration"):
+        # Change the page in the session state, assuming you have set up session-based navigation in app.py
+        st.session_state.page = 'model_selection'
+if __name__ == "__main__":
+    main()

pages/data_source_config.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import streamlit as st
+import os
+def main():
+    st.title("Data Source Configuration")
+    # Explanation or introduction
+    st.write("Configure the source from which to mine data, including the GitHub repository and the output directory for storing generated data.")
+    # Repository selection
+    repo_url = st.text_input("GitHub Repository URL", "https://github.com/username/repository")
+    # Validate the URL (basic validation for demonstration)
+    if "github.com" not in repo_url:
+        st.error("Please enter a valid GitHub repository URL.")
+    else:
+        # Assuming validation passed, store the repo URL in session state or proceed with further processing
+        st.session_state['repo_url'] = repo_url
+        st.success("Repository URL saved.")
+    # Output directory selection
+    default_dir = os.path.join(".", "output_data") # Default directory path
+    out_dir = st.text_input("Output Directory for Generated Data", value=default_dir)
+    # Directory existence check (Create if doesn't exist)
+    if st.button("Save Output Directory"):
+        try:
+            os.makedirs(out_dir, exist_ok=True) # Create the directory if it does not exist
+            st.session_state['output_dir'] = out_dir
+            st.success(f"Output directory set to: {out_dir}")
+        except Exception as e:
+            st.error(f"Failed to create the directory: {str(e)}")
+    # Optional: Provide navigation or action buttons
+    # For example, a button to proceed to the next step if this page's task is completed
+    if st.button("Proceed to Data Loading"):
+        # Change the page in the session state, assuming you have set up session-based navigation in app.py
+        st.session_state.page = 'data_loading'
+if __name__ == "__main__":
+    main()

pages/model_selection.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import streamlit as st
+def main():
+    st.title("Model Selection and Configuration")
+    # Introduction
+    st.write("Select the embedding model and the large language model (LLM) for processing.")
+    # Embedding Model Selection
+    embedding_models = ["thenlper/gte-small", "sentence-transformers/all-MiniLM-L6-v2", "other"]
+    selected_embedding_model = st.selectbox("Select Embedding Model", options=embedding_models)
+    # LLM Model Selection
+    llm_models = ["mistralai/Mistral-7B-Instruct-v0.2", "gpt-3.5-turbo", "other"]
+    selected_llm_model = st.selectbox("Select LLM Model", options=llm_models)
+    # Display selections (for demonstration)
+    st.write("Selected Embedding Model:", selected_embedding_model)
+    st.write("Selected LLM Model:", selected_llm_model)
+    # Configuration options for the selected models
+    st.header("Model Configuration")
+    # Embedding Model Configuration (example)
+    if selected_embedding_model == "thenlper/gte-small":
+        # Placeholder for model-specific configuration options
+        st.write("No additional configuration required for this model.")
+    else:
+        # Configuration for other models
+        st.write("Configuration options for other models will appear here.")
+    # LLM Model Configuration (example)
+    if selected_llm_model == "mistralai/Mistral-7B-Instruct-v0.2":
+        max_tokens = st.slider("Max Tokens", min_value=100, max_value=1000, value=250)
+        temperature = st.slider("Temperature", min_value=0.0, max_value=1.0, value=0.7, step=0.01)
+    else:
+        # Configuration for other models
+        st.write("Configuration options for other models will appear here.")
+    # Save model selections and configurations
+    if st.button("Save Model Configuration"):
+        st.session_state['selected_embedding_model'] = selected_embedding_model
+        st.session_state['selected_llm_model'] = selected_llm_model
+        # Assuming configurations are more complex and vary per model, you might want to store them differently
+        st.session_state['llm_model_config'] = {"max_tokens": max_tokens, "temperature": temperature}
+        st.success("Model configurations saved.")
+        # Optional: Proceed to the next step
+        # st.session_state.page = 'processing_embedding'
+if __name__ == "__main__":
+    main()

pages/processing_embedding.py ADDED Viewed

	@@ -0,0 +1,53 @@

+import streamlit as st
+import time  # Used to simulate processing time
+def main():
+    st.title("Processing and Embedding")
+    # Introduction
+    st.write("Process the selected data using the configured models and save the results.")
+    # Start Processing Button
+    if st.button("Start Processing"):
+        with st.spinner("Processing..."):
+            # Simulate processing time
+            time.sleep(5)  # Simulate processing time. Replace with actual processing logic.
+            # Update session state or variables to indicate processing is complete
+            st.session_state['processing_complete'] = True
+            st.success("Processing completed successfully!")
+    # Show progress only if processing has started or completed
+    if 'processing_complete' in st.session_state and st.session_state['processing_complete']:
+        st.progress(100)
+        st.write("Data has been processed and embedded successfully.")
+    else:
+        st.progress(0)
+    # Parameter Tuning Section (Placeholder)
+    st.header("Parameter Tuning")
+    st.write("Adjust processing parameters if necessary. (This section is a placeholder.)")
+    # Saving Options
+    st.header("Save Results")
+    if st.checkbox("Save Preprocessed Pages"):
+        # Placeholder for saving logic
+        st.write("Preprocessed pages will be saved.")
+    if st.checkbox("Save Processed Pages"):
+        # Placeholder for saving logic
+        st.write("Processed pages will be saved.")
+    if st.checkbox("Save Vectors Store Data"):
+        # Placeholder for saving logic
+        st.write("Vectors store data will be saved.")
+    # Optional: Provide navigation to next steps or back to configuration
+    col1, col2 = st.columns(2)
+    with col1:
+        if st.button("Back to Model Selection"):
+            st.session_state.page = 'model_selection'
+    with col2:
+        if st.button("Complete and Exit"):
+            st.session_state.page = 'main_page'  # Assuming you want to navigate back to the main page
+if __name__ == "__main__":
+    main()