Spaces:
Sleeping
Sleeping
added pages
Browse files- main_page.py +46 -0
- pages/data_loading.py +39 -0
- pages/data_source_config.py +41 -0
- pages/model_selection.py +52 -0
- pages/processing_embedding.py +53 -0
main_page.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
def main():
    """Render the landing page: introduction, per-step status, and instructions.

    A step is shown as completed once every ``st.session_state`` key that the
    corresponding page stores is present and truthy; otherwise it is shown as
    not completed.
    """
    st.title("Data Processing Interface")

    # Introduction or project description
    st.write("""
    Welcome to the Data Processing Interface! This application facilitates the mining, processing,
    and embedding of data from public GitHub repositories. Navigate through the sidebar to access
    different stages of the process.
    """)

    st.header("Process Overview")

    # Maps each step (in display order) to the session-state keys written by
    # its page. A single mapping replaces the separate step list and the
    # hard-coded status table, which always reported "Not Completed".
    step_requirements = {
        "Data Source Configuration": ("repo_url", "output_dir"),
        "Data Loading": ("selected_directories", "selected_file_types"),
        "Model Selection and Configuration": (
            "selected_embedding_model",
            "selected_llm_model",
        ),
        "Processing and Embedding": ("processing_complete",),
    }

    # Display each step and its completion status
    for step, required_keys in step_requirements.items():
        # Truthiness (not mere presence) is required so that an empty
        # selection or a False flag does not count as done.
        completed = all(
            key in st.session_state and st.session_state[key]
            for key in required_keys
        )
        if completed:
            st.success(f"{step}: Completed ✔️")
        else:
            st.warning(f"{step}: Not Completed ❌")

    st.write("""
    ### Instructions
    - Use the **sidebar** to navigate to each step.
    - Complete each step in sequence to ensure data is correctly processed and embedded.
    - You can revisit and modify previous steps as needed.
    """)


if __name__ == "__main__":
    main()
|
pages/data_loading.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
def main():
    """Render the data-loading page: directory and file-type selection.

    Non-empty selections are stored in ``st.session_state`` under
    ``'selected_directories'`` and ``'selected_file_types'``. An empty
    selection shows an error and removes the corresponding key so that a
    previous choice cannot linger after the user has cleared it.
    """
    st.title("Data Loading")

    # Introduction or instruction
    st.write("Select directories and file types to process from the configured data source.")

    # Placeholder list to demonstrate the UI; a real implementation would
    # list the directories of the configured repository instead.
    directories = ["src", "docs", "examples", "tests"]  # Example directory names
    _select_and_store(
        label="Select Directories",
        options=directories,
        state_key='selected_directories',
        success_prefix="Selected directories",
        empty_message="Please select at least one directory.",
    )

    # File type filtering
    file_types = ["pdf", "txt", "md"]  # Example file types
    _select_and_store(
        label="Select File Types to Include",
        options=file_types,
        state_key='selected_file_types',
        success_prefix="Selected file types",
        empty_message="Please select at least one file type.",
    )

    # Button to proceed to the next step.
    if st.button("Proceed to Model Selection and Configuration"):
        # NOTE(review): this only records the target page name; nothing in
        # view reads `page`, so navigation depends on code elsewhere -- confirm.
        st.session_state.page = 'model_selection'


def _select_and_store(label, options, state_key, success_prefix, empty_message):
    """Show a multiselect (all options preselected) and mirror it to session state.

    Args:
        label: Label displayed above the multiselect.
        options: Choices offered; all of them are selected by default.
        state_key: ``st.session_state`` key that receives a non-empty choice.
        success_prefix: Text placed before the comma-joined selection.
        empty_message: Error text shown when nothing is selected.

    Returns:
        The list of currently selected options (possibly empty).
    """
    chosen = st.multiselect(label, options=options, default=options)
    if chosen:
        # Save the selection for later processing
        st.session_state[state_key] = chosen
        st.success(f"{success_prefix}: {', '.join(chosen)}")
    else:
        # Drop any earlier selection so stale data is not processed later.
        if state_key in st.session_state:
            del st.session_state[state_key]
        st.error(empty_message)
    return chosen


if __name__ == "__main__":
    main()
|
pages/data_source_config.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import os
|
3 |
+
|
4 |
+
def main():
    """Render the data-source page: repository URL and output directory.

    A URL that passes validation is stored in ``st.session_state['repo_url']``;
    an invalid one shows an error and removes any previously stored value.
    Pressing "Save Output Directory" creates the directory if needed and
    stores its path in ``st.session_state['output_dir']``.
    """
    st.title("Data Source Configuration")

    # Explanation or introduction
    st.write("Configure the source from which to mine data, including the GitHub repository and the output directory for storing generated data.")

    # Repository selection
    repo_url = st.text_input("GitHub Repository URL", "https://github.com/username/repository").strip()

    if _is_github_repo_url(repo_url):
        st.session_state['repo_url'] = repo_url
        st.success("Repository URL saved.")
    else:
        # Remove a previously accepted URL so later steps cannot use a value
        # the user has since replaced with something invalid.
        if 'repo_url' in st.session_state:
            del st.session_state['repo_url']
        st.error("Please enter a valid GitHub repository URL.")

    # Output directory selection
    default_dir = os.path.join(".", "output_data")  # Default directory path
    out_dir = st.text_input("Output Directory for Generated Data", value=default_dir)

    # Create the directory on demand and remember it.
    if st.button("Save Output Directory"):
        target_dir = out_dir.strip()
        if not target_dir:
            st.error("Please enter an output directory.")
        else:
            try:
                os.makedirs(target_dir, exist_ok=True)  # No error if it already exists
            except (OSError, ValueError) as e:
                # OSError: permissions, path is a file, etc.
                # ValueError: e.g. an embedded null byte in the path.
                st.error(f"Failed to create the directory: {e}")
            else:
                st.session_state['output_dir'] = target_dir
                st.success(f"Output directory set to: {target_dir}")

    # Button to proceed to the next step.
    if st.button("Proceed to Data Loading"):
        # NOTE(review): this only records the target page name; nothing in
        # view reads `page`, so navigation depends on code elsewhere -- confirm.
        st.session_state.page = 'data_loading'


def _is_github_repo_url(url):
    """Return True if *url* looks like ``[http(s)://]github.com/<owner>/<repo>``.

    The host is compared exactly (a substring test would accept URLs such as
    ``https://evil.example/github.com``). A missing scheme is tolerated and
    treated as ``https``.
    """
    from urllib.parse import urlparse  # local import keeps the file's import block untouched

    candidate = url.strip()
    if "://" not in candidate:
        candidate = "https://" + candidate
    try:
        parts = urlparse(candidate)
        host = (parts.hostname or "").lower()
    except ValueError:
        # urlparse rejects some malformed network locations.
        return False
    if parts.scheme not in ("http", "https"):
        return False
    if host not in ("github.com", "www.github.com"):
        return False
    # Require both an owner and a repository name in the path.
    segments = [segment for segment in parts.path.split("/") if segment]
    return len(segments) >= 2


if __name__ == "__main__":
    main()
|
pages/model_selection.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
def main():
    """Render the model-selection page and store the chosen models.

    Pressing "Save Model Configuration" writes the selected embedding model,
    the selected LLM, and the LLM's configuration dict to
    ``st.session_state``. The configuration dict is empty for LLMs that have
    no configuration widgets yet.
    """
    st.title("Model Selection and Configuration")

    # Introduction
    st.write("Select the embedding model and the large language model (LLM) for processing.")

    # Embedding Model Selection
    embedding_models = ["thenlper/gte-small", "sentence-transformers/all-MiniLM-L6-v2", "other"]
    selected_embedding_model = st.selectbox("Select Embedding Model", options=embedding_models)

    # LLM Model Selection
    llm_models = ["mistralai/Mistral-7B-Instruct-v0.2", "gpt-3.5-turbo", "other"]
    selected_llm_model = st.selectbox("Select LLM Model", options=llm_models)

    # Display selections (for demonstration)
    st.write("Selected Embedding Model:", selected_embedding_model)
    st.write("Selected LLM Model:", selected_llm_model)

    # Configuration options for the selected models
    st.header("Model Configuration")

    # Embedding Model Configuration (example)
    if selected_embedding_model == "thenlper/gte-small":
        # Placeholder for model-specific configuration options
        st.write("No additional configuration required for this model.")
    else:
        # Configuration for other models
        st.write("Configuration options for other models will appear here.")

    # LLM Model Configuration (example).
    # Always bind the config dict: previously the slider values existed only
    # in the Mistral branch, so saving with any other LLM raised
    # UnboundLocalError.
    llm_model_config = {}
    if selected_llm_model == "mistralai/Mistral-7B-Instruct-v0.2":
        llm_model_config["max_tokens"] = st.slider("Max Tokens", min_value=100, max_value=1000, value=250)
        llm_model_config["temperature"] = st.slider("Temperature", min_value=0.0, max_value=1.0, value=0.7, step=0.01)
    else:
        # Configuration for other models
        st.write("Configuration options for other models will appear here.")

    # Save model selections and configurations
    if st.button("Save Model Configuration"):
        st.session_state['selected_embedding_model'] = selected_embedding_model
        st.session_state['selected_llm_model'] = selected_llm_model
        st.session_state['llm_model_config'] = llm_model_config
        st.success("Model configurations saved.")

    # TODO: optionally proceed to the next step ('processing_embedding') here.


if __name__ == "__main__":
    main()
|
pages/processing_embedding.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import time # Used to simulate processing time
|
3 |
+
|
4 |
+
def main():
    """Render the processing page: run button, progress bar, save options, navigation."""
    st.title("Processing and Embedding")

    # Short description of what this page does.
    st.write("Process the selected data using the configured models and save the results.")

    # Kick off the (currently simulated) processing run.
    start_clicked = st.button("Start Processing")
    if start_clicked:
        with st.spinner("Processing..."):
            # Stand-in for the real work; replace with actual processing logic.
            time.sleep(5)

        # Remember across reruns that processing has finished.
        st.session_state['processing_complete'] = True
        st.success("Processing completed successfully!")

    # Full bar plus a confirmation once processing is flagged complete,
    # otherwise an empty bar.
    finished = 'processing_complete' in st.session_state and st.session_state['processing_complete']
    st.progress(100 if finished else 0)
    if finished:
        st.write("Data has been processed and embedded successfully.")

    # Parameter tuning (placeholder section).
    st.header("Parameter Tuning")
    st.write("Adjust processing parameters if necessary. (This section is a placeholder.)")

    # Saving options: one checkbox per artifact, each with a confirmation
    # line. The actual saving logic is still a placeholder.
    st.header("Save Results")
    save_options = (
        ("Save Preprocessed Pages", "Preprocessed pages will be saved."),
        ("Save Processed Pages", "Processed pages will be saved."),
        ("Save Vectors Store Data", "Vectors store data will be saved."),
    )
    for checkbox_label, confirmation in save_options:
        if st.checkbox(checkbox_label):
            st.write(confirmation)

    # Navigation buttons, side by side.
    back_column, exit_column = st.columns(2)
    with back_column:
        if st.button("Back to Model Selection"):
            st.session_state.page = 'model_selection'
    with exit_column:
        if st.button("Complete and Exit"):
            # Records the main page as the navigation target.
            st.session_state.page = 'main_page'


if __name__ == "__main__":
    main()
|