rianders committed on
Commit
eceebf5
1 Parent(s): 911d489

added pages

Browse files
main_page.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
def main():
    """Render the landing page of the Data Processing Interface.

    Shows the title, a short introduction, an overview of the four pipeline
    steps with a per-step completion indicator, and usage instructions.

    A step is reported as completed when every ``st.session_state`` key
    listed for it below holds a truthy value. The keys are the ones written
    by the step pages (``repo_url``/``output_dir``,
    ``selected_directories``/``selected_file_types``,
    ``selected_embedding_model``/``selected_llm_model`` and
    ``processing_complete``). Previously every step was hard-coded to
    "Not Completed", which made the success branch unreachable.
    """
    st.title("Data Processing Interface")

    # Introduction or project description
    st.write("""
    Welcome to the Data Processing Interface! This application facilitates the mining, processing,
    and embedding of data from public GitHub repositories. Navigate through the sidebar to access
    different stages of the process.
    """)

    # Display the steps and their status
    st.header("Process Overview")

    # Maps each step (in display order) to the session-state keys that must
    # all be set before the step counts as done.
    required_state_keys = {
        "Data Source Configuration": ("repo_url", "output_dir"),
        "Data Loading": ("selected_directories", "selected_file_types"),
        "Model Selection and Configuration": (
            "selected_embedding_model",
            "selected_llm_model",
        ),
        "Processing and Embedding": ("processing_complete",),
    }

    # Display each step and its completion status
    for step, keys in required_state_keys.items():
        # Truthiness (not mere presence) so an empty selection or a False
        # flag is still shown as outstanding.
        if all(st.session_state.get(key) for key in keys):
            st.success(f"{step}: Completed ✔️")
        else:
            st.warning(f"{step}: Not Completed ❌")

    st.write("""
    ### Instructions
    - Use the **sidebar** to navigate to each step.
    - Complete each step in sequence to ensure data is correctly processed and embedded.
    - You can revisit and modify previous steps as needed.
    """)


if __name__ == "__main__":
    main()
pages/data_loading.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
def main():
    """Render the "Data Loading" page.

    Lets the user pick the directories and file types to process. Each
    non-empty selection is stored in ``st.session_state`` under
    ``'selected_directories'`` / ``'selected_file_types'``; an empty
    selection shows an error and leaves the session state untouched.
    A final button sets ``st.session_state.page`` to ``'model_selection'``.
    """
    st.title("Data Loading")

    # Short instruction shown under the title
    st.write("Select directories and file types to process from the configured data source.")

    # Example directory names only. The original note anticipates a function
    # `list_repo_directories(repo_url)` supplying the real list later.
    available_dirs = ["src", "docs", "examples", "tests"]

    # Directory selection (everything is pre-selected)
    chosen_dirs = st.multiselect(
        "Select Directories",
        options=available_dirs,
        default=available_dirs,
    )
    if not chosen_dirs:
        st.error("Please select at least one directory.")
    else:
        # Keep the choice around for the later processing steps
        st.session_state['selected_directories'] = chosen_dirs
        dir_summary = ', '.join(chosen_dirs)
        st.success(f"Selected directories: {dir_summary}")

    # File type filtering (example file types, all pre-selected)
    available_types = ["pdf", "txt", "md"]
    chosen_types = st.multiselect(
        "Select File Types to Include",
        options=available_types,
        default=available_types,
    )
    if not chosen_types:
        st.error("Please select at least one file type.")
    else:
        # Keep the choice around for the later processing steps
        st.session_state['selected_file_types'] = chosen_types
        type_summary = ', '.join(chosen_types)
        st.success(f"Selected file types: {type_summary}")

    # Optional navigation: records the requested page in the session state.
    # This only has an effect if the app implements session-based navigation
    # (the original comment assumes this is done in app.py).
    proceed_clicked = st.button("Proceed to Model Selection and Configuration")
    if proceed_clicked:
        st.session_state.page = 'model_selection'


if __name__ == "__main__":
    main()
pages/data_source_config.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+
4
def _is_github_repo_url(url):
    """Return True when *url* looks like a repository location on github.com.

    Accepted forms are ``http(s)://``, ``ssh://`` and ``git://`` URLs, the
    scheme-less ``github.com/owner/repo`` and the SCP-like
    ``git@github.com:owner/repo``. The host must be exactly ``github.com``
    (optionally ``www.``) and the path must contain at least an owner and a
    repository segment, so a URL that merely mentions "github.com" somewhere
    is rejected.
    """
    from urllib.parse import urlparse

    candidate = url.strip()
    ssh_prefix = "git@github.com:"
    if candidate.startswith(ssh_prefix):
        # Normalise the SCP-like SSH form so urlparse can split host and path.
        candidate = "https://github.com/" + candidate[len(ssh_prefix):]
    elif "://" not in candidate:
        # Without a scheme urlparse would treat the whole string as a path.
        candidate = "https://" + candidate

    try:
        parsed = urlparse(candidate)
        host = (parsed.hostname or "").lower()
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. unbalanced brackets).
        return False

    if parsed.scheme not in ("http", "https", "ssh", "git"):
        return False
    if host not in ("github.com", "www.github.com"):
        return False
    segments = [part for part in parsed.path.split("/") if part]
    return len(segments) >= 2


def main():
    """Render the "Data Source Configuration" page.

    Collects the GitHub repository URL and the output directory. A URL that
    passes validation is stored in ``st.session_state['repo_url']``. When
    "Save Output Directory" is clicked, the directory is created if needed
    and stored in ``st.session_state['output_dir']``. A final button sets
    ``st.session_state.page`` to ``'data_loading'``.
    """
    st.title("Data Source Configuration")

    # Explanation or introduction
    st.write("Configure the source from which to mine data, including the GitHub repository and the output directory for storing generated data.")

    # Repository selection
    repo_url = st.text_input("GitHub Repository URL", "https://github.com/username/repository")

    # Validate on host and path, not with a substring test: a plain
    # `"github.com" in repo_url` also accepts unrelated hosts.
    if _is_github_repo_url(repo_url):
        st.session_state['repo_url'] = repo_url.strip()
        st.success("Repository URL saved.")
    else:
        st.error("Please enter a valid GitHub repository URL.")

    # Output directory selection
    default_dir = os.path.join(".", "output_data")  # Default directory path
    out_dir = st.text_input("Output Directory for Generated Data", value=default_dir)

    # Create the directory on demand and remember it
    if st.button("Save Output Directory"):
        target_dir = out_dir.strip()
        if not target_dir:
            # An empty path would otherwise surface as an opaque OS error.
            st.error("Please enter an output directory.")
        else:
            try:
                os.makedirs(target_dir, exist_ok=True)
            except (OSError, ValueError) as e:
                # OSError: permissions, invalid path, name clashes with a file.
                # ValueError: e.g. an embedded null byte in the path.
                st.error(f"Failed to create the directory: {e}")
            else:
                st.session_state['output_dir'] = target_dir
                st.success(f"Output directory set to: {target_dir}")

    # Optional navigation: records the requested page in the session state.
    # This only has an effect if the app implements session-based navigation
    # (the original comment assumes this is done in app.py).
    if st.button("Proceed to Data Loading"):
        st.session_state.page = 'data_loading'


if __name__ == "__main__":
    main()
pages/model_selection.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
def main():
    """Render the "Model Selection and Configuration" page.

    Lets the user choose an embedding model and an LLM, shows the
    model-specific configuration widgets, and on "Save Model Configuration"
    writes the selections to ``st.session_state`` under
    ``'selected_embedding_model'``, ``'selected_llm_model'`` and
    ``'llm_model_config'``.

    ``'llm_model_config'`` holds ``max_tokens`` and ``temperature`` for the
    Mistral model and is an empty dict for models that have no configuration
    widgets yet. Previously, saving with any non-Mistral model raised
    ``UnboundLocalError`` because those two values were never assigned.
    """
    st.title("Model Selection and Configuration")

    # Introduction
    st.write("Select the embedding model and the large language model (LLM) for processing.")

    # Embedding Model Selection
    embedding_models = ["thenlper/gte-small", "sentence-transformers/all-MiniLM-L6-v2", "other"]
    selected_embedding_model = st.selectbox("Select Embedding Model", options=embedding_models)

    # LLM Model Selection
    llm_models = ["mistralai/Mistral-7B-Instruct-v0.2", "gpt-3.5-turbo", "other"]
    selected_llm_model = st.selectbox("Select LLM Model", options=llm_models)

    # Display selections (for demonstration)
    st.write("Selected Embedding Model:", selected_embedding_model)
    st.write("Selected LLM Model:", selected_llm_model)

    # Configuration options for the selected models
    st.header("Model Configuration")

    # Embedding Model Configuration (example)
    if selected_embedding_model == "thenlper/gte-small":
        # Placeholder for model-specific configuration options
        st.write("No additional configuration required for this model.")
    else:
        # Configuration for other models
        st.write("Configuration options for other models will appear here.")

    # LLM Model Configuration (example). The dict is always bound, so the
    # save handler below works for every model choice.
    llm_model_config = {}
    if selected_llm_model == "mistralai/Mistral-7B-Instruct-v0.2":
        llm_model_config["max_tokens"] = st.slider(
            "Max Tokens", min_value=100, max_value=1000, value=250
        )
        llm_model_config["temperature"] = st.slider(
            "Temperature", min_value=0.0, max_value=1.0, value=0.7, step=0.01
        )
    else:
        # Configuration for other models
        st.write("Configuration options for other models will appear here.")

    # Save model selections and configurations
    if st.button("Save Model Configuration"):
        st.session_state['selected_embedding_model'] = selected_embedding_model
        st.session_state['selected_llm_model'] = selected_llm_model
        # Configurations may become more complex and vary per model; for now
        # they are stored as one flat dict.
        st.session_state['llm_model_config'] = llm_model_config
        st.success("Model configurations saved.")

    # TODO: optionally proceed to the next step by setting
    # st.session_state.page to 'processing_embedding'.


if __name__ == "__main__":
    main()
pages/processing_embedding.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import time # Used to simulate processing time
3
+
4
def main():
    """Render the "Processing and Embedding" page.

    Offers a button that runs the (currently simulated) processing step and
    then sets ``st.session_state['processing_complete']`` to ``True``. The
    page also shows a progress bar reflecting that flag, a placeholder
    parameter-tuning section, three "save" checkboxes, and two navigation
    buttons that set ``st.session_state.page``.
    """
    st.title("Processing and Embedding")

    # Introduction
    st.write("Process the selected data using the configured models and save the results.")

    # Kick off processing on demand
    start_requested = st.button("Start Processing")
    if start_requested:
        with st.spinner("Processing..."):
            # Stand-in for the real work: just wait five seconds.
            time.sleep(5)

        # Remember across reruns that processing has finished
        st.session_state['processing_complete'] = True
        st.success("Processing completed successfully!")

    # Full bar plus confirmation once processing is done, empty bar otherwise
    finished = False
    if 'processing_complete' in st.session_state:
        finished = bool(st.session_state['processing_complete'])

    if not finished:
        st.progress(0)
    else:
        st.progress(100)
        st.write("Data has been processed and embedded successfully.")

    # Parameter tuning (placeholder section)
    st.header("Parameter Tuning")
    st.write("Adjust processing parameters if necessary. (This section is a placeholder.)")

    # Saving options: each checkbox only prints a note for now; the actual
    # saving logic is still to be written.
    st.header("Save Results")
    save_options = (
        ("Save Preprocessed Pages", "Preprocessed pages will be saved."),
        ("Save Processed Pages", "Processed pages will be saved."),
        ("Save Vectors Store Data", "Vectors store data will be saved."),
    )
    for checkbox_label, confirmation in save_options:
        if st.checkbox(checkbox_label):
            st.write(confirmation)

    # Navigation: back to configuration on the left, finish on the right
    back_col, exit_col = st.columns(2)
    with back_col:
        go_back = st.button("Back to Model Selection")
        if go_back:
            st.session_state.page = 'model_selection'
    with exit_col:
        finish = st.button("Complete and Exit")
        if finish:
            # Return to the main page
            st.session_state.page = 'main_page'


if __name__ == "__main__":
    main()