Alexander Watson committed on
Commit
06594f2
1 Parent(s): fb73a92

update interfaces

Browse files
Files changed (1) hide show
  1. app.py +105 -83
app.py CHANGED
@@ -11,7 +11,11 @@ import requests
11
  import streamlit as st
12
  from datasets import load_dataset
13
  from gretel_client import Gretel
14
- from navigator_helpers import DataAugmentationConfig, DataAugmenter, StreamlitLogHandler
 
 
 
 
15
 
16
  # Create a StringIO buffer to capture the logging output
17
  log_buffer = StringIO()
@@ -103,6 +107,9 @@ def main():
103
  )
104
  if "gretel" not in st.session_state:
105
  st.session_state.gretel = None
 
 
 
106
  if st.button("Validate API Key"):
107
  if api_key:
108
  try:
@@ -340,6 +347,22 @@ def main():
340
  st.markdown("---")
341
  st.markdown("### Format Prompts")
342
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
  instruction_format_prompt = st.text_area(
344
  "Instruction Format Prompt",
345
  value=st.session_state.get(
@@ -365,58 +388,53 @@ def main():
365
  st.write(
366
  "Get started with your current configuration using the SDK code below:"
367
  )
 
368
 
369
- config_text = f"""
370
- #!pip install -Uqq git+https://github.com/gretelai/navigator-helpers.git
371
-
372
- import logging
373
- import sys
374
- import pandas as pd
375
-
376
- from navigator_helpers import DataAugmentationConfig, DataAugmenter
377
-
378
- # Configure the logger
379
- logger = logging.getLogger()
380
- logger.setLevel(logging.INFO)
381
-
382
- DATASET = "YOUR_DATASET"
383
- API_KEY = "YOUR_API_KEY"
384
-
385
- df = pd.read_csv(DATASET)
386
-
387
- # Create the data augmentation configuration
388
- config = DataAugmentationConfig(
389
- input_fields={st.session_state.selected_fields},
390
- output_instruction_field="{output_instruction_field}",
391
- output_response_field="{output_response_field}",
392
- num_instructions={num_instructions},
393
- num_responses={num_responses},
394
- temperature={temperature},
395
- max_tokens_instruction={max_tokens_instruction},
396
- max_tokens_response={max_tokens_response},
397
- api_key=API_KEY,
398
- navigator_tabular="{navigator_tabular}",
399
- navigator_llm="{navigator_llm}",
400
- co_teach_llms={co_teach_llms},
401
- instruction_format_prompt='''{instruction_format_prompt}''',
402
- response_format_prompt='''{response_format_prompt}'''
403
- )
404
-
405
- # Create the data augmenter and perform augmentation
406
- augmenter = DataAugmenter(
407
- df,
408
- config,
409
- use_aaa={use_aaa},
410
- output_file="results.csv",
411
- verbose=True,
412
- )
413
- new_df = augmenter.augment()
414
- """
415
  st.code(config_text, language="python")
416
  st.download_button(
417
  label="Download SDK Code",
418
  data=config_text,
419
- file_name="data_augmentation_code.py",
420
  mime="text/plain",
421
  )
422
 
@@ -431,20 +449,20 @@ def main():
431
  if "logs" not in st.session_state:
432
  st.session_state.logs = []
433
 
434
- if "augmented_data" not in st.session_state:
435
- st.session_state.augmented_data = []
436
 
437
  if start_button:
438
- # Clear the augmented data and logs before starting a new generation
439
- st.session_state.augmented_data = []
440
  st.session_state.logs = []
441
 
442
  with st.expander("Synthetic Data", expanded=True):
443
  st.subheader("Synthetic Data Generation")
444
  progress_bar = st.progress(0)
445
- tab1, tab2 = st.tabs(["Augmented Data", "Logs"])
446
  with tab1:
447
- augmented_data_placeholder = st.empty()
448
  st.info(
449
  "Click on the 'Logs' tab to see and debug real-time logging for each record as it is generated by the agents."
450
  )
@@ -467,7 +485,7 @@ def main():
467
  handler = StreamlitLogHandler(custom_log_handler)
468
  logger.addHandler(handler)
469
 
470
- config = DataAugmentationConfig(
471
  input_fields=selected_fields,
472
  output_instruction_field=output_instruction_field,
473
  output_response_field=output_response_field,
@@ -480,26 +498,28 @@ def main():
480
  navigator_tabular=navigator_tabular,
481
  navigator_llm=navigator_llm,
482
  co_teach_llms=co_teach_llms,
 
483
  instruction_format_prompt=instruction_format_prompt,
484
  response_format_prompt=response_format_prompt,
485
  )
 
486
  start_time = time.time()
487
  with st.spinner("Generating synthetic data..."):
488
  for index in range(num_records):
489
  row = df.iloc[index]
490
- augmenter = DataAugmenter(
491
  pd.DataFrame([row]),
492
  config,
493
  use_aaa=use_aaa,
494
  output_file="results.csv",
495
  verbose=True,
496
  )
497
- new_df = augmenter.augment()
498
- st.session_state.augmented_data.append(new_df)
499
- augmented_data_placeholder.subheader("Synthetic Data")
500
- augmented_data_placeholder.dataframe(
501
  pd.concat(
502
- st.session_state.augmented_data, ignore_index=True
503
  )
504
  )
505
  progress = (index + 1) / num_records
@@ -520,11 +540,11 @@ def main():
520
 
521
  time.sleep(0.1)
522
  logger.removeHandler(handler)
523
- st.success("Data augmentation completed!")
524
  st.stop()
525
 
526
  if stop_button:
527
- st.warning("Augmentation stopped by the user.")
528
 
529
  # Get the complete logs from the session state
530
  complete_logs = st.session_state.logs
@@ -532,22 +552,22 @@ def main():
532
  # Convert complete logs to JSONL format
533
  log_jsonl = "\n".join([json.dumps({"log": log}) for log in complete_logs])
534
 
535
- # Convert augmented data to JSONL format if it exists
536
- if st.session_state.augmented_data:
537
- augmented_df = pd.concat(
538
- st.session_state.augmented_data, ignore_index=True
539
  )
540
- if not augmented_df.empty:
541
- augmented_data_jsonl = "\n".join(
542
  [
543
  json.dumps(row.to_dict())
544
- for _, row in augmented_df.iterrows()
545
  ]
546
  )
547
  else:
548
- augmented_data_jsonl = None
549
  else:
550
- augmented_data_jsonl = None
551
 
552
  # Create a temporary directory to store the files
553
  with tempfile.TemporaryDirectory() as temp_dir:
@@ -556,26 +576,28 @@ def main():
556
  with open(log_file_path, "w") as log_file:
557
  log_file.write(log_jsonl)
558
 
559
- # Write the augmented data to a file if it exists
560
- if augmented_data_jsonl:
561
- augmented_data_file_path = os.path.join(
562
  temp_dir, "synthetic_data.jsonl"
563
  )
564
- with open(augmented_data_file_path, "w") as augmented_data_file:
565
- augmented_data_file.write(augmented_data_jsonl)
566
 
567
  # Write the SDK code to a file
568
- sdk_file_path = os.path.join(temp_dir, "data_augmentation_code.py")
569
  with open(sdk_file_path, "w") as sdk_file:
570
  sdk_file.write(config_text)
571
 
572
- # Create a ZIP file containing the logs, augmented data, and SDK code
573
- zip_file_path = os.path.join(temp_dir, "augmentation_results.zip")
574
  with zipfile.ZipFile(zip_file_path, "w") as zip_file:
575
  zip_file.write(log_file_path, "complete_logs.jsonl")
576
- if augmented_data_jsonl:
577
- zip_file.write(augmented_data_file_path, "augmented_data.jsonl")
578
- zip_file.write(sdk_file_path, "data_augmentation_code.py")
 
 
579
 
580
  # Download the ZIP file
581
  with open(zip_file_path, "rb") as zip_file:
 
11
  import streamlit as st
12
  from datasets import load_dataset
13
  from gretel_client import Gretel
14
+ from navigator_helpers import (
15
+ DataSynthesisConfig,
16
+ TrainingDataSynthesizer,
17
+ StreamlitLogHandler,
18
+ )
19
 
20
  # Create a StringIO buffer to capture the logging output
21
  log_buffer = StringIO()
 
107
  )
108
  if "gretel" not in st.session_state:
109
  st.session_state.gretel = None
110
+ if "synthesized_data" not in st.session_state:
111
+ st.session_state.synthesized_data = []
112
+
113
  if st.button("Validate API Key"):
114
  if api_key:
115
  try:
 
347
  st.markdown("---")
348
  st.markdown("### Format Prompts")
349
 
350
+ st.markdown("---")
351
+ st.markdown("### Format Prompts")
352
+
353
+ system_prompt = st.text_area(
354
+ "System Prompt",
355
+ value=st.session_state.get(
356
+ "system_prompt",
357
+ "You are an AI assistant tasked with generating high-quality instruction-response pairs.\n"
358
+ "Your goal is to create diverse, engaging, and informative content that covers a wide range of topics.\n"
359
+ "When generating instructions, aim for clear, concise questions or commands that prompt thoughtful responses.\n"
360
+ "When generating responses, provide detailed, accurate, and helpful information that directly addresses the instruction.",
361
+ ),
362
+ help="Specify the system prompt for the LLM",
363
+ )
364
+ st.session_state.system_prompt = system_prompt
365
+
366
  instruction_format_prompt = st.text_area(
367
  "Instruction Format Prompt",
368
  value=st.session_state.get(
 
388
  st.write(
389
  "Get started with your current configuration using the SDK code below:"
390
  )
391
+ config_text = f"""#!pip install -Uqq git+https://github.com/gretelai/navigator-helpers.git
392
 
393
+ import logging
394
+ import pandas as pd
395
+ from navigator_helpers import DataSynthesisConfig, TrainingDataSynthesizer
396
+
397
+ # Configure the logger
398
+ logging.basicConfig(level=logging.INFO, format="%(message)s")
399
+
400
+ DATASET = "YOUR_DATASET"
401
+ API_KEY = "YOUR_API_KEY"
402
+
403
+ df = pd.read_csv(DATASET)
404
+
405
+ # Create the data synthesis configuration
406
+ config = DataSynthesisConfig(
407
+ input_fields={st.session_state.selected_fields},
408
+ output_instruction_field="{output_instruction_field}",
409
+ output_response_field="{output_response_field}",
410
+ num_instructions={num_instructions},
411
+ num_responses={num_responses},
412
+ temperature={temperature},
413
+ max_tokens_instruction={max_tokens_instruction},
414
+ max_tokens_response={max_tokens_response},
415
+ api_key=API_KEY,
416
+ navigator_tabular="{navigator_tabular}",
417
+ navigator_llm="{navigator_llm}",
418
+ co_teach_llms={co_teach_llms},
419
+ system_prompt='''{system_prompt}''',
420
+ instruction_format_prompt='''{instruction_format_prompt}''',
421
+ response_format_prompt='''{response_format_prompt}'''
422
+ )
423
+
424
+ # Create the training data synthesizer and perform synthesis
425
+ synthesizer = TrainingDataSynthesizer(
426
+ df,
427
+ config,
428
+ use_aaa={use_aaa},
429
+ output_file="results.csv",
430
+ verbose=True,
431
+ )
432
+ new_df = synthesizer.generate()"""
 
 
 
 
 
 
433
  st.code(config_text, language="python")
434
  st.download_button(
435
  label="Download SDK Code",
436
  data=config_text,
437
+ file_name="data_synthesis_code.py",
438
  mime="text/plain",
439
  )
440
 
 
449
  if "logs" not in st.session_state:
450
  st.session_state.logs = []
451
 
452
+ if "synthetic_data" not in st.session_state:
453
+ st.session_state.synthetic_data = []
454
 
455
  if start_button:
456
+ # Clear the synthetic data and logs before starting a new generation
457
+ st.session_state.synthetic_data = []
458
  st.session_state.logs = []
459
 
460
  with st.expander("Synthetic Data", expanded=True):
461
  st.subheader("Synthetic Data Generation")
462
  progress_bar = st.progress(0)
463
+ tab1, tab2 = st.tabs(["synthetic Data", "Logs"])
464
  with tab1:
465
+ synthetic_data_placeholder = st.empty()
466
  st.info(
467
  "Click on the 'Logs' tab to see and debug real-time logging for each record as it is generated by the agents."
468
  )
 
485
  handler = StreamlitLogHandler(custom_log_handler)
486
  logger.addHandler(handler)
487
 
488
+ config = DataSynthesisConfig(
489
  input_fields=selected_fields,
490
  output_instruction_field=output_instruction_field,
491
  output_response_field=output_response_field,
 
498
  navigator_tabular=navigator_tabular,
499
  navigator_llm=navigator_llm,
500
  co_teach_llms=co_teach_llms,
501
+ system_prompt=system_prompt,
502
  instruction_format_prompt=instruction_format_prompt,
503
  response_format_prompt=response_format_prompt,
504
  )
505
+
506
  start_time = time.time()
507
  with st.spinner("Generating synthetic data..."):
508
  for index in range(num_records):
509
  row = df.iloc[index]
510
+ synthesizer = TrainingDataSynthesizer(
511
  pd.DataFrame([row]),
512
  config,
513
  use_aaa=use_aaa,
514
  output_file="results.csv",
515
  verbose=True,
516
  )
517
+ new_df = synthesizer.generate()
518
+ st.session_state.synthetic_data.append(new_df)
519
+ synthetic_data_placeholder.subheader("Synthetic Data")
520
+ synthetic_data_placeholder.dataframe(
521
  pd.concat(
522
+ st.session_state.synthetic_data, ignore_index=True
523
  )
524
  )
525
  progress = (index + 1) / num_records
 
540
 
541
  time.sleep(0.1)
542
  logger.removeHandler(handler)
543
+ st.success("Data synthetic completed!")
544
  st.stop()
545
 
546
  if stop_button:
547
+ st.warning("Synthesis stopped by the user.")
548
 
549
  # Get the complete logs from the session state
550
  complete_logs = st.session_state.logs
 
552
  # Convert complete logs to JSONL format
553
  log_jsonl = "\n".join([json.dumps({"log": log}) for log in complete_logs])
554
 
555
+ # Convert synthesized data to JSONL format if it exists
556
+ if st.session_state.synthesized_data:
557
+ synthesized_df = pd.concat(
558
+ st.session_state.synthesized_data, ignore_index=True
559
  )
560
+ if not synthesized_df.empty:
561
+ synthesized_data_jsonl = "\n".join(
562
  [
563
  json.dumps(row.to_dict())
564
+ for _, row in synthesized_df.iterrows()
565
  ]
566
  )
567
  else:
568
+ synthesized_data_jsonl = None
569
  else:
570
+ synthesized_data_jsonl = None
571
 
572
  # Create a temporary directory to store the files
573
  with tempfile.TemporaryDirectory() as temp_dir:
 
576
  with open(log_file_path, "w") as log_file:
577
  log_file.write(log_jsonl)
578
 
579
+ # Write the synthesized data to a file if it exists
580
+ if synthesized_data_jsonl:
581
+ synthesized_data_file_path = os.path.join(
582
  temp_dir, "synthetic_data.jsonl"
583
  )
584
+ with open(synthesized_data_file_path, "w") as synthesized_data_file:
585
+ synthesized_data_file.write(synthesized_data_jsonl)
586
 
587
  # Write the SDK code to a file
588
+ sdk_file_path = os.path.join(temp_dir, "data_synthesis_code.py")
589
  with open(sdk_file_path, "w") as sdk_file:
590
  sdk_file.write(config_text)
591
 
592
+ # Create a ZIP file containing the logs, synthesized data, and SDK code
593
+ zip_file_path = os.path.join(temp_dir, "synthesis_results.zip")
594
  with zipfile.ZipFile(zip_file_path, "w") as zip_file:
595
  zip_file.write(log_file_path, "complete_logs.jsonl")
596
+ if synthesized_data_jsonl:
597
+ zip_file.write(
598
+ synthesized_data_file_path, "synthesized_data.jsonl"
599
+ )
600
+ zip_file.write(sdk_file_path, "data_synthesis_code.py")
601
 
602
  # Download the ZIP file
603
  with open(zip_file_path, "rb") as zip_file: