Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	
		Ben Burtenshaw
		
	committed on
		
		
					Commit 
							
							·
						
						fc828f1
	
1
								Parent(s):
							
							dfd3683
								
run pipeline locally
Browse files
 - __pycache__/defaults.cpython-311.pyc +0 -0
 - __pycache__/domain.cpython-311.pyc +0 -0
 - __pycache__/hub.cpython-311.pyc +0 -0
 - __pycache__/infer.cpython-311.pyc +0 -0
 - __pycache__/pipeline.cpython-311.pyc +0 -0
 - __pycache__/utils.cpython-311.pyc +0 -0
 - pages/2_👩🏼🔬 Describe Domain.py +3 -1
 - pages/3_🌱 Generate Dataset.py +79 -37
 - pipeline.yaml +9 -33
 - pipeline_params.json +0 -0
 - utils.py +30 -3
 
    	
        __pycache__/defaults.cpython-311.pyc
    ADDED
    
    | 
         Binary file (2.32 kB). View file 
     | 
| 
         | 
    	
        __pycache__/domain.cpython-311.pyc
    ADDED
    
    | 
         Binary file (4.53 kB). View file 
     | 
| 
         | 
    	
        __pycache__/hub.cpython-311.pyc
    ADDED
    
    | 
         Binary file (5.78 kB). View file 
     | 
| 
         | 
    	
        __pycache__/infer.cpython-311.pyc
    ADDED
    
    | 
         Binary file (837 Bytes). View file 
     | 
| 
         | 
    	
        __pycache__/pipeline.cpython-311.pyc
    ADDED
    
    | 
         Binary file (8.2 kB). View file 
     | 
| 
         | 
    	
        __pycache__/utils.cpython-311.pyc
    ADDED
    
    | 
         Binary file (4.93 kB). View file 
     | 
| 
         | 
    	
        pages/2_👩🏼🔬 Describe Domain.py
    CHANGED
    
    | 
         @@ -11,7 +11,7 @@ from defaults import ( 
     | 
|
| 11 | 
         
             
                PIPELINE_PATH,
         
     | 
| 12 | 
         
             
                DATASET_REPO_ID,
         
     | 
| 13 | 
         
             
            )
         
     | 
| 14 | 
         
            -
            from utils import project_sidebar
         
     | 
| 15 | 
         | 
| 16 | 
         | 
| 17 | 
         
             
            st.set_page_config(
         
     | 
| 
         @@ -212,6 +212,8 @@ domain_data = { 
     | 
|
| 212 | 
         
             
                "topics": topics,
         
     | 
| 213 | 
         
             
                "examples": examples,
         
     | 
| 214 | 
         
             
                "domain_expert_prompt": domain_expert_prompt,
         
     | 
| 
         | 
|
| 
         | 
|
| 215 | 
         
             
            }
         
     | 
| 216 | 
         | 
| 217 | 
         
             
            with open(SEED_DATA_PATH, "w") as f:
         
     | 
| 
         | 
|
| 11 | 
         
             
                PIPELINE_PATH,
         
     | 
| 12 | 
         
             
                DATASET_REPO_ID,
         
     | 
| 13 | 
         
             
            )
         
     | 
| 14 | 
         
            +
            from utils import project_sidebar, create_seed_terms, create_application_instruction
         
     | 
| 15 | 
         | 
| 16 | 
         | 
| 17 | 
         
             
            st.set_page_config(
         
     | 
| 
         | 
|
| 212 | 
         
             
                "topics": topics,
         
     | 
| 213 | 
         
             
                "examples": examples,
         
     | 
| 214 | 
         
             
                "domain_expert_prompt": domain_expert_prompt,
         
     | 
| 215 | 
         
            +
                "application_instruction": create_application_instruction(domain, examples),
         
     | 
| 216 | 
         
            +
                "seed_terms": create_seed_terms(topics, perspectives),
         
     | 
| 217 | 
         
             
            }
         
     | 
| 218 | 
         | 
| 219 | 
         
             
            with open(SEED_DATA_PATH, "w") as f:
         
     | 
    	
        pages/3_🌱 Generate Dataset.py
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 1 | 
         
             
            import streamlit as st
         
     | 
| 2 | 
         | 
| 3 | 
         
             
            from defaults import ARGILLA_URL
         
     | 
| 4 | 
         
            -
            from hub import push_pipeline_params 
     | 
| 5 | 
         
             
            from utils import project_sidebar
         
     | 
| 6 | 
         | 
| 7 | 
         
             
            st.set_page_config(
         
     | 
| 
         @@ -20,16 +20,27 @@ st.divider() 
     | 
|
| 20 | 
         
             
            st.subheader("Step 3. Run the pipeline to generate synthetic data")
         
     | 
| 21 | 
         
             
            st.write("Define the distilabel pipeline for generating the dataset.")
         
     | 
| 22 | 
         | 
| 23 | 
         
            -
            ###############################################################
         
     | 
| 24 | 
         
            -
            # CONFIGURATION
         
     | 
| 25 | 
         
            -
            ###############################################################
         
     | 
| 26 | 
         
            -
             
     | 
| 27 | 
         
             
            hub_username = st.session_state.get("hub_username")
         
     | 
| 28 | 
         
             
            project_name = st.session_state.get("project_name")
         
     | 
| 29 | 
         
             
            hub_token = st.session_state.get("hub_token")
         
     | 
| 30 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 31 | 
         
             
            st.divider()
         
     | 
| 32 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 33 | 
         
             
            st.markdown("#### 🤖 Inference configuration")
         
     | 
| 34 | 
         | 
| 35 | 
         
             
            st.write(
         
     | 
| 
         @@ -43,13 +54,19 @@ with st.expander("🤗 Recommended Models"): 
     | 
|
| 43 | 
         
             
                    "https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending",
         
     | 
| 44 | 
         
             
                )
         
     | 
| 45 | 
         
             
                st.write("🔋Projects with sufficient resources could take advantage of LLama3 70b")
         
     | 
| 46 | 
         
            -
                st.code( 
     | 
| 
         | 
|
| 
         | 
|
| 47 | 
         | 
| 48 | 
         
             
                st.write("🪫Projects with less resources could take advantage of LLama 3 8b")
         
     | 
| 49 | 
         
            -
                st.code( 
     | 
| 
         | 
|
| 
         | 
|
| 50 | 
         | 
| 51 | 
         
            -
                st.write("🍃Projects with even less resources could  
     | 
| 52 | 
         
            -
                st.code( 
     | 
| 
         | 
|
| 
         | 
|
| 53 | 
         | 
| 54 | 
         
             
                st.write("Note Hugggingface Pro gives access to more compute resources")
         
     | 
| 55 | 
         
             
                st.link_button(
         
     | 
| 
         @@ -58,10 +75,27 @@ with st.expander("🤗 Recommended Models"): 
     | 
|
| 58 | 
         
             
                )
         
     | 
| 59 | 
         | 
| 60 | 
         | 
| 61 | 
         
            -
             
     | 
| 62 | 
         
            -
                label=" 
     | 
| 63 | 
         
            -
                value="https://api-inference.huggingface.co/models/ 
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 64 | 
         
             
            )
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 65 | 
         
             
            st.divider()
         
     | 
| 66 | 
         
             
            st.markdown("#### 🔬 Argilla API details to push the generated dataset")
         
     | 
| 67 | 
         
             
            argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
         
     | 
| 
         @@ -84,30 +118,38 @@ if all( 
     | 
|
| 84 | 
         
             
                [
         
     | 
| 85 | 
         
             
                    argilla_api_key,
         
     | 
| 86 | 
         
             
                    argilla_url,
         
     | 
| 87 | 
         
            -
                     
     | 
| 88 | 
         
            -
                     
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 89 | 
         
             
                    project_name,
         
     | 
| 90 | 
         
             
                    hub_token,
         
     | 
| 91 | 
         
             
                    argilla_dataset_name,
         
     | 
| 92 | 
         
             
                ]
         
     | 
| 93 | 
         
            -
            ):
         
     | 
| 94 | 
         
            -
                 
     | 
| 95 | 
         
            -
                     
     | 
| 96 | 
         
            -
                         
     | 
| 97 | 
         
            -
             
     | 
| 98 | 
         
            -
             
     | 
| 99 | 
         
            -
             
     | 
| 100 | 
         
            -
             
     | 
| 101 | 
         
            -
             
     | 
| 102 | 
         
            -
             
     | 
| 103 | 
         
            -
             
     | 
| 104 | 
         
            -
             
     | 
| 105 | 
         
            -
             
     | 
| 106 | 
         
            -
             
     | 
| 107 | 
         
            -
             
     | 
| 108 | 
         
            -
             
     | 
| 109 | 
         
            -
             
     | 
| 110 | 
         
            -
                     
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 111 | 
         
             
                )
         
     | 
| 112 | 
         | 
| 113 | 
         
             
                st.markdown(
         
     | 
| 
         @@ -118,7 +160,7 @@ if all( 
     | 
|
| 118 | 
         
             
                    f"""
         
     | 
| 119 | 
         | 
| 120 | 
         
             
                    # Install the distilabel library
         
     | 
| 121 | 
         
            -
                    pip install  
     | 
| 122 | 
         
             
                    """
         
     | 
| 123 | 
         
             
                )
         
     | 
| 124 | 
         | 
| 
         @@ -126,8 +168,8 @@ if all( 
     | 
|
| 126 | 
         | 
| 127 | 
         
             
                st.code(
         
     | 
| 128 | 
         
             
                    f"""
         
     | 
| 129 | 
         
            -
                    git clone https:// 
     | 
| 130 | 
         
            -
                    cd  
     | 
| 131 | 
         
             
                    pip install -r requirements.txt
         
     | 
| 132 | 
         
             
                    """
         
     | 
| 133 | 
         
             
                )
         
     | 
| 
         @@ -135,9 +177,9 @@ if all( 
     | 
|
| 135 | 
         
             
                st.markdown("Finally, you can run the pipeline using the following command:")
         
     | 
| 136 | 
         | 
| 137 | 
         
             
                st.code(
         
     | 
| 138 | 
         
            -
                    """
         
     | 
| 139 | 
         
             
                    huggingface-cli login
         
     | 
| 140 | 
         
            -
                    python  
     | 
| 141 | 
         
             
                    language="bash",
         
     | 
| 142 | 
         
             
                )
         
     | 
| 143 | 
         
             
                st.markdown(
         
     | 
| 
         | 
|
| 1 | 
         
             
            import streamlit as st
         
     | 
| 2 | 
         | 
| 3 | 
         
             
            from defaults import ARGILLA_URL
         
     | 
| 4 | 
         
            +
            from hub import push_pipeline_params
         
     | 
| 5 | 
         
             
            from utils import project_sidebar
         
     | 
| 6 | 
         | 
| 7 | 
         
             
            st.set_page_config(
         
     | 
| 
         | 
|
| 20 | 
         
             
            st.subheader("Step 3. Run the pipeline to generate synthetic data")
         
     | 
| 21 | 
         
             
            st.write("Define the distilabel pipeline for generating the dataset.")
         
     | 
| 22 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 23 | 
         
             
            hub_username = st.session_state.get("hub_username")
         
     | 
| 24 | 
         
             
            project_name = st.session_state.get("project_name")
         
     | 
| 25 | 
         
             
            hub_token = st.session_state.get("hub_token")
         
     | 
| 26 | 
         | 
| 27 | 
         
            +
            ###############################################################
         
     | 
| 28 | 
         
            +
            # CONFIGURATION
         
     | 
| 29 | 
         
            +
            ###############################################################
         
     | 
| 30 | 
         
            +
             
     | 
| 31 | 
         
             
            st.divider()
         
     | 
| 32 | 
         | 
| 33 | 
         
            +
            st.markdown("## 🧰 Pipeline Configuration")
         
     | 
| 34 | 
         
            +
             
     | 
| 35 | 
         
            +
            st.write(
         
     | 
| 36 | 
         
            +
                "Now we need to define the configuration for the pipeline that will generate the synthetic data."
         
     | 
| 37 | 
         
            +
            )
         
     | 
| 38 | 
         
            +
            st.write(
         
     | 
| 39 | 
         
            +
                "⚠️ Model and parameter choice significantly affect the quality of the generated data. \
         
     | 
| 40 | 
         
            +
                We reccomend that you start with a few samples and review the data. The scale up from there."
         
     | 
| 41 | 
         
            +
            )
         
     | 
| 42 | 
         
            +
             
     | 
| 43 | 
         
            +
             
     | 
| 44 | 
         
             
            st.markdown("#### 🤖 Inference configuration")
         
     | 
| 45 | 
         | 
| 46 | 
         
             
            st.write(
         
     | 
| 
         | 
|
| 54 | 
         
             
                    "https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending",
         
     | 
| 55 | 
         
             
                )
         
     | 
| 56 | 
         
             
                st.write("🔋Projects with sufficient resources could take advantage of LLama3 70b")
         
     | 
| 57 | 
         
            +
                st.code(
         
     | 
| 58 | 
         
            +
                    "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
         
     | 
| 59 | 
         
            +
                )
         
     | 
| 60 | 
         | 
| 61 | 
         
             
                st.write("🪫Projects with less resources could take advantage of LLama 3 8b")
         
     | 
| 62 | 
         
            +
                st.code(
         
     | 
| 63 | 
         
            +
                    "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
         
     | 
| 64 | 
         
            +
                )
         
     | 
| 65 | 
         | 
| 66 | 
         
            +
                st.write("🍃Projects with even less resources could use Phi-3-mini-4k-instruct")
         
     | 
| 67 | 
         
            +
                st.code(
         
     | 
| 68 | 
         
            +
                    "https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct"
         
     | 
| 69 | 
         
            +
                )
         
     | 
| 70 | 
         | 
| 71 | 
         
             
                st.write("Note Hugggingface Pro gives access to more compute resources")
         
     | 
| 72 | 
         
             
                st.link_button(
         
     | 
| 
         | 
|
| 75 | 
         
             
                )
         
     | 
| 76 | 
         | 
| 77 | 
         | 
| 78 | 
         
            +
            self_instruct_base_url = st.text_input(
         
     | 
| 79 | 
         
            +
                label="Model base URL for instruction generation",
         
     | 
| 80 | 
         
            +
                value="https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct",
         
     | 
| 81 | 
         
            +
            )
         
     | 
| 82 | 
         
            +
            domain_expert_base_url = st.text_input(
         
     | 
| 83 | 
         
            +
                label="Model base URL for domain expert response",
         
     | 
| 84 | 
         
            +
                value="https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct",
         
     | 
| 85 | 
         
            +
            )
         
     | 
| 86 | 
         
            +
             
     | 
| 87 | 
         
            +
            st.divider()
         
     | 
| 88 | 
         
            +
            st.markdown("#### 🧮 Parameters configuration")
         
     | 
| 89 | 
         
            +
             
     | 
| 90 | 
         
            +
            self_intruct_num_generations = st.slider(
         
     | 
| 91 | 
         
            +
                "Number of generations for self-instruction", 1, 10, 2
         
     | 
| 92 | 
         
             
            )
         
     | 
| 93 | 
         
            +
            domain_expert_num_generations = st.slider(
         
     | 
| 94 | 
         
            +
                "Number of generations for domain expert", 1, 10, 2
         
     | 
| 95 | 
         
            +
            )
         
     | 
| 96 | 
         
            +
            self_instruct_temperature = st.slider("Temperature for self-instruction", 0.1, 1.0, 0.9)
         
     | 
| 97 | 
         
            +
            domain_expert_temperature = st.slider("Temperature for domain expert", 0.1, 1.0, 0.9)
         
     | 
| 98 | 
         
            +
             
     | 
| 99 | 
         
             
            st.divider()
         
     | 
| 100 | 
         
             
            st.markdown("#### 🔬 Argilla API details to push the generated dataset")
         
     | 
| 101 | 
         
             
            argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
         
     | 
| 
         | 
|
| 118 | 
         
             
                [
         
     | 
| 119 | 
         
             
                    argilla_api_key,
         
     | 
| 120 | 
         
             
                    argilla_url,
         
     | 
| 121 | 
         
            +
                    self_instruct_base_url,
         
     | 
| 122 | 
         
            +
                    domain_expert_base_url,
         
     | 
| 123 | 
         
            +
                    self_intruct_num_generations,
         
     | 
| 124 | 
         
            +
                    domain_expert_num_generations,
         
     | 
| 125 | 
         
            +
                    self_instruct_temperature,
         
     | 
| 126 | 
         
            +
                    domain_expert_temperature,
         
     | 
| 127 | 
         
            +
                    hub_username,
         
     | 
| 128 | 
         
             
                    project_name,
         
     | 
| 129 | 
         
             
                    hub_token,
         
     | 
| 130 | 
         
             
                    argilla_dataset_name,
         
     | 
| 131 | 
         
             
                ]
         
     | 
| 132 | 
         
            +
            ) and st.button("💾 Save Pipeline Config"):
         
     | 
| 133 | 
         
            +
                with st.spinner("Pushing pipeline to the Hub..."):
         
     | 
| 134 | 
         
            +
                    push_pipeline_params(
         
     | 
| 135 | 
         
            +
                        pipeline_params={
         
     | 
| 136 | 
         
            +
                            "argilla_api_key": argilla_api_key,
         
     | 
| 137 | 
         
            +
                            "argilla_api_url": argilla_url,
         
     | 
| 138 | 
         
            +
                            "argilla_dataset_name": argilla_dataset_name,
         
     | 
| 139 | 
         
            +
                            "self_instruct_base_url": self_instruct_base_url,
         
     | 
| 140 | 
         
            +
                            "domain_expert_base_url": domain_expert_base_url,
         
     | 
| 141 | 
         
            +
                            "self_instruct_temperature": self_instruct_temperature,
         
     | 
| 142 | 
         
            +
                            "domain_expert_temperature": domain_expert_temperature,
         
     | 
| 143 | 
         
            +
                            "self_intruct_num_generations": self_intruct_num_generations,
         
     | 
| 144 | 
         
            +
                            "domain_expert_num_generations": domain_expert_num_generations,
         
     | 
| 145 | 
         
            +
                        },
         
     | 
| 146 | 
         
            +
                        hub_username=hub_username,
         
     | 
| 147 | 
         
            +
                        hub_token=hub_token,
         
     | 
| 148 | 
         
            +
                        project_name=project_name,
         
     | 
| 149 | 
         
            +
                    )
         
     | 
| 150 | 
         
            +
             
     | 
| 151 | 
         
            +
                st.success(
         
     | 
| 152 | 
         
            +
                    f"Pipeline configuration pushed to the dataset repo {hub_username}/{project_name} on the Hub."
         
     | 
| 153 | 
         
             
                )
         
     | 
| 154 | 
         | 
| 155 | 
         
             
                st.markdown(
         
     | 
| 
         | 
|
| 160 | 
         
             
                    f"""
         
     | 
| 161 | 
         | 
| 162 | 
         
             
                    # Install the distilabel library
         
     | 
| 163 | 
         
            +
                    pip install distilabel
         
     | 
| 164 | 
         
             
                    """
         
     | 
| 165 | 
         
             
                )
         
     | 
| 166 | 
         | 
| 
         | 
|
| 168 | 
         | 
| 169 | 
         
             
                st.code(
         
     | 
| 170 | 
         
             
                    f"""
         
     | 
| 171 | 
         
            +
                    git clone https://github.com/huggingface/data-is-better-together
         
     | 
| 172 | 
         
            +
                    cd data-is-better-together/domain-specific-datasets/pipelines
         
     | 
| 173 | 
         
             
                    pip install -r requirements.txt
         
     | 
| 174 | 
         
             
                    """
         
     | 
| 175 | 
         
             
                )
         
     | 
| 
         | 
|
| 177 | 
         
             
                st.markdown("Finally, you can run the pipeline using the following command:")
         
     | 
| 178 | 
         | 
| 179 | 
         
             
                st.code(
         
     | 
| 180 | 
         
            +
                    f"""
         
     | 
| 181 | 
         
             
                    huggingface-cli login
         
     | 
| 182 | 
         
            +
                    python domain_expert_pipeline.py {hub_username}/{project_name}""",
         
     | 
| 183 | 
         
             
                    language="bash",
         
     | 
| 184 | 
         
             
                )
         
     | 
| 185 | 
         
             
                st.markdown(
         
     | 
    	
        pipeline.yaml
    CHANGED
    
    | 
         @@ -1,5 +1,5 @@ 
     | 
|
| 1 | 
         
             
            distilabel:
         
     | 
| 2 | 
         
            -
              version: 1.0. 
     | 
| 3 | 
         
             
            pipeline:
         
     | 
| 4 | 
         
             
              name: farming
         
     | 
| 5 | 
         
             
              description: null
         
     | 
| 
         @@ -10,31 +10,7 @@ pipeline: 
     | 
|
| 10 | 
         
             
                  output_mappings: {}
         
     | 
| 11 | 
         
             
                  batch_size: 64
         
     | 
| 12 | 
         
             
                  data:
         
     | 
| 13 | 
         
            -
                  - input:  
     | 
| 14 | 
         
            -
                  - input: animal welfare from a Agribusiness perspective
         
     | 
| 15 | 
         
            -
                  - input: animal welfare from a Permaculture perspective
         
     | 
| 16 | 
         
            -
                  - input: animal welfare from a Agroforestery perspective
         
     | 
| 17 | 
         
            -
                  - input: animal welfare from a Conventional Farming perspective
         
     | 
| 18 | 
         
            -
                  - input: economic growth from a Family Farming perspective
         
     | 
| 19 | 
         
            -
                  - input: economic growth from a Agribusiness perspective
         
     | 
| 20 | 
         
            -
                  - input: economic growth from a Permaculture perspective
         
     | 
| 21 | 
         
            -
                  - input: economic growth from a Agroforestery perspective
         
     | 
| 22 | 
         
            -
                  - input: economic growth from a Conventional Farming perspective
         
     | 
| 23 | 
         
            -
                  - input: land from a Family Farming perspective
         
     | 
| 24 | 
         
            -
                  - input: land from a Agribusiness perspective
         
     | 
| 25 | 
         
            -
                  - input: land from a Permaculture perspective
         
     | 
| 26 | 
         
            -
                  - input: land from a Agroforestery perspective
         
     | 
| 27 | 
         
            -
                  - input: land from a Conventional Farming perspective
         
     | 
| 28 | 
         
            -
                  - input: resources from a Family Farming perspective
         
     | 
| 29 | 
         
            -
                  - input: resources from a Agribusiness perspective
         
     | 
| 30 | 
         
            -
                  - input: resources from a Permaculture perspective
         
     | 
| 31 | 
         
            -
                  - input: resources from a Agroforestery perspective
         
     | 
| 32 | 
         
            -
                  - input: resources from a Conventional Farming perspective
         
     | 
| 33 | 
         
            -
                  - input: efficiency from a Family Farming perspective
         
     | 
| 34 | 
         
            -
                  - input: efficiency from a Agribusiness perspective
         
     | 
| 35 | 
         
            -
                  - input: efficiency from a Permaculture perspective
         
     | 
| 36 | 
         
            -
                  - input: efficiency from a Agroforestery perspective
         
     | 
| 37 | 
         
            -
                  - input: efficiency from a Conventional Farming perspective
         
     | 
| 38 | 
         
             
                  runtime_parameters_info:
         
     | 
| 39 | 
         
             
                  - name: batch_size
         
     | 
| 40 | 
         
             
                    optional: true
         
     | 
| 
         @@ -54,7 +30,7 @@ pipeline: 
     | 
|
| 54 | 
         
             
                    model_id: null
         
     | 
| 55 | 
         
             
                    endpoint_name: null
         
     | 
| 56 | 
         
             
                    endpoint_namespace: null
         
     | 
| 57 | 
         
            -
                    base_url: https:// 
     | 
| 58 | 
         
             
                    tokenizer_id: null
         
     | 
| 59 | 
         
             
                    model_display_name: null
         
     | 
| 60 | 
         
             
                    use_openai_client: false
         
     | 
| 
         @@ -75,14 +51,14 @@ pipeline: 
     | 
|
| 75 | 
         
             
                    Blend interrogative (e.g., "What is the significance of x?") and imperative
         
     | 
| 76 | 
         
             
                    (e.g., "Detail the process of x.") styles.'
         
     | 
| 77 | 
         
             
                  application_description: 'You are an AI assistant than generates queries around
         
     | 
| 78 | 
         
            -
                    the domain of  
     | 
| 79 | 
         | 
| 80 | 
         
             
                    Your should not expect basic but profound questions from your users.
         
     | 
| 81 | 
         | 
| 82 | 
         
             
                    The queries should reflect a diversity of vision and economic positions and
         
     | 
| 83 | 
         
             
                    political positions.
         
     | 
| 84 | 
         | 
| 85 | 
         
            -
                    The queries may know about different methods of  
     | 
| 86 | 
         | 
| 87 | 
         
             
                    The queries can be positioned politically, economically, socially, or practically.
         
     | 
| 88 | 
         | 
| 
         @@ -163,7 +139,7 @@ pipeline: 
     | 
|
| 163 | 
         
             
                    model_id: null
         
     | 
| 164 | 
         
             
                    endpoint_name: null
         
     | 
| 165 | 
         
             
                    endpoint_namespace: null
         
     | 
| 166 | 
         
            -
                    base_url: https:// 
     | 
| 167 | 
         
             
                    tokenizer_id: null
         
     | 
| 168 | 
         
             
                    model_display_name: null
         
     | 
| 169 | 
         
             
                    use_openai_client: false
         
     | 
| 
         @@ -390,7 +366,7 @@ pipeline: 
     | 
|
| 390 | 
         
             
                    model_id: null
         
     | 
| 391 | 
         
             
                    endpoint_name: null
         
     | 
| 392 | 
         
             
                    endpoint_namespace: null
         
     | 
| 393 | 
         
            -
                    base_url: https:// 
     | 
| 394 | 
         
             
                    tokenizer_id: null
         
     | 
| 395 | 
         
             
                    model_display_name: null
         
     | 
| 396 | 
         
             
                    use_openai_client: false
         
     | 
| 
         @@ -489,9 +465,9 @@ pipeline: 
     | 
|
| 489 | 
         
             
                    generation: domain_expert_answer
         
     | 
| 490 | 
         
             
                  output_mappings: {}
         
     | 
| 491 | 
         
             
                  input_batch_size: 50
         
     | 
| 492 | 
         
            -
                  dataset_name:  
     | 
| 493 | 
         
             
                  dataset_workspace: admin
         
     | 
| 494 | 
         
            -
                  api_url: https://argilla- 
     | 
| 495 | 
         
             
                  runtime_parameters_info:
         
     | 
| 496 | 
         
             
                  - name: input_batch_size
         
     | 
| 497 | 
         
             
                    optional: true
         
     | 
| 
         | 
|
| 1 | 
         
             
            distilabel:
         
     | 
| 2 | 
         
            +
              version: 1.0.1
         
     | 
| 3 | 
         
             
            pipeline:
         
     | 
| 4 | 
         
             
              name: farming
         
     | 
| 5 | 
         
             
              description: null
         
     | 
| 
         | 
|
| 10 | 
         
             
                  output_mappings: {}
         
     | 
| 11 | 
         
             
                  batch_size: 64
         
     | 
| 12 | 
         
             
                  data:
         
     | 
| 13 | 
         
            +
                  - input: punctures from a Retro bikes perspective
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 14 | 
         
             
                  runtime_parameters_info:
         
     | 
| 15 | 
         
             
                  - name: batch_size
         
     | 
| 16 | 
         
             
                    optional: true
         
     | 
| 
         | 
|
| 30 | 
         
             
                    model_id: null
         
     | 
| 31 | 
         
             
                    endpoint_name: null
         
     | 
| 32 | 
         
             
                    endpoint_namespace: null
         
     | 
| 33 | 
         
            +
                    base_url: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta
         
     | 
| 34 | 
         
             
                    tokenizer_id: null
         
     | 
| 35 | 
         
             
                    model_display_name: null
         
     | 
| 36 | 
         
             
                    use_openai_client: false
         
     | 
| 
         | 
|
| 51 | 
         
             
                    Blend interrogative (e.g., "What is the significance of x?") and imperative
         
     | 
| 52 | 
         
             
                    (e.g., "Detail the process of x.") styles.'
         
     | 
| 53 | 
         
             
                  application_description: 'You are an AI assistant than generates queries around
         
     | 
| 54 | 
         
            +
                    the domain of Bicycle maintenance.
         
     | 
| 55 | 
         | 
| 56 | 
         
             
                    Your should not expect basic but profound questions from your users.
         
     | 
| 57 | 
         | 
| 58 | 
         
             
                    The queries should reflect a diversity of vision and economic positions and
         
     | 
| 59 | 
         
             
                    political positions.
         
     | 
| 60 | 
         | 
| 61 | 
         
            +
                    The queries may know about different methods of Bicycle maintenance.
         
     | 
| 62 | 
         | 
| 63 | 
         
             
                    The queries can be positioned politically, economically, socially, or practically.
         
     | 
| 64 | 
         | 
| 
         | 
|
| 139 | 
         
             
                    model_id: null
         
     | 
| 140 | 
         
             
                    endpoint_name: null
         
     | 
| 141 | 
         
             
                    endpoint_namespace: null
         
     | 
| 142 | 
         
            +
                    base_url: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta
         
     | 
| 143 | 
         
             
                    tokenizer_id: null
         
     | 
| 144 | 
         
             
                    model_display_name: null
         
     | 
| 145 | 
         
             
                    use_openai_client: false
         
     | 
| 
         | 
|
| 366 | 
         
             
                    model_id: null
         
     | 
| 367 | 
         
             
                    endpoint_name: null
         
     | 
| 368 | 
         
             
                    endpoint_namespace: null
         
     | 
| 369 | 
         
            +
                    base_url: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta
         
     | 
| 370 | 
         
             
                    tokenizer_id: null
         
     | 
| 371 | 
         
             
                    model_display_name: null
         
     | 
| 372 | 
         
             
                    use_openai_client: false
         
     | 
| 
         | 
|
| 465 | 
         
             
                    generation: domain_expert_answer
         
     | 
| 466 | 
         
             
                  output_mappings: {}
         
     | 
| 467 | 
         
             
                  input_batch_size: 50
         
     | 
| 468 | 
         
            +
                  dataset_name: bicycle_maintenance
         
     | 
| 469 | 
         
             
                  dataset_workspace: admin
         
     | 
| 470 | 
         
            +
                  api_url: https://burtenshaw-bicycle-maintenance-argilla-space.hf.space
         
     | 
| 471 | 
         
             
                  runtime_parameters_info:
         
     | 
| 472 | 
         
             
                  - name: input_batch_size
         
     | 
| 473 | 
         
             
                    optional: true
         
     | 
    	
        pipeline_params.json
    ADDED
    
    | 
         
            File without changes
         
     | 
    	
        utils.py
    CHANGED
    
    | 
         @@ -1,13 +1,13 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 1 | 
         
             
            import streamlit as st
         
     | 
| 2 | 
         | 
| 3 | 
         
             
            from defaults import (
         
     | 
| 4 | 
         
            -
                ARGILLA_SPACE_REPO_ID,
         
     | 
| 5 | 
         
             
                PROJECT_NAME,
         
     | 
| 6 | 
         
             
                ARGILLA_URL,
         
     | 
| 7 | 
         
             
                DIBT_PARENT_APP_URL,
         
     | 
| 8 | 
         
             
                DATASET_URL,
         
     | 
| 9 | 
         
             
                DATASET_REPO_ID,
         
     | 
| 10 | 
         
            -
                ARGILLA_SPACE_REPO_ID,
         
     | 
| 11 | 
         
             
            )
         
     | 
| 12 | 
         | 
| 13 | 
         | 
| 
         @@ -48,8 +48,35 @@ def project_sidebar(): 
     | 
|
| 48 | 
         
             
                st.sidebar.divider()
         
     | 
| 49 | 
         | 
| 50 | 
         
             
                st.sidebar.link_button("🧑🌾 New Project", DIBT_PARENT_APP_URL)
         
     | 
| 51 | 
         
            -
             
     | 
| 52 | 
         
             
                if st.session_state["hub_token"] is None:
         
     | 
| 53 | 
         
             
                    st.error("Please provide a Hub token to generate answers")
         
     | 
| 54 | 
         
             
                    st.stop()
         
     | 
| 55 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            from textwrap import dedent
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
             
            import streamlit as st
         
     | 
| 4 | 
         | 
| 5 | 
         
             
            from defaults import (
         
     | 
| 
         | 
|
| 6 | 
         
             
                PROJECT_NAME,
         
     | 
| 7 | 
         
             
                ARGILLA_URL,
         
     | 
| 8 | 
         
             
                DIBT_PARENT_APP_URL,
         
     | 
| 9 | 
         
             
                DATASET_URL,
         
     | 
| 10 | 
         
             
                DATASET_REPO_ID,
         
     | 
| 
         | 
|
| 11 | 
         
             
            )
         
     | 
| 12 | 
         | 
| 13 | 
         | 
| 
         | 
|
| 48 | 
         
             
                st.sidebar.divider()
         
     | 
| 49 | 
         | 
| 50 | 
         
             
                st.sidebar.link_button("🧑🌾 New Project", DIBT_PARENT_APP_URL)
         
     | 
| 51 | 
         
            +
             
     | 
| 52 | 
         
             
                if st.session_state["hub_token"] is None:
         
     | 
| 53 | 
         
             
                    st.error("Please provide a Hub token to generate answers")
         
     | 
| 54 | 
         
             
                    st.stop()
         
     | 
| 55 | 
         | 
| 56 | 
         
            +
             
     | 
| 57 | 
         
            +
            def create_seed_terms(topics: list[str], perspectives: list[str]) -> list[str]:
                """Build the seed terms that Self-Instruct starts from.

                Every topic is combined with every perspective (topics vary slowest,
                perspectives fastest) and each pair is rendered as
                ``"<topic> from a <perspective> perspective"``.
                """
                seed_terms = []
                for subject in topics:
                    for viewpoint in perspectives:
                        seed_terms.append(f"{subject} from a {viewpoint} perspective")
                return seed_terms
         
     | 
| 65 | 
         
            +
             
     | 
| 66 | 
         
            +
             
     | 
| 67 | 
         
            +
            def create_application_instruction(domain: str, examples: list[dict[str, str]]) -> str:
                """Create the instruction (system prompt) for the Self-Instruct task.

                Args:
                    domain: Name of the domain; interpolated into the prompt text.
                    examples: Few-shot examples. Each mapping must provide a
                        ``"question"`` and an ``"answer"`` key.

                Returns:
                    The prompt text, one sentence per line with no leading
                    indentation, followed by one ``- Question`` / ``- Answer``
                    section per example (in the given order).

                Raises:
                    KeyError: If an example lacks ``"question"`` or ``"answer"``.
                """
                # The prompt is assembled from explicit lines instead of dedenting an
                # indented triple-quoted literal: when the literal's first line starts
                # right after the opening quotes it has no indentation, so the common
                # margin is empty and dedent() leaves the source-code indentation in
                # front of every following line of the prompt.
                # NOTE(review): "than generates" and "Your should" are kept verbatim;
                # the same wording appears in the prompt serialized in pipeline.yaml.
                # Confirm before rewording.
                prompt_lines = (
                    f"You are an AI assistant than generates queries around the domain of {domain}.",
                    "Your should not expect basic but profound questions from your users.",
                    "The queries should reflect a diversity of vision and economic positions and political positions.",
                    f"The queries may know about different methods of {domain}.",
                    "The queries can be positioned politically, economically, socially, or practically.",
                    "Also take into account the impact of diverse causes on diverse domains.",
                )
                # Each example contributes a block that starts and ends with a newline.
                example_sections = [
                    f"\n- Question: {example['question']}\n- Answer: {example['answer']}\n"
                    for example in examples
                ]
                return "\n".join(prompt_lines) + "".join(example_sections)
         
     |