import json
import re
from typing import Any, Dict, List

from distilabel.steps.tasks.typing import ChatType
from distilabel.steps.tasks.text_generation import TextGeneration
from distilabel.steps import StepInput, StepOutput, Step
from dotenv import load_dotenv

from defaults import (
    DEFAULT_DOMAIN,
    DEFAULT_PERSPECTIVES,
    DEFAULT_TOPICS,
    DEFAULT_EXAMPLES,
    DEFAULT_SYSTEM_PROMPT,
    N_PERSPECTIVES,
    N_TOPICS,
    N_EXAMPLES,
)

load_dotenv()

# Application description used for SelfInstruct
# NOTE(review): the prompt text contains typos ("than generates", "Your should").
# It is left untouched here because it is sent verbatim to the model.
APPLICATION_DESCRIPTION = (
    "You are an AI assistant than generates queries around the domain of "
    f"{DEFAULT_DOMAIN}. "
    "Your should not expect basic but profound questions from your users. "
    "The queries should reflect a diversity of vision and economic positions "
    "and political positions. "
    f"The queries may know about different methods of {DEFAULT_DOMAIN}. "
    "The queries can be positioned politically, economically, socially, or "
    "practically. "
    "Also take into account the impact of diverse causes on diverse domains."
)

# Only the first N_* entries of each default list are used.
TOPICS = DEFAULT_TOPICS[:N_TOPICS]
PERSPECTIVES = DEFAULT_PERSPECTIVES[:N_PERSPECTIVES]
EXAMPLES = DEFAULT_EXAMPLES[:N_EXAMPLES]

# Shared ``str.format`` template prefix; ``{instruction}`` is the only placeholder.
_BASE_TEMPLATE = "{instruction}\nThis is the the instruction.\n Examples: "

# Matches a leading "<digits>. " enumeration prefix at the start of a string.
_NUMBERED_PREFIX = re.compile(r"^\d+\.\s")


def _escape_braces(text: Any) -> str:
    """Return *text* as a string with ``{``/``}`` doubled for ``str.format``."""
    return str(text).replace("{", "{{").replace("}", "}}")


def create_examples_template(examples: List[Dict[str, str]]) -> str:
    """Build a ``str.format`` template that embeds example questions and answers.

    Args:
        examples: Dictionaries providing a ``"question"`` and an ``"answer"`` key.

    Returns:
        A single template string: the base template (with its ``{instruction}``
        placeholder) followed by every example question, then every example
        answer.

    Raises:
        KeyError: If an example lacks the ``"question"`` or ``"answer"`` key.
    """
    # Example text is escaped so that literal braces inside it survive the
    # later ``template.format(**input)`` call instead of being parsed as
    # replacement fields.
    questions = " Examples of high quality questions:" + "".join(
        f"\n- Question: {_escape_braces(example['question'])}\n"
        for example in examples
    )
    answers = " Examples of high quality answers:" + "".join(
        f"\n- Answer: {_escape_braces(example['answer'])}\n"
        for example in examples
    )
    return _BASE_TEMPLATE + questions + answers


def create_topics(topics: List[str], positions: List[str]) -> List[str]:
    """Return one "<topic> from a <position> perspective" string per pair.

    The result is the cartesian product of *topics* and *positions*, iterating
    topics in the outer loop and positions in the inner loop.
    """
    return [
        f"{topic} from a {position} perspective"
        for topic in topics
        for position in positions
    ]


class DomainExpert(TextGeneration):
    """A customized text generation task that answers as a domain expert.

    The system message is ``_system_prompt`` and the user message is
    ``_template`` formatted with the input row.
    """

    _system_prompt: str = DEFAULT_SYSTEM_PROMPT
    _template: str = _BASE_TEMPLATE

    def format_input(self, input: Dict[str, Any]) -> "ChatType":
        """Turn an input row into a two-message (system, user) chat.

        Args:
            input: Row whose keys fill the placeholders of ``_template``; the
                default template requires an ``"instruction"`` key.

        Returns:
            A list with the system prompt message followed by the user message.
        """
        return [
            {
                "role": "system",
                "content": self._system_prompt,
            },
            {
                "role": "user",
                "content": self._template.format(**input),
            },
        ]


class CleanNumberedList(Step):
    """A step to clean the numbered list of questions."""

    def process(self, inputs: StepInput) -> StepOutput:
        """Strip a leading "N. " prefix from each row's ``"question"`` in place.

        Args:
            inputs: Batch of rows, each holding a string under ``"question"``.

        Yields:
            The same batch object, with every ``"question"`` value cleaned.
        """
        for row in inputs:
            row["question"] = _NUMBERED_PREFIX.sub("", row["question"])
        yield inputs