gen-synth-data / domain.py
ignacioct's picture
recommiting all files
8773ff3
import json
from typing import Any, Dict, List
from distilabel.steps.tasks.typing import ChatType
from distilabel.steps.tasks.text_generation import TextGeneration
from distilabel.steps import StepInput, StepOutput, Step
from dotenv import load_dotenv
from defaults import (
DEFAULT_DOMAIN,
DEFAULT_PERSPECTIVES,
DEFAULT_TOPICS,
DEFAULT_EXAMPLES,
DEFAULT_SYSTEM_PROMPT,
N_PERSPECTIVES,
N_TOPICS,
N_EXAMPLES,
)
# Load variables from a local .env file into the process environment
# (python-dotenv) at import time.
load_dotenv()
# Application description used for SelfInstruct.
# DEFAULT_DOMAIN is interpolated when the module is imported, so the text is
# fixed from that point on.
# NOTE(review): the prompt wording contains typos ("than generates",
# "Your should"); it is runtime text sent to the model, so it is left as-is
# here. Confirm before rewording.
APPLICATION_DESCRIPTION = f"""You are an AI assistant than generates queries around the domain of {DEFAULT_DOMAIN}.
Your should not expect basic but profound questions from your users.
The queries should reflect a diversity of vision and economic positions and political positions.
The queries may know about different methods of {DEFAULT_DOMAIN}.
The queries can be positioned politically, economically, socially, or practically.
Also take into account the impact of diverse causes on diverse domains."""
# Keep only the first N_TOPICS / N_PERSPECTIVES / N_EXAMPLES entries of the
# corresponding default collections (plain slices, so shorter inputs are kept whole).
TOPICS = DEFAULT_TOPICS[:N_TOPICS]
PERSPECTIVES = DEFAULT_PERSPECTIVES[:N_PERSPECTIVES]
EXAMPLES = DEFAULT_EXAMPLES[:N_EXAMPLES]
def create_examples_template(examples: List[Dict[str, str]]) -> str:
    """Build a prompt template that embeds example questions and answers.

    The template starts with the ``{instruction}`` placeholder header, then
    lists every example question, then every example answer.

    Args:
        examples: Dicts that each provide a ``"question"`` and an ``"answer"``
            entry.

    Returns:
        One template string (not a list). The ``{instruction}`` placeholder is
        left unformatted for the caller to fill in.

    Raises:
        KeyError: If an example lacks the ``"question"`` or ``"answer"`` key.
    """
    # NOTE(review): example text is inserted verbatim. If the result is later
    # passed through ``str.format``, braces inside an example would be read as
    # placeholders -- confirm against the caller.
    questions = " Examples of high quality questions:" + "".join(
        f"\n- Question: {example['question']}\n" for example in examples
    )
    answers = " Examples of high quality answers:" + "".join(
        f"\n- Answer: {example['answer']}\n" for example in examples
    )
    header = """{instruction}\nThis is the the instruction.\n Examples: """
    return header + questions + answers
def create_topics(topics: List[str], positions: List[str]) -> List[str]:
    """Pair every topic with every position as a short phrase.

    Args:
        topics: Topic names; they drive the outer iteration.
        positions: Perspective names; they drive the inner iteration.

    Returns:
        ``"<topic> from a <position> perspective"`` strings, grouped by topic
        in the order given.
    """
    combined: List[str] = []
    for subject in topics:
        for stance in positions:
            combined.append(f"{subject} from a {stance} perspective")
    return combined
class DomainExpert(TextGeneration):
    """Text-generation task that prompts the model as a domain expert.

    The original description names farming and agriculture as the domain; the
    system message actually sent is whatever ``DEFAULT_SYSTEM_PROMPT`` holds.
    """

    # System message content sent with every request.
    _system_prompt: str = DEFAULT_SYSTEM_PROMPT
    # User message template; filled in with the input row via ``str.format``.
    _template: str = """{instruction}\nThis is the the instruction.\n Examples: """

    def format_input(self, input: Dict[str, Any]) -> "ChatType":
        """Turn one input row into a two-message chat.

        Args:
            input: Mapping whose entries are passed as keyword arguments to
                ``_template.format``.

        Returns:
            A list holding the system message followed by the user message.
        """
        system_message = {"role": "system", "content": self._system_prompt}
        user_message = {"role": "user", "content": self._template.format(**input)}
        return [system_message, user_message]
class CleanNumberedList(Step):
    """Step that strips a leading list number from each question."""

    def process(self, inputs: StepInput) -> StepOutput:
        """Remove a leading ``"<digits>. "`` prefix from every row's question.

        Each row's ``"question"`` value is rewritten in place, then the same
        ``inputs`` object is yielded once.

        Args:
            inputs: Rows that each carry a ``"question"`` string.

        Yields:
            The ``inputs`` batch after cleaning.
        """
        import re

        # Anchored at the start: digits, a literal dot, one whitespace character.
        leading_number = re.compile(r"^\d+\.\s")
        for row in inputs:
            row["question"] = leading_number.sub("", row["question"])
        yield inputs