from datasets import load_dataset
import pandas as pd


def get_data(sample_size: int) -> pd.DataFrame:
    """Build a reproducible sample of e-SNLI prompts with their gold answers.

    Loads the ``train`` split of the ``esnli`` dataset, discards rows that
    cannot produce a complete example, draws ``sample_size`` rows with a
    fixed seed and renders each one into the instruction prompt below.

    Args:
        sample_size: Number of rows to draw (sampling is without
            replacement and seeded with ``random_state=42``).

    Returns:
        A DataFrame with one row per sampled example and the columns
        ``question`` (the rendered prompt), ``answer`` (``'Entailment'``,
        ``'Neutral'`` or ``'Contradiction'``) and ``reference_explanation``
        (the row's ``explanation_1`` text). The columns are present even
        when ``sample_size`` is 0.

    Raises:
        ValueError: Propagated from ``DataFrame.sample`` when
            ``sample_size`` is larger than the number of usable rows.
    """
    # Integer label -> relationship name used as the expected answer.
    label_names = {0: 'Entailment', 1: 'Neutral', 2: 'Contradiction'}

    # NOTE: the continuation lines of this literal carry the four-space
    # indentation of the source file, so that whitespace is part of every
    # prompt. It is kept verbatim so generated prompts do not change.
    prompt_template = """You are an advanced AI trained to understand and explain natural language relationships. I will give you a pair of sentences: a premise and a hypothesis. Your task is to determine the relationship between them and provide a detailed explanation of your reasoning process. The possible relationships are "Entailment," "Contradiction," or "Neutral."
    
    Instructions:
    
    Read the given premise and hypothesis carefully.
    
    Identify the relationship between them based on the following definitions:
    
    Entailment: The hypothesis logically follows from the premise.
    Contradiction: The hypothesis directly contradicts the premise.
    Neutral: The hypothesis neither logically follows from nor contradicts the premise.
    
    Provide the relationship (Entailment, Contradiction, or Neutral).
    
    Explain in about ten words your reasoning to justify your conclusion.
    
    Example:
    
    Premise: "A man is playing a guitar."
    Hypothesis: "A man is making music."
    Relationship: Entailment
    Explanation: Playing guitar inherently involves creating music, fulfilling the hypothesis.
    
    Now, try it with the following pair:
    
    Premise: "{premise}"
    Hypothesis: "{hypothesis}"
    Relationship:
    """

    dataset = load_dataset("esnli")
    df = dataset['train'].to_pandas()

    # Keep only rows that can yield a complete example: every text field the
    # prompt or the output needs must be present (a missing premise would
    # otherwise be rendered as "None"/"nan"), and the label must be one we
    # can name (an unknown label would raise KeyError below). When no row is
    # affected, the sampled population -- and thus the sample -- is unchanged.
    usable_df = df.dropna(subset=['premise', 'hypothesis', 'explanation_1'])
    usable_df = usable_df[usable_df['label'].isin(list(label_names))]

    sample_df = usable_df.sample(n=sample_size, random_state=42)

    # Render one record per sampled row; zipping the columns avoids the
    # per-row Series construction that iterrows() performs.
    records = [
        {
            'question': prompt_template.format(premise=premise, hypothesis=hypothesis),
            'answer': label_names[label],
            'reference_explanation': explanation,
        }
        for premise, hypothesis, label, explanation in zip(
            sample_df['premise'],
            sample_df['hypothesis'],
            sample_df['label'],
            sample_df['explanation_1'],
        )
    ]

    # Explicit columns keep the schema stable for an empty sample.
    return pd.DataFrame(
        records,
        columns=['question', 'answer', 'reference_explanation'],
    )

if __name__ == '__main__':
    # Script entry point: build a five-row sample and print the resulting
    # DataFrame as a quick manual check.
    demo_df = get_data(5)
    print(demo_df)