from datasets import load_dataset
import pandas as pd

# Prompt sent to the model for each premise/hypothesis pair. It is filled in
# with ``str.format`` using the ``premise`` and ``hypothesis`` fields.
# NOTE(review): the text is reproduced exactly as it appears in the source,
# where it is a single line separated by spaces. The original line breaks
# between the instruction paragraphs were presumably lost -- confirm against
# the upstream prompt before relying on the layout.
_PROMPT_TEMPLATE = (
    'You are an advanced AI trained to understand and explain natural '
    'language relationships. I will give you a pair of sentences: a premise '
    'and a hypothesis. Your task is to determine the relationship between '
    'them and provide a detailed explanation of your reasoning process. The '
    'possible relationships are "Entailment," "Contradiction," or '
    '"Neutral." Instructions: Read the given premise and hypothesis '
    'carefully. Identify the relationship between them based on the '
    'following definitions: Entailment: The hypothesis logically follows '
    'from the premise. Contradiction: The hypothesis directly contradicts '
    'the premise. Neutral: The hypothesis neither logically follows from '
    'nor contradicts the premise. Provide the relationship (Entailment, '
    'Contradiction, or Neutral). Explain in about ten words your reasoning '
    'to justify your conclusion. Example: Premise: "A man is playing a '
    'guitar." Hypothesis: "A man is making music." Relationship: Entailment '
    'Explanation: Playing guitar inherently involves creating music, '
    'fulfilling the hypothesis. Now, try it with the following pair: '
    'Premise: "{premise}" Hypothesis: "{hypothesis}" Relationship: '
)

# Maps the integer ``label`` column to the answer string used in the prompt.
_LABEL_NAMES = {0: 'Entailment', 1: 'Neutral', 2: 'Contradiction'}

# Column order of the frame returned by ``get_data``; passed explicitly so an
# empty sample still yields a frame with the expected columns.
_PROMPT_COLUMNS = ['question', 'answer', 'reference_explanation']


def _generate_prompts(df):
    """Build one prompt record per row of *df*.

    Args:
        df: DataFrame with ``premise``, ``hypothesis``, ``label`` and
            ``explanation_1`` columns.

    Returns:
        A list of dicts with the keys ``question`` (the formatted prompt),
        ``answer`` (the label name) and ``reference_explanation`` (the value
        of ``explanation_1``), in row order.

    Raises:
        KeyError: If a ``label`` value is not one of 0, 1 or 2.
    """
    # Zipping the columns avoids the per-row Series that ``iterrows`` builds.
    return [
        {
            'question': _PROMPT_TEMPLATE.format(
                premise=premise, hypothesis=hypothesis
            ),
            'answer': _LABEL_NAMES[label],
            'reference_explanation': explanation,
        }
        for premise, hypothesis, label, explanation in zip(
            df['premise'], df['hypothesis'], df['label'], df['explanation_1']
        )
    ]


def get_data(sample_size: int) -> pd.DataFrame:
    """Load the ``esnli`` train split and return a sample of prompts.

    Rows with a missing ``hypothesis`` or ``explanation_1`` are dropped before
    sampling. Sampling uses ``random_state=42``, so the same rows are chosen
    for a given ``sample_size`` as long as the loaded data is unchanged.

    Args:
        sample_size: Number of rows to sample from the train split.

    Returns:
        A DataFrame with the columns ``question``, ``answer`` and
        ``reference_explanation``, one row per sampled example.

    Raises:
        ValueError: Raised by ``DataFrame.sample`` when ``sample_size`` is
            negative or larger than the number of available rows.
        KeyError: If a sampled row has a label other than 0, 1 or 2.
    """
    dataset = load_dataset("esnli")
    train_df = dataset['train'].to_pandas()
    train_df = train_df.dropna(subset=['hypothesis', 'explanation_1'])

    sample_df = train_df.sample(n=sample_size, random_state=42)
    return pd.DataFrame(_generate_prompts(sample_df), columns=_PROMPT_COLUMNS)


if __name__ == '__main__':
    sample_size = 5
    print(get_data(sample_size))