ACMC committed on
Commit
7e73556
0 Parent(s):

initial commit

Files changed (7)
  1. .gitattributes +35 -0
  2. .gitignore +2 -0
  3. README.md +12 -0
  4. app.py +204 -0
  5. requirements.txt +13 -0
  6. utils.py +342 -0
  7. validation.py +174 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+ *.jsonl
+ __pycache__/
README.md ADDED
@@ -0,0 +1,12 @@
+ ---
+ title: WhatsApp Chats Finetuning Formatter
+ emoji: 👀
+ colorFrom: pink
+ colorTo: yellow
+ sdk: gradio
+ sdk_version: 4.20.1
+ app_file: app.py
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,204 @@
+ # %%
+ from uuid import uuid4
+ import gradio as gr
+ import datasets
+ import json
+ import io
+ from utils import (
+     process_chat_file,
+     transform_conversations_dataset_into_training_examples,
+ )
+ from validation import (
+     check_format_errors,
+     check_token_counts,
+     estimate_cost,
+     get_distributions,
+ )
+ import matplotlib.pyplot as plt
+
+
+ def convert_to_dataset(files, do_spelling_correction, progress):
+     modified_dataset = None
+     for file in progress.tqdm(files, desc="Processing files"):
+         if modified_dataset is None:
+             # First file
+             modified_dataset = process_chat_file(file, do_spelling_correction=do_spelling_correction)
+         else:
+             # Concatenate the datasets
+             this_file_dataset = process_chat_file(file, do_spelling_correction=do_spelling_correction)
+             modified_dataset = datasets.concatenate_datasets(
+                 [modified_dataset, this_file_dataset]
+             )
+     return modified_dataset
+
+
+ def file_upload_callback(files, system_prompt, do_spelling_correction, validation_split, progress=gr.Progress()):
+     print(f"Processing {files}")
+     full_system_prompt = f"""You are a chatbot. Your goal is to simulate realistic, natural chat conversations as if you were me.
+ # Task
+ A participant can send multiple messages in a row, delimited by '\"', in the following schema:
+ {{string}}[]. Your answer always needs to be JSON compliant. Always start your answer with [\"
+ # Information about me
+ You should use the following information about me to answer:
+ {system_prompt}
+ # Example
+ [{{\"role\":\"user\",\"content\":\"[\"Hello!\",\"How are you?\"]\"}},{{\"role\":\"assistant\",\"content\":\"[\"Hi!\",\"I'm doing great.\",\"What about you?\"]\"}},{{\"role\":\"user\",\"content\":\"[\"I'm doing well.\",\"Have you been travelling?\"]\"}}]
+ Response:
+ [{{\"role\":\"assistant\",\"content\":\"[\"Yes, I've been to many places.\",\"I love travelling.\"]\"}}]"""
+
+     # Avoid using the full system prompt for now: it is long, which increases the cost of training
+     full_system_prompt = system_prompt
+     dataset = convert_to_dataset(files=files, progress=progress, do_spelling_correction=do_spelling_correction)
+     training_examples_ds = transform_conversations_dataset_into_training_examples(
+         conversations_ds=dataset, system_prompt=full_system_prompt
+     )
+
+     # Split into training and validation datasets (the split size is controlled by validation_split)
+     training_examples_ds = training_examples_ds.train_test_split(test_size=validation_split, seed=42)
+     training_examples_ds, validation_examples_ds = training_examples_ds["train"], training_examples_ds["test"]
+
+     format_errors = check_format_errors(training_examples_ds)
+     distributions = get_distributions(training_examples_ds)
+     cost_stats = estimate_cost(training_examples_ds)
+
+     stats = {
+         "Format Errors": format_errors,
+         "Number of examples missing system message": distributions["n_missing_system"],
+         "Number of examples missing user message": distributions["n_missing_user"],
+         "Cost Statistics": cost_stats,
+     }
+
+     fig_num_messages_distribution_plot = plt.figure()
+     num_messages_distribution_plot = plt.hist(distributions["n_messages"], bins=20)
+
+     fig_num_total_tokens_per_example_plot = plt.figure()
+     num_total_tokens_per_example_plot = plt.hist(distributions["convo_lens"], bins=20)
+
+     fig_num_assistant_tokens_per_example_plot = plt.figure()
+     num_assistant_tokens_per_example_plot = plt.hist(
+         distributions["assistant_message_lens"],
+         bins=20
+     )
+
+     # The DownloadButton component requires a path to a file; it can't accept a buffer that keeps the file in memory.
+     # Therefore, we need to write the data to a file and then pass the path to the DownloadButton.
+     # However, if different users are using the app at the same time, we need to make sure that each file is unique AND that no user can access the file of another user.
+     # A UUID gives us a unique, unguessable file name.
+     uuid = str(uuid4())
+     file_path = f"training_examples_{uuid}.jsonl"
+     training_examples_ds.to_json(path_or_buf=file_path, force_ascii=False)
+
+     file_path_validation = f"validation_examples_{uuid}.jsonl"
+     validation_examples_ds.to_json(path_or_buf=file_path_validation, force_ascii=False)
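+     # Note: these per-user files are deleted again in remove_file_and_hide_button,
+     # which is wired to the download buttons' click events at the bottom of this file.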
+
+     return (
+         file_path,
+         gr.update(visible=True),
+         file_path_validation,
+         gr.update(visible=True),
+         stats,
+         fig_num_messages_distribution_plot,
+         fig_num_total_tokens_per_example_plot,
+         fig_num_assistant_tokens_per_example_plot
+     )
+
+
+ def remove_file_and_hide_button(file_path):
+     import os
+
+     try:
+         os.remove(file_path)
+     except Exception as e:
+         print(f"Error removing file {file_path}: {e}")
+
+     return gr.update(visible=False)
+
+
+ theme = gr.themes.Default(primary_hue="cyan", secondary_hue="fuchsia")
+
+ with gr.Blocks(theme=theme) as demo:
+     gr.Markdown(
+         """
+         # WhatsApp Chat to Dataset Converter
+         Upload your WhatsApp chat files and convert them into a Dataset.
+         """
+     )
+     gr.Markdown(
+         """
+         ## Instructions
+         1. Click on the "Upload WhatsApp Chat Files" button.
+         2. Select the WhatsApp chat files you want to convert.
+         3. Write a prompt about yourself to give context to the training examples.
+         4. Click on the "Submit" button.
+         5. Wait for the process to finish.
+         6. Download the generated training examples as a JSONL file.
+         7. Use the training examples to train your own model.
+         """
+     )
+
+     input_files = gr.File(
+         label="Upload WhatsApp Chat Files",
+         type="filepath",
+         file_count="multiple",
+         file_types=["txt"],
+     )
+
+     system_prompt = gr.Textbox(
+         label="System Prompt",
+         placeholder="Background information about you.",
+         lines=5,
+         info="Enter the system prompt to be used when generating the training examples. This is the background information about you that the training examples will build on.",
+         value="""Aldan is an AI researcher who loves to play around with AI systems, travelling and learning new things.""",
+     )
+
+     do_spelling_correction = gr.Checkbox(
+         label="Do Spelling Correction (English)",
+         info="Check this box if you want to perform spelling correction on the chat messages before generating the training examples.",
+     )
+
+     # Allow the user to choose the validation split size
+     validation_split = gr.Slider(
+         minimum=0.0,
+         maximum=0.5,
+         value=0.2,
+         interactive=True,
+         label="Validation Split",
+         info="Choose the fraction of the dataset to be used for validation. For example, if you choose 0.2, 20% of the dataset will be used for validation and 80% for training.",
+     )
+
+     submit = gr.Button(value="Submit", variant="primary")
+
+     output_file = gr.DownloadButton(label="Download Generated Training Examples", visible=False, variant="primary")
+     output_file_validation = gr.DownloadButton(label="Download Generated Validation Examples", visible=False, variant="secondary")
+     # output_example = gr.JSON(label="Example Training Example")
+
+     with gr.Group():
+         # Statistics about the dataset
+         gr.Markdown("## Statistics")
+         written_stats = gr.JSON()
+         num_messages_distribution_plot = gr.Plot(label="Number of Messages Distribution")
+         num_total_tokens_per_example_plot = gr.Plot(label="Total Number of Tokens per Example")
+         num_assistant_tokens_per_example_plot = gr.Plot(
+             label="Number of Assistant Tokens per Example"
+         )
+
+     submit.click(
+         file_upload_callback,
+         inputs=[input_files, system_prompt, do_spelling_correction, validation_split],
+         outputs=[
+             output_file,
+             output_file,
+             output_file_validation,
+             output_file_validation,
+             written_stats,
+             num_messages_distribution_plot,
+             num_total_tokens_per_example_plot,
+             num_assistant_tokens_per_example_plot,
+         ]
+     )
+
+     output_file.click(remove_file_and_hide_button, inputs=[output_file], outputs=[output_file])
+     output_file_validation.click(remove_file_and_hide_button, inputs=[output_file_validation], outputs=[output_file_validation])
+
+ if __name__ == "__main__":
+     demo.launch()
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ contextualSpellCheck==0.4.4
+ datasets==2.18.0
+ es-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl#sha256=61e6e5530941f5880166855f09f60d7e6ba79ec1e8e45f96244bdb1eb169eb1d
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
+ gradio==4.20.1
+ matplotlib==3.8.3
+ numpy==1.26.4
+ pandas==2.2.1
+ spacy==3.7.4
+ tiktoken==0.6.0
+ torch==2.2.1
+ transformers==4.38.2
+ pyspellchecker==0.8.1
utils.py ADDED
@@ -0,0 +1,342 @@
+ import datasets
+ import datetime
+ import os
+ import json
+
+ import re
+
+ exp = re.compile(
+     r"(?P<month>\d+)/(?P<day>\d+)/(?P<year>\d+), (?P<hour>\d+):(?P<minute>\d+) - (?P<contact_name>.+): (?P<message>.+)"
+ )
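+ # A chat line this regex matches looks like this (hypothetical sample):
+ #   "2/5/23, 14:32 - Jane Doe: See you soon!"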
+
+
+ def process_line(example):
+     # The lines have this format: m/d/yy, hh:mm - <person>: <msg> (month first, matching the regex above)
+     try:
+         groups = exp.match(example["text"]).groupdict()
+         # WhatsApp exports often use 2-digit years; normalize them so the timestamp conversion works
+         year = int(groups["year"])
+         if year < 100:
+             year += 2000
+         timestamp = datetime.datetime(
+             year,
+             int(groups["month"]),
+             int(groups["day"]),
+             int(groups["hour"]),
+             int(groups["minute"]),
+         ).timestamp()
+         return {
+             "message": groups["message"],
+             "contact_name": groups["contact_name"],
+             "timestamp": timestamp,
+         }
+     except Exception as e:
+         print(e)
+         print(example["text"])
+         raise e
+
+
+ # %%
+ # Now, create message groups ('conversations').
+ # The idea is to group messages that are close in time.
+ # We'll use a 240-minute threshold.
+ MINUTES_THRESHOLD = 240
+
+
+ def group_messages(messages_iterable):
+     groups = []
+     current_group = [next(messages_iterable)]
+     for message in messages_iterable:
+         assert len(current_group) > 0  # We should never have an empty group
+         if (
+             message["timestamp"] - current_group[-1]["timestamp"]
+             < MINUTES_THRESHOLD * 60
+         ):
+             current_group.append(message)
+         else:
+             groups.append(current_group)
+             current_group = [message]
+     groups.append(current_group)
+     return groups
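+ # For example, two messages 10 minutes apart land in the same group, while a gap
+ # larger than MINUTES_THRESHOLD (240 min) starts a new group, i.e. a new conversation.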
+
+
+ def printable_conversation(conversation):
+     return "\n".join(
+         [f"{message['contact_name']}: {message['message']}" for message in conversation]
+     )
+
+
+ # %%
+ # Use spacy to spell check the messages
+ import spacy
+ import contextualSpellCheck
+ from spellchecker import SpellChecker
+ spell = SpellChecker()
+ # nlp = spacy.load("es_core_news_sm")
+ nlp = spacy.load("en_core_web_sm")
+
+
+ def spell_check_conversation(conversation):
+     for i, message in enumerate(conversation["conversations"]):
+         # Use the spell checker to split the message into words
+         words = spell.split_words(message["message"])
+         print(f"Words: {words}")
+         corrected_message = []
+         for word in words:
+             correction = spell.correction(word)
+             if (correction is not None) and (correction != word):
+                 print(f"Spell check: {word} -> {correction}")
+                 corrected_message.append(correction)
+             else:
+                 corrected_message.append(word)
+
+         print(f"Corrected message: {corrected_message}")
+         joined_message = " ".join(corrected_message)
+         conversation["conversations"][i]["message"] = joined_message
+
+     return conversation
+
+
+ def spell_check_conversation_spacy(conversation):
+     # Add the contextual spellchecker pipe only once; add_pipe raises if it already exists
+     if "contextual spellchecker" not in nlp.pipe_names:
+         nlp.add_pipe(
+             "contextual spellchecker",
+             config={
+                 "model_name": "bert-base-multilingual-uncased",
+                 "max_edit_dist": 2,
+             },
+         )
+     docs = list(nlp.pipe([msg["message"] for msg in conversation["conversations"]]))
+     for i, doc in enumerate(docs):
+         if doc._.performed_spellCheck:
+             print(f"Spell checked: {doc.text} -> {doc._.outcome_spellCheck}")
+             conversation["conversations"][i]["message"] = doc._.outcome_spellCheck
+
+     return conversation
+
+
+ def remove_whatsapp_annotations(conversation):
+     """
+     Removes the following WhatsApp annotations from the messages:
+     - <This message was edited>
+     """
+     for message in conversation["conversations"]:
+         message["message"] = re.sub(
+             r"<This message was edited>", "", message["message"]
+         )
+     return conversation
+
+
+ # %%
+ """
+ Sometimes, people write concurrently in the same conversation. We'll try to detect that and reorder the messages.
+ For example, if we have a conversation like this:
+ A: Hi
+ A: How are you?
+ B: Hi
+ B: I'm fine, thanks
+ A: I'm fine too
+ We'll reorder it to:
+ A: Hi
+ B: Hi
+ A: How are you?
+ B: I'm fine, thanks
+ A: I'm fine too
+
+ To do it, we'll use a BERT model with the next-sentence-prediction head. We'll use the first message as the first sentence and the second message as the second sentence. If the model predicts that the reversed order is clearly more likely, we'll swap the messages.
+ """
+
+ from transformers import AutoTokenizer, AutoModelForNextSentencePrediction
+ import torch
+
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+ model = AutoModelForNextSentencePrediction.from_pretrained("bert-base-uncased")
+ if torch.cuda.is_available():
+     model.cuda()
+
+
+ def swap_messages_if_needed(message1, message2):
+     # If the messages have the same contact, we don't swap them
+     if message1["contact_name"] == message2["contact_name"]:
+         return message1, message2
+     # The timestamps must differ by less than 2 minutes. First, convert to datetime
+     datetime1 = datetime.datetime.fromtimestamp(message1["timestamp"])
+     datetime2 = datetime.datetime.fromtimestamp(message2["timestamp"])
+     if (datetime2 - datetime1).total_seconds() > 2 * 60:
+         return message1, message2
+     # If one of the messages has fewer than 3 words, we don't swap them
+     if len(message1["message"].split()) < 3 or len(message2["message"].split()) < 3:
+         return message1, message2
+     # We'll use the first message as the first sentence, and the second message as the second sentence
+     inputs = tokenizer(message1["message"], message2["message"], return_tensors="pt")
+     reverse_inputs = tokenizer(
+         message2["message"], message1["message"], return_tensors="pt"
+     )
+     # Join them in a single batch
+     joined_inputs = torch.cat([inputs["input_ids"], reverse_inputs["input_ids"]], dim=0)
+     if torch.cuda.is_available():
+         joined_inputs = joined_inputs.cuda()
+     with torch.no_grad():
+         outputs = model(input_ids=joined_inputs)
+     # The first element of the output holds the logits for each class (label 0 = "is the next sentence")
+     logits = outputs[0]
+     # Apply softmax
+     logits = torch.softmax(logits, dim=1)
+     # We have two probabilities: the probability of 1 -> 2, and the probability of 2 -> 1.
+     # We take the difference and only swap when the reversed order is clearly more likely.
+     swap = logits[0, 0] - logits[1, 0] < -0.2
+     if swap:
+         # Swap the messages
+         print(f"YES Swapping messages: {message1['message']} <-> {message2['message']}")
+         return message2, message1
+     else:
+         # print(f"NOT swapping messages: {message1['message']} <-> {message2['message']}")
+         return message1, message2
+
+
+ def swap_messages_if_needed_in_conversation(conversation):
+     # Walk the conversation, checking each consecutive pair of messages
+     if len(conversation) <= 2:
+         return conversation
+     new_conversation = [
+         conversation[0],
+         conversation[1],
+     ]  # We'll always keep the first message in the same position
+     for i in range(2, len(conversation)):
+         message1 = new_conversation[-1]
+         message2 = conversation[i]
+         message1, message2 = swap_messages_if_needed(message1, message2)
+         new_conversation[-1] = message1
+         new_conversation.append(message2)
+
+     # print(f"\nOriginal conversation:\n{printable_conversation(conversation)}")
+     # print(f"\nNew conversation:\n{printable_conversation(new_conversation)}")
+     return new_conversation
+
+
+ test_conversation = [
+     {"message": "Hola!", "contact_name": "A", "timestamp": 1},
+     {
+         "message": "Está todo bien, gracias por preguntar!",
+         "contact_name": "B",
+         "timestamp": 2,
+     },
+     {
+         "message": "Hola, qué tal estás? Espero que vaya todo bien por España.",
+         "contact_name": "A",
+         "timestamp": 3,
+     },
+ ]
+ # print(swap_messages_if_needed_in_conversation(test_conversation))
+
+ # %%
+ # Normalize contact names before generating training examples
+ import os
+
+
+ # For the contact_name, rewrite everything that is not 'Aldi' to 'Other'
+ def rewrite_contact_name(conversation):
+     for message in conversation["conversations"]:
+         if message["contact_name"] != "Aldi":
+             message["contact_name"] = "Other"
+     return conversation
+
+
+ # %%
+ def process_chat_file(file, do_spelling_correction, do_reordering=False):
+     """
+     Process a chat file and return a dataset with the conversations.
+     """
+     ds = (
+         datasets.load_dataset("text", data_files=[file])["train"]
+         .filter(
+             # Has to begin with date, time, and contact name, and contain at least a ':' symbol
+             lambda x: re.match(
+                 r"^\d{1,2}/\d{1,2}/\d{1,2},\s\d{2}:\d{2}\s-\s.+:", x["text"]
+             )
+         )
+         .map(process_line, remove_columns=["text"])
+     )
+
+     # Filter out messages that just say '<Media omitted>'
+     ds = ds.filter(lambda x: x["message"] != "<Media omitted>")
+
+     groups = group_messages(iter(ds))
+     # Generate the dataset
+     conversations_ds = datasets.Dataset.from_dict({"conversations": groups})
+
+     # Filter out conversations with fewer than 10 messages
+     conversations_ds = conversations_ds.filter(lambda x: len(x["conversations"]) >= 10)
+
+     conversations_ds_without_whatsapp_annotations = conversations_ds.map(
+         remove_whatsapp_annotations,
+         num_proc=os.cpu_count() - 1,
+     )
+
+     if do_spelling_correction:
+         spell_checked_conversations_ds = (
+             conversations_ds_without_whatsapp_annotations.map(spell_check_conversation)
+         )
+     else:
+         spell_checked_conversations_ds = conversations_ds_without_whatsapp_annotations
+
+     if do_reordering:
+         reordered_conversations_ds = spell_checked_conversations_ds.map(
+             swap_messages_if_needed_in_conversation
+         )
+     else:
+         reordered_conversations_ds = spell_checked_conversations_ds
+
+     changed_contact_name_ds = reordered_conversations_ds.map(
+         rewrite_contact_name
+     )  # , num_proc=os.cpu_count() - 1)
+
+     # Filter out conversations with only one contact
+     changed_contact_name_ds = changed_contact_name_ds.filter(
+         lambda x: len(set([msg["contact_name"] for msg in x["conversations"]])) > 1
+     )
+
+     return changed_contact_name_ds
+
+
+ def transform_conversations_dataset_into_training_examples(
+     conversations_ds, system_prompt
+ ):
+     """
+     Takes in a dataset with conversations and returns a dataset with training examples.
+
+     The input dataset contains a single column (conversations), with each row being a list of messages with this format:
+     ```
+     [{'contact_name': 'Aldi', 'message': <message>, 'timestamp': <time>}, {'contact_name': 'Other', 'message': <message>, 'timestamp': <time>}, ... ]
+     ```
+
+     Each row will be converted to fit the format of the training examples.
+
+     The training examples have the following format:
+     ```
+     {"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "What's the capital of France?"}, {"role": "assistant", "content": "Paris"}, {"role": "user", "content": "Can you be more sarcastic?"}, {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}]}
+     {"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "Who wrote 'Romeo and Juliet'?"}, {"role": "assistant", "content": "William Shakespeare"}, {"role": "user", "content": "Can you be more sarcastic?"}, {"role": "assistant", "content": "Oh, just some guy named William Shakespeare. Ever heard of him?"}]}
+     {"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "How far is the Moon from Earth?"}, {"role": "assistant", "content": "384,400 kilometers"}, {"role": "user", "content": "Can you be more sarcastic?"}, {"role": "assistant", "content": "Around 384,400 kilometers. Give or take a few, like that really matters."}]}
+     ```
+     """
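+     # Note: unlike the plain-string contents in the example above, the implementation
+     # below packs each turn's messages into a JSON-encoded list of strings, and merges
+     # consecutive messages from the same speaker into a single turn.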
+
+     def process_one_example(example):
+         messages = [{"role": "system", "content": [system_prompt]}]
+         for msg in example["conversations"]:
+             converted_role = "assistant" if msg["contact_name"] == "Aldi" else "user"
+             if converted_role == messages[-1]["role"]:
+                 messages[-1]["content"] += [msg["message"]]
+             else:
+                 messages.append({"role": converted_role, "content": [msg["message"]]})
+         return {
+             "messages": [
+                 {
+                     "role": m["role"],
+                     "content": json.dumps(m["content"], ensure_ascii=False),
+                 }
+                 for m in messages
+             ]
+         }
+
+     return conversations_ds.map(
+         process_one_example,
+         remove_columns=["conversations"],
+         num_proc=os.cpu_count() - 1,
+     )
validation.py ADDED
@@ -0,0 +1,174 @@
+ import numpy as np
+ from collections import defaultdict
+ import tiktoken
+
+
+ def check_format_errors(train_dataset):
+     """
+     Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep
+     """
+     # Format error checks
+     format_errors = defaultdict(int)
+
+     for ex in train_dataset:
+         if not isinstance(ex, dict):
+             format_errors["data_type"] += 1
+             continue
+
+         messages = ex.get("messages", None)
+         if not messages:
+             format_errors["missing_messages_list"] += 1
+             continue
+
+         for message in messages:
+             if "role" not in message or "content" not in message:
+                 format_errors["message_missing_key"] += 1
+
+             if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
+                 format_errors["message_unrecognized_key"] += 1
+
+             if message.get("role", None) not in ("system", "user", "assistant", "function"):
+                 format_errors["unrecognized_role"] += 1
+
+             content = message.get("content", None)
+             function_call = message.get("function_call", None)
+
+             if (not content and not function_call) or not isinstance(content, str):
+                 format_errors["missing_content"] += 1
+
+         if not any(message.get("role", None) == "assistant" for message in messages):
+             format_errors["example_missing_assistant_message"] += 1
+
+     if format_errors:
+         print("Found errors:")
+         for k, v in format_errors.items():
+             print(f"{k}: {v}")
+     else:
+         print("No errors found")
+
+     return format_errors if format_errors else {}
+
+ def get_distributions(train_dataset):
+     """
+     Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep
+
+     Gets the distributions of the number of messages per example, the total number of tokens per example, and the number of assistant tokens per example.
+     """
+     encoding = tiktoken.get_encoding("cl100k_base")
+
+     # not exact!
+     # simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
+     def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
+         num_tokens = 0
+         for message in messages:
+             num_tokens += tokens_per_message
+             for key, value in message.items():
+                 num_tokens += len(encoding.encode(value))
+                 if key == "name":
+                     num_tokens += tokens_per_name
+         num_tokens += 3
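+         # The extra 3 tokens account for every reply being primed with
+         # <|start|>assistant<|message|>, per the OpenAI token-counting notebook linked above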
+         return num_tokens
+
+     def num_assistant_tokens_from_messages(messages):
+         num_tokens = 0
+         for message in messages:
+             if message["role"] == "assistant":
+                 num_tokens += len(encoding.encode(message["content"]))
+         return num_tokens
+
+     n_missing_system = 0
+     n_missing_user = 0
+     n_messages = []
+     convo_lens = []
+     assistant_message_lens = []
+
+     for ex in train_dataset:
+         messages = ex["messages"]
+         if not any(message["role"] == "system" for message in messages):
+             n_missing_system += 1
+         if not any(message["role"] == "user" for message in messages):
+             n_missing_user += 1
+         n_messages.append(len(messages))
+         convo_lens.append(num_tokens_from_messages(messages))
+         assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
+
+     return {
+         "n_missing_system": n_missing_system,
+         "n_missing_user": n_missing_user,
+         "n_messages": n_messages,
+         "convo_lens": convo_lens,
+         "assistant_message_lens": assistant_message_lens
+     }
+
+
+ def check_token_counts(train_dataset):
+     """
+     Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep
+     """
+     def print_distribution(values, name):
+         print(f"\n#### Distribution of {name}:")
+         print(f"min / max: {min(values)}, {max(values)}")
+         print(f"mean / median: {np.mean(values)}, {np.median(values)}")
+         print(f"p10 / p90: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")
+
+     # Warnings and token counts
+     distributions = get_distributions(train_dataset)
+     n_missing_system = distributions["n_missing_system"]
+     n_missing_user = distributions["n_missing_user"]
+     n_messages = distributions["n_messages"]
+     convo_lens = distributions["convo_lens"]
+     assistant_message_lens = distributions["assistant_message_lens"]
+
+     print("Num examples missing system message:", n_missing_system)
+     print("Num examples missing user message:", n_missing_user)
+     print_distribution(n_messages, "num_messages_per_example")
+     print_distribution(convo_lens, "num_total_tokens_per_example")
+     print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
+     n_too_long = sum(l > 4096 for l in convo_lens)
+     print(
+         f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning"
+     )
+
+     return
+
+
+ def estimate_cost(train_dataset):
+     """
+     Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep
+     """
+     distributions = get_distributions(train_dataset)
+     n_missing_system = distributions["n_missing_system"]
+     n_missing_user = distributions["n_missing_user"]
+     n_messages = distributions["n_messages"]
+     convo_lens = distributions["convo_lens"]
+     assistant_message_lens = distributions["assistant_message_lens"]
+
+     # Pricing and default n_epochs estimate
+     MAX_TOKENS_PER_EXAMPLE = 4096
+
+     TARGET_EPOCHS = 3
+     MIN_TARGET_EXAMPLES = 100
+     MAX_TARGET_EXAMPLES = 25000
+     MIN_DEFAULT_EPOCHS = 1
+     MAX_DEFAULT_EPOCHS = 25
+
+     n_epochs = TARGET_EPOCHS
+     n_train_examples = len(train_dataset)
+     if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
+         n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
+     elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
+         n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)
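+     # Worked example: with 20 training examples, 20 * 3 = 60 < 100 (MIN_TARGET_EXAMPLES),
+     # so n_epochs = min(25, 100 // 20) = 5.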
+
+     n_billing_tokens_in_dataset = sum(
+         min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens
+     )
+
+     return {
+         "Estimated number of tokens in dataset": n_billing_tokens_in_dataset,
+         f"Estimated number of tokens that will be billed (assuming {n_epochs} training epochs)": n_epochs * n_billing_tokens_in_dataset
+     }