import json

import matplotlib.pyplot as plt
import numpy as np
from transformers import AutoTokenizer

# Open datasets
file_paths = ["ShareGPT_V3_filtered.json", "ShareGPT_V3_filtered_500.json"]
names = [file_path[:-5] for file_path in file_paths]  # strip the ".json" suffix
data_lists = []
for file_path in file_paths:
    with open(file_path, "r", encoding="utf-8") as file:
        data_lists.append(json.load(file))

for name, data_list in zip(names, data_lists):
    print(f"{name}: {len(data_list)}")

# Get prompt lengths using tokenizer: for each dataset, collect the opening
# human message of every conversation and count its tokens.
tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
all_prompts = [
    [
        data["conversations"][0]["value"]
        for data in data_list
        if data["conversations"][0]["from"] == "human"
    ]
    for data_list in data_lists
]
all_token_ids_per_prompts = [tokenizer(prompts).input_ids for prompts in all_prompts]
all_prompt_lens = [
    [len(token_ids) for token_ids in token_ids_per_prompt]
    for token_ids_per_prompt in all_token_ids_per_prompts
]

# Plotting the histograms, one figure per dataset
for name, prompt_lens in zip(names, all_prompt_lens):
    plt.hist(
        prompt_lens,
        bins=range(min(prompt_lens), max(prompt_lens) + 1),
        edgecolor="black",
    )
    plt.xlabel("Prompt Length (number of tokens)")
    plt.ylabel("Frequency")
    plt.title(f"Histogram of {name}")
    plt.savefig(f"{name}_distribution.png")
    plt.close()

# Plotting the CDFs of all datasets overlaid on a single figure
for name, prompt_lens in zip(names, all_prompt_lens):
    # np.unique returns the values already sorted, so the cumulative sum of
    # their relative frequencies is the empirical CDF.
    values, counts = np.unique(prompt_lens, return_counts=True)
    cumulative_frequencies = np.cumsum(counts) / len(prompt_lens)
    plt.step(values, cumulative_frequencies, where="post", label=name)
plt.title("Cumulative Distribution Function (CDF), Overlaid")
plt.xlabel("Prompt Length (number of tokens)")
plt.ylabel("Cumulative Probability")
plt.legend()
plt.savefig("prompt_length_cdf.png")
plt.close()