ohjuny committed on
Commit 2afd5bf
1 Parent(s): 710a635

Filter Conversations Not Started By "human" (#35)

sharegpt/README.md CHANGED
@@ -46,7 +46,7 @@ https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolv
 pip install transformers
 ```
 
-### Filter conversations with too long prompts/responses, extract first turn, and randomly sample 500 prompts
+### Filter conversations with too long prompts/responses, conversations not started by "human", extract first turn, and randomly sample 500 prompts
 ```
 python filter_dataset.py
 ```
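For context, the new wording in the heading amounts to dropping every conversation whose opening turn is not from "human" before the first prompt is extracted. A minimal standalone sketch of that filter, assuming the usual ShareGPT schema where each record carries a "conversations" list of {"from", "value"} dicts; the input file name is illustrative:

```
import json

# Sketch of the filter described above; the file name is illustrative and the
# record layout follows the standard ShareGPT schema.
with open("ShareGPT_V3_unfiltered_cleaned_split.json") as f:
    dataset = json.load(f)

# Keep conversations with at least two turns that are started by "human",
# then extract the first (prompt, response) pair.
first_turns = [
    (data["conversations"][0]["value"], data["conversations"][1]["value"])
    for data in dataset
    if len(data["conversations"]) >= 2
    and data["conversations"][0]["from"] == "human"
]
print(f"Kept {len(first_turns)} conversations started by a human")
```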
sharegpt/ShareGPT_V3_filtered_500.json CHANGED
The diff for this file is too large to render. See raw diff
 
sharegpt/compare_distributions.py CHANGED
@@ -25,7 +25,7 @@ for name, data_list in zip(names, data_lists):
 # Get prompt lengths using tokenizer
 tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
 all_prompts = [
-    [data["conversations"][0]["value"] for data in data_lists]
+    [data["conversations"][0]["value"] for data in data_lists if data["conversations"][0]["from"] == "human"]
     for data_lists in data_lists
 ]
 all_token_ids_per_prompts = [tokenizer(prompts).input_ids for prompts in all_prompts]
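This change only affects which first-turn prompts feed into the length comparison. A rough sketch of how such prompts can be turned into length statistics with the same tokenizer; the two prompt lists below are hypothetical placeholders, not part of the script:

```
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")

def length_stats(prompts):
    # Tokenize the prompts and summarize their token lengths.
    lengths = [len(ids) for ids in tokenizer(prompts).input_ids]
    return min(lengths), sum(lengths) / len(lengths), max(lengths)

# Hypothetical example inputs standing in for the unfiltered and filtered prompt lists.
original_prompts = ["Hello, how are you?", "Summarize this article for me."]
filtered_prompts = ["Summarize this article for me."]
for name, prompts in [("original", original_prompts), ("filtered", filtered_prompts)]:
    print(name, length_stats(prompts))
```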
sharegpt/filter_dataset.py CHANGED
@@ -37,14 +37,14 @@ def filter_dataset(
         dataset = json.load(f)
     # Filter out the conversations with less than 2 turns.
     dataset = [data for data in dataset if len(data["conversations"]) >= 2]
-    # Only keep the first two turns of each conversation.
+    # Only keep the first two turns of each conversation, where the first turn is human.
     dataset = [
         (
             data["id"],
             data["conversations"][0]["value"],
             data["conversations"][1]["value"],
         )
-        for data in dataset
+        for data in dataset if data["conversations"][0]["from"] == "human"
     ]
 
     # Tokenize the prompts and completions.
@@ -98,10 +98,13 @@ def main():
     with open("ShareGPT_V3_filtered.json", "w") as f:
         json.dump(filtered_dataset, f)
 
+    print(f'Created filtered benchmark of size: {len(filtered_dataset)}')
+
     sampled_dataset = filter_dataset_to_size("ShareGPT_V3_filtered.json", 500)
     with open("ShareGPT_V3_filtered_500.json", "w") as f:
         json.dump(sampled_dataset, f)
 
+    print(f'Created sampled benchmark of size: {len(sampled_dataset)}')
 
 if __name__ == "__main__":
     main()
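filter_dataset_to_size itself is not part of this diff, but the README describes that step as randomly sampling 500 prompts from the filtered file. A hedged sketch of what such a sampling helper could look like; the function name and signature here are assumptions, not the repository's actual implementation:

```
import json
import random

def sample_dataset(path, size, seed=0):
    # Load the filtered dataset and draw a random sample of `size` records
    # (or all records if fewer are available).
    with open(path) as f:
        dataset = json.load(f)
    random.seed(seed)
    return random.sample(dataset, min(size, len(dataset)))

sampled = sample_dataset("ShareGPT_V3_filtered.json", 500)
print(f"Sampled {len(sampled)} prompts")
```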