ohjuny committed on
Commit 2afd5bf
1 Parent(s): 710a635

Filter Conversations Not Started By "human" (#35)

sharegpt/README.md CHANGED
@@ -46,7 +46,7 @@ https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolv
 pip install transformers
 ```
 
-### Filter conversations with too long prompts/responses, extract first turn, and randomly sample 500 prompts
+### Filter conversations with too long prompts/responses, conversations not started by "human", extract first turn, and randomly sample 500 prompts
 ```
 python filter_dataset.py
 ```
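For context, the new wording in the heading amounts to dropping every conversation whose opening turn is not from "human" before the first prompt is extracted. A minimal standalone sketch of that filter, assuming the usual ShareGPT schema where each record carries a "conversations" list of {"from", "value"} dicts; the input file name is illustrative:

```
import json

# Sketch of the filter described above; the file name is illustrative and the
# record layout follows the standard ShareGPT schema.
with open("ShareGPT_V3_unfiltered_cleaned_split.json") as f:
    dataset = json.load(f)

# Keep conversations with at least two turns that are started by "human",
# then extract the first (prompt, response) pair.
first_turns = [
    (data["conversations"][0]["value"], data["conversations"][1]["value"])
    for data in dataset
    if len(data["conversations"]) >= 2
    and data["conversations"][0]["from"] == "human"
]
print(f"Kept {len(first_turns)} conversations started by a human")
```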
sharegpt/ShareGPT_V3_filtered_500.json CHANGED
The diff for this file is too large to render. See raw diff
 
sharegpt/compare_distributions.py CHANGED
@@ -25,7 +25,7 @@ for name, data_list in zip(names, data_lists):
 # Get prompt lengths using tokenizer
 tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
 all_prompts = [
-    [data["conversations"][0]["value"] for data in data_lists]
+    [data["conversations"][0]["value"] for data in data_lists if data["conversations"][0]["from"] == "human"]
     for data_lists in data_lists
 ]
 all_token_ids_per_prompts = [tokenizer(prompts).input_ids for prompts in all_prompts]
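This change only affects which first-turn prompts feed into the length comparison. A rough sketch of how such prompts can be turned into length statistics with the same tokenizer; the two prompt lists below are hypothetical placeholders, not part of the script:

```
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")

def length_stats(prompts):
    # Tokenize the prompts and summarize their token lengths.
    lengths = [len(ids) for ids in tokenizer(prompts).input_ids]
    return min(lengths), sum(lengths) / len(lengths), max(lengths)

# Hypothetical example inputs standing in for the unfiltered and filtered prompt lists.
original_prompts = ["Hello, how are you?", "Summarize this article for me."]
filtered_prompts = ["Summarize this article for me."]
for name, prompts in [("original", original_prompts), ("filtered", filtered_prompts)]:
    print(name, length_stats(prompts))
```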
sharegpt/filter_dataset.py CHANGED
@@ -37,14 +37,14 @@ def filter_dataset(
         dataset = json.load(f)
     # Filter out the conversations with less than 2 turns.
     dataset = [data for data in dataset if len(data["conversations"]) >= 2]
-    # Only keep the first two turns of each conversation.
+    # Only keep the first two turns of each conversation, where the first turn is human.
     dataset = [
         (
             data["id"],
             data["conversations"][0]["value"],
             data["conversations"][1]["value"],
         )
-        for data in dataset
+        for data in dataset if data["conversations"][0]["from"] == "human"
     ]
 
     # Tokenize the prompts and completions.
@@ -98,10 +98,13 @@ def main():
     with open("ShareGPT_V3_filtered.json", "w") as f:
         json.dump(filtered_dataset, f)
 
+    print(f'Created filtered benchmark of size: {len(filtered_dataset)}')
+
     sampled_dataset = filter_dataset_to_size("ShareGPT_V3_filtered.json", 500)
     with open("ShareGPT_V3_filtered_500.json", "w") as f:
         json.dump(sampled_dataset, f)
 
+    print(f'Created sampled benchmark of size: {len(sampled_dataset)}')
 
 if __name__ == "__main__":
     main()
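filter_dataset_to_size itself is not part of this diff, but the README describes that step as randomly sampling 500 prompts from the filtered file. A hedged sketch of what such a sampling helper could look like; the function name and signature here are assumptions, not the repository's actual implementation:

```
import json
import random

def sample_dataset(path, size, seed=0):
    # Load the filtered dataset and draw a random sample of `size` records
    # (or all records if fewer are available).
    with open(path) as f:
        dataset = json.load(f)
    random.seed(seed)
    return random.sample(dataset, min(size, len(dataset)))

sampled = sample_dataset("ShareGPT_V3_filtered.json", 500)
print(f"Sampled {len(sampled)} prompts")
```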