Filter Conversations Not Started By "human" (#35)
sharegpt/README.md CHANGED

@@ -46,7 +46,7 @@ https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolv
 pip install transformers
 ```
 
-### Filter conversations with too long prompts/responses, extract first turn, and randomly sample 500 prompts
+### Filter conversations with too long prompts/responses, conversations not started by "human", extract first turn, and randomly sample 500 prompts
 ```
 python filter_dataset.py
 ```
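For context, each entry in the ShareGPT JSON that these scripts consume is a dict with an "id" and a "conversations" list of {"from", "value"} turns; the condition added in this PR keeps only entries whose first turn comes from "human". A minimal sketch with a made-up record:

```python
# Illustrative only: a made-up record in the ShareGPT_Vicuna_unfiltered format.
record = {
    "id": "example_0",
    "conversations": [
        {"from": "human", "value": "What is the capital of France?"},
        {"from": "gpt", "value": "The capital of France is Paris."},
    ],
}

# The condition this PR adds: keep only conversations started by a human turn.
starts_with_human = record["conversations"][0]["from"] == "human"
print(starts_with_human)  # True here; a gpt-initiated conversation would be dropped
```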
sharegpt/ShareGPT_V3_filtered_500.json CHANGED

The diff for this file is too large to render.
sharegpt/compare_distributions.py CHANGED

@@ -25,7 +25,7 @@ for name, data_list in zip(names, data_lists):
 # Get prompt lengths using tokenizer
 tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
 all_prompts = [
-    [data["conversations"][0]["value"] for data in data_lists]
+    [data["conversations"][0]["value"] for data in data_lists if data["conversations"][0]["from"] == "human"]
     for data_lists in data_lists
 ]
 all_token_ids_per_prompts = [tokenizer(prompts).input_ids for prompts in all_prompts]
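With this change, the prompt-length distribution is computed only over conversations whose first turn is from "human". A minimal sketch of the updated comprehension on toy data (the tokenizer and plotting code are omitted, and the inner loop variable is renamed from the script's shadowed data_lists for readability):

```python
# Toy stand-in for the list of datasets that compare_distributions.py loads.
data_lists = [
    [
        {"conversations": [{"from": "human", "value": "Hello!"}]},
        {"conversations": [{"from": "gpt", "value": "I spoke first."}]},
    ],
]

# Same filtering logic as the updated comprehension: skip conversations
# not started by "human" before collecting their first-turn prompts.
all_prompts = [
    [
        data["conversations"][0]["value"]
        for data in data_list
        if data["conversations"][0]["from"] == "human"
    ]
    for data_list in data_lists
]
print(all_prompts)  # [['Hello!']] -- the gpt-initiated conversation is excluded
```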
sharegpt/filter_dataset.py CHANGED

@@ -37,14 +37,14 @@ def filter_dataset(
         dataset = json.load(f)
     # Filter out the conversations with less than 2 turns.
     dataset = [data for data in dataset if len(data["conversations"]) >= 2]
-    # Only keep the first two turns of each conversation.
+    # Only keep the first two turns of each conversation, where the first turn is human.
     dataset = [
         (
             data["id"],
             data["conversations"][0]["value"],
             data["conversations"][1]["value"],
         )
-        for data in dataset
+        for data in dataset if data["conversations"][0]["from"] == "human"
     ]
 
     # Tokenize the prompts and completions.
@@ -98,10 +98,13 @@ def main():
     with open("ShareGPT_V3_filtered.json", "w") as f:
         json.dump(filtered_dataset, f)
 
+    print(f'Created filtered benchmark of size: {len(filtered_dataset)}')
+
     sampled_dataset = filter_dataset_to_size("ShareGPT_V3_filtered.json", 500)
     with open("ShareGPT_V3_filtered_500.json", "w") as f:
         json.dump(sampled_dataset, f)
 
+    print(f'Created sampled benchmark of size: {len(sampled_dataset)}')
 
 if __name__ == "__main__":
     main()
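Taken together, these changes mean filter_dataset.py now drops any conversation whose opening turn is not from "human" before extracting the first prompt/response pair and sampling 500 entries. Below is a condensed, illustrative sketch of that flow, not the actual script: the real script also filters out overly long prompts/responses with a tokenizer, and its filter_dataset_to_size handles the sampling (random.sample is used as a stand-in here).

```python
import json
import random


def filter_first_human_turns(dataset_path: str, sample_size: int = 500):
    """Sketch of the filtering flow after this PR (not the actual script)."""
    with open(dataset_path) as f:
        dataset = json.load(f)
    # Keep conversations with at least two turns that were started by a human,
    # and extract only the first (id, prompt, response) triple.
    filtered = [
        (
            data["id"],
            data["conversations"][0]["value"],
            data["conversations"][1]["value"],
        )
        for data in dataset
        if len(data["conversations"]) >= 2
        and data["conversations"][0]["from"] == "human"
    ]
    print(f"Created filtered benchmark of size: {len(filtered)}")
    # Stand-in for filter_dataset_to_size(...): randomly sample up to 500 entries.
    sampled = random.sample(filtered, min(sample_size, len(filtered)))
    print(f"Created sampled benchmark of size: {len(sampled)}")
    return filtered, sampled
```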