AmberLJC commited on
Commit
906b628
·
1 Parent(s): 19b22c9
sharegpt/.DS_Store ADDED
Binary file (6.15 kB). View file
 
sharegpt/README.md ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ## Download ShareGPT:
3
+ ```
4
+ https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part1_html_cleaned.json
5
+
6
+ https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part2_html_cleaned.json
7
+ ```
8
+
9
+ ## Install FastChat
10
+ ```
11
+ pip3 install fschat
12
+ ```
13
+
14
+ ## Clean data:
15
+ ```
16
+ pip3 install polyglot pyicu pycld2
17
+ python3 -m fastchat.data.optional_clean --in sg_90k_part1_html_cleaned.json --out sg_90k_part1_html_cleaned_lang.json --keep-lang en
18
+ ```
19
+
20
+ ## Extract the first message of each conversation (optional)
21
+ ```
22
+ python extract_first.py --in-file sg_90k_part1_html_cleaned_lang.json --out-file sg_90k_part1_html_cleaned_lang_first.json
23
+ ```
24
+
25
+ ## Sample data (optional)
26
+ ```
27
+ python3 -m fastchat.data.sample --in sg_90k_part1_html_cleaned_lang_first.json --out sg_90k_part1_html_cleaned_lang_first_sampled.json --end 10000 --max-length 10000
28
+ ```
29
+
30
+ ## ShareGPT Feeder Usage
31
+
32
+ ```
33
+ from sharegpt_feeder import sharegpt_generator
34
+ prompts = sharegpt_generator()
35
+ print(next(prompts))
36
+ print(next(prompts))
37
+ ```
sharegpt/extract_first.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+
4
+
5
def extract_first_sen(content):
    """Keep only the first conversation turn of every record.

    Args:
        content: Iterable of dicts, each holding a ``'conversations'`` list.

    Returns:
        A new list of shallow-copied records whose ``'conversations'`` value
        is a list containing just the first turn. A record whose conversation
        list is empty is kept, with an empty list.

    The input records are left untouched; the previous implementation
    aliased each record and truncated the caller's data in place.
    """
    # Slicing with [:1] (rather than indexing [0]) tolerates an empty
    # conversation list instead of raising IndexError.
    return [
        {**item, 'conversations': item['conversations'][:1]}
        for item in content
    ]
12
+
13
+
14
def main(args):
    """Read the input JSON, keep the first turn per record, write the result.

    Args:
        args: Mapping with the keys ``"in_file"`` (path of the JSON file to
            read) and ``"out_file"`` (path of the JSON file to write).
    """
    # Context managers guarantee both handles are closed (and the output is
    # flushed) even if parsing or serialization raises.
    with open(args["in_file"], "r", encoding="utf-8") as in_fp:
        content = json.load(in_fp)
    content = extract_first_sen(content)
    with open(args["out_file"], "w", encoding="utf-8") as out_fp:
        json.dump(content, out_fp, indent=2)
18
+
19
if __name__ == "__main__":
    # Command-line entry point: collect the input/output paths and hand them
    # to main() as a plain dict keyed by "in_file" / "out_file".
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--in-file",
        type=str,
        default='sg_90k_part1_html_cleaned_lang.json',
    )
    cli.add_argument(
        "--out-file",
        type=str,
        default="sg_90k_part1_html_cleaned_lang_first.json",
    )
    main(vars(cli.parse_args()))
25
+
26
+
sharegpt/sharegpt_feeder.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ''' Usage
2
+ sharegpt_generator = sharegpt_generator()
3
+ print(next(sharegpt_generator))
4
+ print(next(sharegpt_generator))
5
+ print(next(sharegpt_generator))
6
+ '''
7
+ import json
8
+
9
def sharegpt_generator(file = 'sg_90k_part1_html_cleaned_lang.json'):
    """Yield the text of the first turn of every conversation in a JSON file.

    Args:
        file: Path of a JSON file containing a list of records, each with a
            ``'conversations'`` list of dicts that carry a ``'value'`` string.

    Yields:
        The ``'value'`` of the first entry of each record's conversation list.
        Records without any conversation turns are skipped rather than
        aborting the iteration with an IndexError.
    """
    # The whole file is parsed up front, so the handle can be closed before
    # the first item is yielded.
    with open(file, "r", encoding="utf-8") as fp:
        content = json.load(fp)
    for item in content:
        conversations = item.get('conversations')
        if not conversations:
            continue
        yield conversations[0]['value']
13
+
14
+
15
+