Haon-Chen committed (verified)
Commit 394f0b9 · Parent(s): 944cb49

Update README.md

Files changed (1)
  1. README.md +118 -111
README.md CHANGED
@@ -1,111 +1,118 @@
 ---
 license: mit
+language:
+- en
+base_model:
+- meta-llama/Meta-Llama-3-8B
+pipeline_tag: text-generation
+tags:
+- transformers
 ---

## SPEED-synthesis-7b-senior

[Little Giants: Synthesizing High-Quality Embedding Data at Scale](https://arxiv.org/pdf/2410.18634.pdf). Haonan Chen, Liang Wang, Nan Yang, Yutao Zhu, Ziliang Zhao, Furu Wei, Zhicheng Dou, arXiv 2024

This is the senior data synthesis model of SPEED.

## Usage

Below is an example of synthesizing classification data with this senior generator.

The prompts and miscellaneous scripts can be found in our [GitHub repository](https://github.com/haon-chen/SPEED).
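If you would rather run the example without cloning that repository, the two helpers it imports can be stubbed. Below is a rough, hypothetical sketch of such stand-ins; the real `get_create_classify_data_prompt` and `fix_common_json_errors_and_loads` in the repo are more elaborate, so prefer those for faithful reproduction.

```python
import json

# Hypothetical stand-in: builds a chat-style message list whose second entry
# holds the user prompt, matching how the example below indexes [1]['content'].
def get_create_classify_data_prompt(task: str, language: str):
    user_msg = (
        f"You have been assigned a text classification task: {task}\n"
        f'Respond in JSON with keys "input_text", "label", and '
        f'"misleading_label", all written in {language}.'
    )
    return [{"role": "system", "content": ""}, {"role": "user", "content": user_msg}]

# Hypothetical stand-in: trim to the outermost JSON object and parse it.
def fix_common_json_errors_and_loads(text: str):
    start, end = text.find("{"), text.rfind("}")
    return json.loads(text[start:end + 1])
```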
### Transformers

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

from prompts_synthesis import get_create_classify_data_prompt
from utils import fix_common_json_errors_and_loads


# Instruction wrapper the generator expects around each prompt.
LLAMA3_PROMPT = """
{prompt} [/INST]
""".strip("\n")

# Each query must come with a one-sentence instruction that describes the task.
tasks = [
    'Identify the intended age group for educational technology products.',
    'Classify businesses based on their operational hours.',
]
language = 'English'

prompts = [LLAMA3_PROMPT.format(prompt=get_create_classify_data_prompt(task=task, language=language)[1]['content']) for task in tasks]

tokenizer = AutoTokenizer.from_pretrained('Haon-Chen/speed-synthesis-7b-senior')
model = AutoModelForCausalLM.from_pretrained('Haon-Chen/speed-synthesis-7b-senior')
model.to("cuda:0")
model.eval()
tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"

with torch.inference_mode():
    # Tokenize the input texts.
    encodes = tokenizer(prompts, padding="longest", add_special_tokens=True, return_tensors="pt")
    input_ids = encodes.input_ids.to(model.device)
    attention_mask = encodes.attention_mask.to(model.device)

    # Set the generation parameters.
    GEN_CONFIG = {"do_sample": True, "temperature": 1.0, "top_p": 1.0, "max_new_tokens": 800}
    output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        **GEN_CONFIG,
    )
    output_texts = tokenizer.batch_decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    # Drop the echoed prompt so only the newly generated text remains.
    batch_results = []
    for i in range(len(output_texts)):
        batch_results.append(output_texts[i][len(prompts[i]):].strip(' '))

# Parse each generation into a training example, skipping malformed JSON.
bad_cnt = 0
outputs = []
for i, result in enumerate(batch_results):
    try:
        output = fix_common_json_errors_and_loads(result)
        user_query = output.get("input_text", "")
        positive_document = output.get("label", "")
        hard_negative_document = output.get("misleading_label", "")
    except Exception:
        bad_cnt += 1
        continue
    out_data = {
        "query": user_query,
        "positives": [positive_document],
        "negatives": [hard_negative_document],
        "language": language,
        "task_definition": tasks[i],
    }
    outputs.append(out_data)
print(bad_cnt)  # number of generations discarded as unparseable
print(outputs)
```
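The parsed examples are ordinary Python dicts, so persisting them for embedding-model training is a one-liner per record. A minimal sketch, assuming JSONL output and a hypothetical `classification_data.jsonl` path, continuing from the `outputs` list above:

```python
import json

# Write one JSON object per line (JSONL), a common layout for training data.
with open("classification_data.jsonl", "a", encoding="utf-8") as f:
    for example in outputs:
        f.write(json.dumps(example, ensure_ascii=False) + "\n")
```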

## Citation

If you find our paper or models helpful, please consider citing us as follows:

```bibtex
@article{chen2024little,
  title={Little Giants: Synthesizing High-Quality Embedding Data at Scale},
  author={Chen, Haonan and Wang, Liang and Yang, Nan and Zhu, Yutao and Zhao, Ziliang and Wei, Furu and Dou, Zhicheng},
  journal={arXiv preprint arXiv:2410.18634},
  year={2024}
}
```

## Limitations