Haon-Chen committed on
Commit
1bd8201
1 Parent(s): 5226e20

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +119 -112
README.md CHANGED
@@ -1,112 +1,119 @@
1
- ---
2
- license: mit
3
- ---
4
-
5
- ## SPEED-synthesis-7b-senior
6
-
7
- [Little Giants: Synthesizing High-Quality Embedding Data at Scale](https://arxiv.org/pdf/2410.18634.pdf). Haonan Chen, Liang Wang, Nan Yang, Yutao Zhu, Ziliang Zhao, Furu Wei, Zhicheng Dou, arXiv 2024
8
-
9
- This is the data revisor model of SPEED.
10
-
11
- ## Usage
12
-
13
- Below is an example to revise s2s data using this revisor.
14
-
15
- The prompts and misc scripts can be found in our [github page](https://github.com/haon-chen/SPEED)
16
-
17
- ### Transformers
18
-
19
- ```python
20
- import torch
21
- import os
22
- import random
23
- import numpy as np
24
- import json
25
-
26
-
27
- from torch import Tensor
28
- from transformers import AutoTokenizer, AutoModelForCausalLM
29
- from typing import List, Dict, Optional
30
-
31
- from prompts_aligning import get_create_all_revise_data_prompt
32
- from utils import fix_common_json_errors_and_loads_for_revisor
33
-
34
-
35
- LLAMA3_PROMPT = """
36
- {prompt} [/INST]
37
- """.strip("\n")
38
-
39
- # Each query must come with a one-sentence instruction that describes the task
40
- old_prompts = [
41
- "You have been assigned a text matching task: Match a Stockard Channing movie title with a brief plot description.\n\nYour mission is to write one example for this task in JSON format. The JSON object must contain the following keys:\n- \"input\": a string, a random input specified by the task.\n- \"positive_document\": a string, a relevant document for the \"input\" according to the task.\n\nPlease adhere to the following guidelines:\n- The values of all fields should be in English.\n- Both the \"input\" and \"positive_document\" should be very short (a sentence or a phrase), avoid substantial word overlaps, otherwise the task would be too easy.\n- The \"input\" and \"positive_document\" should be independent of each other.\n\nYour output must always be a JSON object only, do not explain yourself or output anything else. Be creative!"
42
- ]
43
- old_data = [
44
- {"input": "Stockard Channing in 'The Business of Strangers', directed by Patrick Stettner.", "positive_document": "In 'The Business of Strangers', Channing stars as a businesswoman who embarks on a ruthless journey, after which she undergoes a drastic change. She faces many challenges while pursuing her goals and eventually comes out stronger."},
45
- ]
46
- language = 'English'
47
-
48
- prompts = [LLAMA3_PROMPT.format(prompt=get_create_all_revise_data_prompt(prompt=old_prompt, data=json.dumps(data))[1]['content']) for old_prompt in old_prompts for data in old_data]
49
-
50
- tokenizer = AutoTokenizer.from_pretrained('Haon-Chen/speed-synthesis-7b-revisor')
51
- model = AutoModelForCausalLM.from_pretrained('Haon-Chen/speed-synthesis-7b-revisor')
52
- model.to("cuda:0")
53
- tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token
54
- tokenizer.padding_side = "left"
55
- tokenizer.truncation_side = "left"
56
-
57
- # Tokenize the input texts
58
- encodes = tokenizer(prompts, padding="longest", add_special_tokens=True, return_tensors="pt")
59
- input_ids = encodes.input_ids.to(model.device)
60
- attention_mask = encodes.attention_mask.to(model.device)
61
-
62
- GEN_CONFIG = {"do_sample":True, "temperature": 1.0, "top_p": 1.0, "max_new_tokens": 800}
63
- output = model.generate(
64
- input_ids=input_ids,
65
- attention_mask=attention_mask,
66
- pad_token_id = tokenizer.eos_token_id,
67
- **GEN_CONFIG
68
- )
69
- output_texts = tokenizer.batch_decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=False)
70
- batch_results = []
71
- for i in range(len(output_texts)):
72
- batch_results.append(output_texts[i][len(prompts[i]):].strip(' '))
73
-
74
- bad_cnt=0
75
- outputs = []
76
- for i, result in enumerate(batch_results):
77
- try:
78
- content = fix_common_json_errors_and_loads_for_revisor(result)
79
- revision = content["revision"]
80
- reason = content["reason"]
81
-
82
- user_query = revision.get("input", "")
83
- positive_document = revision.get("positive_document", "")
84
- except:
85
- bad_cnt+=1
86
- continue
87
- out_data = {
88
- "query": user_query,
89
- "positives": [positive_document],
90
- "negatives": [],
91
- "language": "English",
92
- "reason": reason,
93
- }
94
- outputs.append(out_data)
95
- print(bad_cnt)
96
- print(outputs)
97
- ```
98
-
99
- ## Citation
100
-
101
- If you find our paper or models helpful, please consider cite as follows:
102
-
103
- ```bibtex
104
- @article{chen2024little,
105
- title={Little Giants: Synthesizing High-Quality Embedding Data at Scale},
106
- author={Chen, Haonan and Wang, Liang and Yang, Nan and Zhu, Yutao and Zhao, Ziliang and Wei, Furu and Dou, Zhicheng},
107
- journal={arXiv preprint arXiv:2410.18634},
108
- year={2024}
109
- }
110
- ```
111
-
112
- ## Limitations
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ base_model:
6
+ - meta-llama/Meta-Llama-3-8B
7
+ pipeline_tag: text-generation
8
+ tags:
9
+ - transformers
10
+ ---
11
+
12
+ ## SPEED-synthesis-7b-revisor
13
+
14
+ [Little Giants: Synthesizing High-Quality Embedding Data at Scale](https://arxiv.org/pdf/2410.18634.pdf). Haonan Chen, Liang Wang, Nan Yang, Yutao Zhu, Ziliang Zhao, Furu Wei, Zhicheng Dou, arXiv 2024
15
+
16
+ This is the data revisor model of SPEED.
17
+
18
+ ## Usage
19
+
20
+ Below is an example of how to revise s2s data using this revisor.
21
+
22
+ The prompts and miscellaneous scripts can be found in our [GitHub repository](https://github.com/haon-chen/SPEED).
23
+
24
+ ### Transformers
25
+
26
+ ```python
27
+ import torch
28
+ import os
29
+ import random
30
+ import numpy as np
31
+ import json
32
+
33
+
34
+ from torch import Tensor
35
+ from transformers import AutoTokenizer, AutoModelForCausalLM
36
+ from typing import List, Dict, Optional
37
+
38
+ from prompts_aligning import get_create_all_revise_data_prompt
39
+ from utils import fix_common_json_errors_and_loads_for_revisor
40
+
41
+
42
+ LLAMA3_PROMPT = """
43
+ {prompt} [/INST]
44
+ """.strip("\n")
45
+
46
+ # Each query must come with a one-sentence instruction that describes the task
47
+ old_prompts = [
48
+ "You have been assigned a text matching task: Match a Stockard Channing movie title with a brief plot description.\n\nYour mission is to write one example for this task in JSON format. The JSON object must contain the following keys:\n- \"input\": a string, a random input specified by the task.\n- \"positive_document\": a string, a relevant document for the \"input\" according to the task.\n\nPlease adhere to the following guidelines:\n- The values of all fields should be in English.\n- Both the \"input\" and \"positive_document\" should be very short (a sentence or a phrase), avoid substantial word overlaps, otherwise the task would be too easy.\n- The \"input\" and \"positive_document\" should be independent of each other.\n\nYour output must always be a JSON object only, do not explain yourself or output anything else. Be creative!"
49
+ ]
50
+ old_data = [
51
+ {"input": "Stockard Channing in 'The Business of Strangers', directed by Patrick Stettner.", "positive_document": "In 'The Business of Strangers', Channing stars as a businesswoman who embarks on a ruthless journey, after which she undergoes a drastic change. She faces many challenges while pursuing her goals and eventually comes out stronger."},
52
+ ]
53
+ language = 'English'
54
+
55
+ prompts = [LLAMA3_PROMPT.format(prompt=get_create_all_revise_data_prompt(prompt=old_prompt, data=json.dumps(data))[1]['content']) for old_prompt in old_prompts for data in old_data]
56
+
57
+ tokenizer = AutoTokenizer.from_pretrained('Haon-Chen/speed-synthesis-7b-revisor')
58
+ model = AutoModelForCausalLM.from_pretrained('Haon-Chen/speed-synthesis-7b-revisor')
59
+ model.to("cuda:0")
60
+ tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token
61
+ tokenizer.padding_side = "left"
62
+ tokenizer.truncation_side = "left"
63
+
64
+ # Tokenize the input texts
65
+ encodes = tokenizer(prompts, padding="longest", add_special_tokens=True, return_tensors="pt")
66
+ input_ids = encodes.input_ids.to(model.device)
67
+ attention_mask = encodes.attention_mask.to(model.device)
68
+
69
+ GEN_CONFIG = {"do_sample":True, "temperature": 1.0, "top_p": 1.0, "max_new_tokens": 800}
70
+ output = model.generate(
71
+ input_ids=input_ids,
72
+ attention_mask=attention_mask,
73
+ pad_token_id = tokenizer.eos_token_id,
74
+ **GEN_CONFIG
75
+ )
76
+ output_texts = tokenizer.batch_decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=False)
77
+ batch_results = []
78
+ for i in range(len(output_texts)):
79
+ batch_results.append(output_texts[i][len(prompts[i]):].strip(' '))
80
+
81
+ bad_cnt=0
82
+ outputs = []
83
+ for i, result in enumerate(batch_results):
84
+ try:
85
+ content = fix_common_json_errors_and_loads_for_revisor(result)
86
+ revision = content["revision"]
87
+ reason = content["reason"]
88
+
89
+ user_query = revision.get("input", "")
90
+ positive_document = revision.get("positive_document", "")
91
+ except:
92
+ bad_cnt+=1
93
+ continue
94
+ out_data = {
95
+ "query": user_query,
96
+ "positives": [positive_document],
97
+ "negatives": [],
98
+ "language": "English",
99
+ "reason": reason,
100
+ }
101
+ outputs.append(out_data)
102
+ print(bad_cnt)
103
+ print(outputs)
104
+ ```
105
+
106
+ ## Citation
107
+
108
+ If you find our paper or models helpful, please consider citing them as follows:
109
+
110
+ ```bibtex
111
+ @article{chen2024little,
112
+ title={Little Giants: Synthesizing High-Quality Embedding Data at Scale},
113
+ author={Chen, Haonan and Wang, Liang and Yang, Nan and Zhu, Yutao and Zhao, Ziliang and Wei, Furu and Dou, Zhicheng},
114
+ journal={arXiv preprint arXiv:2410.18634},
115
+ year={2024}
116
+ }
117
+ ```
118
+
119
+ ## Limitations