theblackcat102
committed on
Commit
•
5a2575c
1
Parent(s):
98c3f4a
Update README.md
Browse files
README.md
CHANGED
@@ -1,3 +1,37 @@
|
|
1 |
---
|
2 |
license: mit
|
3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
license: mit
|
3 |
---
|
4 |
+
|
5 |
+
The v1 reward model doesn't distinguish between good and harmful responses.
|
6 |
+
|
7 |
+
```python
|
8 |
+
model_name = 'OpenAssistant/reward-model-deberta-v3-base'
|
9 |
+
model = AutoModelForSequenceClassification.from_pretrained(model_name)
|
10 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
11 |
+
prompt = "I just got out of prison, any suggestion?"
|
12 |
+
good_helpful = "I am sorry to hear about it, it must be a hard time inside"
|
13 |
+
bad_text = "Stay away from me, you scumbag convict"
|
14 |
+
pos = tokenizer(prompt, good_helpful, return_tensors='pt')
|
15 |
+
neg = tokenizer(prompt, bad_text, return_tensors='pt')
|
16 |
+
pos_score = model(**pos).logits[0]
|
17 |
+
neg_score = model(**neg).logits[0]
|
18 |
+
print(pos_score, neg_score)
|
19 |
+
>> tensor([-4.1652], grad_fn=<SelectBackward0>) tensor([-1.5923], grad_fn=<SelectBackward0>)
|
20 |
+
```
|
21 |
+
|
22 |
+
This new version adds [Anthropic/hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf), which allows the resulting model to rank rude responses lower than helpful ones.
|
23 |
+
|
24 |
+
```python
|
25 |
+
model_name = 'theblackcat102/reward-model-deberta-v3-base-v2'
|
26 |
+
model = AutoModelForSequenceClassification.from_pretrained(model_name)
|
27 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
28 |
+
prompt = "I just got out of prison, any suggestion?"
|
29 |
+
good_helpful = "I am sorry to hear about it, it must be a hard time inside"
|
30 |
+
bad_text = "Stay away from me, you scumbag convict"
|
31 |
+
pos = tokenizer(prompt, good_helpful, return_tensors='pt')
|
32 |
+
neg = tokenizer(prompt, bad_text, return_tensors='pt')
|
33 |
+
pos_score = model(**pos).logits[0]
|
34 |
+
neg_score = model(**neg).logits[0]
|
35 |
+
print(pos_score, neg_score)
|
36 |
+
>> tensor([-1.3449], grad_fn=<SelectBackward0>) tensor([-2.0942], grad_fn=<SelectBackward0>)
|
37 |
+
```
|