aliacar committed b0a3195 (parent: 7a27dcc): Create README.md
Files changed (1): README.md (+114, -0)
---
language:
- en
---
# How to use the model

## Load the model and tokenizer
```
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer

# Create new tensors on the GPU by default.
torch.set_default_device("cuda")

model_name = "dcipheranalytics/phi-2-pii-bbi"

# Quantize the weights to 4-bit NF4 with bfloat16 compute to cut GPU memory use.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    quantization_config=quantization_config,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
```
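
As an optional sanity check that the 4-bit load worked, you can print the model's memory footprint (`get_memory_footprint` is a standard `transformers` model method; the expected size below is a rough estimate, not a documented figure):
```
# A 4-bit quantized ~2.7B-parameter model should report roughly 2 GB.
print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")
```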

## Call the generate method
```
def generate(msg: str, max_new_tokens=300, temperature=0.3):
    # Wrap the instruction in the ChatML-style template the model expects.
    chat_template = "<|im_start|>user\n{msg}<|im_end|><|im_start|>assistant\n"
    prompt = chat_template.format(msg=msg)

    with torch.no_grad():
        token_ids = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
        output_ids = model.generate(
            token_ids.to(model.device),
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Drop the prompt tokens and the final token (the EOS when generation
    # stops naturally), then decode the completion.
    output = tokenizer.decode(output_ids[0][token_ids.size(1):-1]).strip()
    return output

instruction_template = "List the personally identifiable information in the given text below.\nText:########\n{text}\n########"
text_with_pii = "My passport number is 123456789."
generate(instruction_template.format(text=text_with_pii))
```
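
The exact output format is not documented here; assuming the model lists one PII item per line (an assumption for illustration, not a documented contract), a minimal post-processing sketch could look like this:
```
# Hypothetical helper: assumes one PII item per line of model output.
def parse_pii_items(output: str) -> list:
    return [line.lstrip("-* ").strip() for line in output.splitlines() if line.strip()]

items = parse_pii_items(generate(instruction_template.format(text=text_with_pii)))
print(items)  # e.g. a list containing the passport number
```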

## Batch predictions
```
from transformers import TextGenerationPipeline

def get_prompt(text):
    instruction_template = "List the personally identifiable information in the given text below.\nText:########\n{text}\n########"
    msg = instruction_template.format(text=text)
    chat_template = "<|im_start|>user\n{msg}<|im_end|><|im_start|>assistant\n"
    prompt = chat_template.format(msg=msg)
    return prompt

# Batched generation pads the inputs; fall back to EOS if no pad token is set.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

generator = TextGenerationPipeline(
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=300,
    do_sample=True,
    temperature=0.3,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

texts = [
    "My passport number is 123456789.",
    "My name is John Smith.",
]
prompts = list(map(get_prompt, texts))
outputs = generator(prompts, return_full_text=False, batch_size=2)
```
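
Each element of `outputs` corresponds to one prompt and is a list of dicts, one per generated sequence, with the text under the `generated_text` key:
```
# Take the first (and only) generated sequence for each prompt.
predictions = [result[0]["generated_text"].strip() for result in outputs]
for text, prediction in zip(texts, predictions):
    print(f"{text!r} -> {prediction!r}")
```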

# Train Data

GPT-4-generated customer service conversations:
1. 100 unique banking topics, 8 examples each,
2. 100 new banking topics, 4 examples each,
3. 100 insurance topics, 4 examples each.

# Evaluation Results

## Average
```
precision    0.836223
recall       0.781132
f1           0.801837
```
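
For reference, entity-level precision, recall, and F1 are commonly computed from predicted and gold PII sets as sketched below; the exact matching criterion behind the reported numbers is not specified in this README:
```
# Illustrative set-based exact matching; not necessarily the scoring used above.
def precision_recall_f1(predicted: set, gold: set):
    tp = len(predicted & gold)  # true positives: items in both sets
    precision = tp / len(predicted) if predicted else 0.0
    recall = tp / len(gold) if gold else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1

print(precision_recall_f1({"John Smith", "123456789"}, {"123456789"}))
# (0.5, 1.0, 0.666...)
```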

## Per topic
![image/png](https://cdn-uploads.huggingface.co/production/uploads/63ea400bb1d9c4ef71ebb962/wUfwR-dmmyxF4pCYoebCX.png)

## On the TAB test split

Scores on the test split of TAB, the Text Anonymization Benchmark:
```
precision    0.506118
recall       0.350976
f1           0.391614
```