shainaraza committed on
Commit
50e4378
·
1 Parent(s): 6bd32ad

Upload 3 files

Browse files
Files changed (3) hide show
  1. README.md +31 -0
  2. my_toxicity_debiaser.py +68 -0
  3. requirements.txt +2 -0
README.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # My Toxicity Debiaser Pipeline
2
+
3
+ This custom pipeline debiases toxic text using a toxicity classifier and GPT-2.
4
+
5
+ ## Usage
6
+
7
+ To use this pipeline, you first need to download the required models and tokenizers, and then import the `MyToxicityDebiaserPipeline` class:
8
+
9
+ ```python
10
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, GPT2LMHeadModel, GPT2Tokenizer
11
+ from my_toxicity_debiaser import MyToxicityDebiaserPipeline
12
+
13
+ toxicity_model_name = "shainaraza/toxity_classify_debiaser"
14
+ gpt_model_name = "gpt2"
15
+
16
+ toxicity_tokenizer = AutoTokenizer.from_pretrained(toxicity_model_name)
17
+ toxicity_model = AutoModelForSequenceClassification.from_pretrained(toxicity_model_name)
18
+
19
+ gpt_tokenizer = GPT2Tokenizer.from_pretrained(gpt_model_name)
20
+ gpt_model = GPT2LMHeadModel.from_pretrained(gpt_model_name)
21
+
22
+ pipeline = MyToxicityDebiaserPipeline(
23
+ model=toxicity_model,
24
+ tokenizer=toxicity_tokenizer,
25
+ gpt_model=gpt_model,
26
+ gpt_tokenizer=gpt_tokenizer,
27
+ )
28
+
29
+ text = "Your example text here"
30
+ result = pipeline(text)
31
+ print(result)
32
+ ```
my_toxicity_debiaser.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
class MyToxicityDebiaserPipeline(object):
    """Classify a text as toxic or non-toxic and rewrite toxic text with GPT-2.

    A call runs ``preprocess`` -> ``_forward`` -> ``postprocess``. The
    classifier's label ``0`` is reported as non-toxic and label ``1`` as
    toxic; only toxic text is sent to the generative model.

    Args:
        model: Sequence-classification model exposing ``.to(device)`` and
            returning an object with a ``.logits`` tensor.
        tokenizer: Tokenizer paired with ``model``.
        gpt_model: Generative model exposing ``.to(device)`` and
            ``.generate(...)``.
        gpt_tokenizer: Tokenizer paired with ``gpt_model``.
        device: Device for both models and all inputs; defaults to CPU.
        **kwargs: Accepted for call compatibility and ignored.
    """

    def __init__(self, model, tokenizer, gpt_model, gpt_tokenizer, device=None, **kwargs):
        # Resolve the device before moving anything so that the models and the
        # tensors built later from ``self.device`` always share one device.
        self.device = device if device is not None else torch.device("cpu")
        self.model = model.to(self.device)
        self.tokenizer = tokenizer
        self.gpt_model = gpt_model.to(self.device)
        self.gpt_tokenizer = gpt_tokenizer

    def _forward(self, inputs):
        """Run the toxicity classifier on ``inputs["text"]``.

        Returns:
            dict with ``label`` (int, arg-max class), ``probabilities``
            (nested list of softmax scores) and ``text_input_ids`` (the
            classifier's input ids tensor).
        """
        text = inputs["text"]
        encoded = self.tokenizer(
            text, truncation=True, padding=True, return_tensors="pt"
        ).to(self.device)
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            logits = self.model(
                input_ids=encoded["input_ids"],
                attention_mask=encoded["attention_mask"],
            ).logits
        probs = torch.softmax(logits, dim=-1)
        label = torch.argmax(probs, dim=-1).item()
        return {
            "label": label,
            "probabilities": probs.tolist(),
            "text_input_ids": encoded["input_ids"],
        }

    def _sanitize_parameters(self, **kwargs):
        """Return the three parameter dicts; only the first carries ``kwargs``."""
        return kwargs, {}, {}

    def preprocess(self, inputs):
        """Wrap the raw input text in the dict shape ``_forward`` expects."""
        return {"text": inputs}

    def postprocess(self, outputs):
        """Turn classifier outputs into the final message string.

        Returns:
            ``"This comment is non-toxic."`` for label ``0``; for label ``1``
            a message containing the original text and the generated rewrite.

        Raises:
            ValueError: If the label is neither ``0`` nor ``1``.
        """
        label = outputs["label"]
        if label == 0:
            # Nothing to rewrite; return before touching the generative model.
            return "This comment is non-toxic."
        if label != 1:
            raise ValueError(f"Unexpected toxicity label: {label!r} (expected 0 or 1)")

        prompt = "This comment is toxic but has been debiased as follows:"
        # Drop the tokenizer's special tokens so they do not leak into the
        # GPT prompt or into the reported original text.
        text = self.tokenizer.decode(outputs["text_input_ids"][0], skip_special_tokens=True)

        debias_prompt = f"Remove the offensive words and biased tone and write the same sentence nicely: {text}"
        encoded_debias_prompt = self.gpt_tokenizer(debias_prompt, return_tensors="pt").to(self.device)
        prompt_length = encoded_debias_prompt["input_ids"].shape[-1]

        # GPT-2 tokenizers define no pad token; fall back to EOS for padding.
        pad_token_id = self.gpt_tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = self.gpt_tokenizer.eos_token_id

        with torch.no_grad():
            generated = self.gpt_model.generate(
                input_ids=encoded_debias_prompt["input_ids"],
                attention_mask=encoded_debias_prompt["attention_mask"],
                do_sample=True,
                # Budget only the continuation; a total-length cap would leave
                # no room to generate when the prompt itself is long.
                max_new_tokens=100,
                top_p=0.95,
                temperature=0.7,
                pad_token_id=pad_token_id,
                eos_token_id=self.gpt_tokenizer.eos_token_id,
            )
        # Keep only the newly generated tokens, not the echoed instruction.
        generated_text = self.gpt_tokenizer.decode(
            generated[0][prompt_length:], skip_special_tokens=True
        ).strip()
        return f"{prompt}\nOriginal text: {text}\nDebiased text: {generated_text}"

    def __call__(self, inputs, *args, **kwargs):
        """Run the pipeline on one text, or on each text of a list/tuple.

        Returns:
            The message string for a single text, or a list of message
            strings when ``inputs`` is a list or tuple.
        """
        if isinstance(inputs, (list, tuple)):
            return [self(item, *args, **kwargs) for item in inputs]
        # Extra positional arguments are accepted but unused:
        # ``_sanitize_parameters`` only takes keyword arguments.
        _preprocess_params, forward_params, _postprocess_params = self._sanitize_parameters(**kwargs)
        model_inputs = self.preprocess(inputs)
        model_outputs = self._forward(model_inputs, **forward_params)
        return self.postprocess(model_outputs)
57
+
58
# Demo entry point. The pipeline instance is built only when this file is run
# as a script, so that importing MyToxicityDebiaserPipeline from this module
# neither loads models nor fails on names that are not defined here.
if __name__ == "__main__":
    from transformers import (
        AutoModelForSequenceClassification,
        AutoTokenizer,
        GPT2LMHeadModel,
        GPT2Tokenizer,
    )

    toxicity_model_name = "shainaraza/toxity_classify_debiaser"
    gpt_model_name = "gpt2"

    toxicity_tokenizer = AutoTokenizer.from_pretrained(toxicity_model_name)
    toxicity_model = AutoModelForSequenceClassification.from_pretrained(toxicity_model_name)

    gpt_tokenizer = GPT2Tokenizer.from_pretrained(gpt_model_name)
    gpt_model = GPT2LMHeadModel.from_pretrained(gpt_model_name)

    # Create an instance of the custom pipeline
    custom_pipeline = MyToxicityDebiaserPipeline(
        model=toxicity_model,
        tokenizer=toxicity_tokenizer,
        gpt_model=gpt_model,
        gpt_tokenizer=gpt_tokenizer,
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        binary_output=True,
    )
67
+
68
+
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ transformers
2
+ torch