hsuvaskakoty committed on
Commit
9a3ff71
1 Parent(s): 7ba3a06

Upload model_predict.py

Browse files
Files changed (1) hide show
  1. model_predict.py +58 -58
model_predict.py CHANGED
@@ -1,39 +1,5 @@
1
  # Use a text-classification pipeline to score the input text
2
- # from transformers import pipeline, AutoTokenizer
3
- # import torch
4
-
5
- # label_mapping = {
6
- # 'delete': [0, 'LABEL_0'],
7
- # 'keep': [1, 'LABEL_1'],
8
- # 'merge': [2, 'LABEL_2'],
9
- # 'no consensus': [3, 'LABEL_3'],
10
- # 'speedy keep': [4, 'LABEL_4'],
11
- # 'speedy delete': [5, 'LABEL_5'],
12
- # 'redirect': [6, 'LABEL_6'],
13
- # 'withdrawn': [7, 'LABEL_7']
14
- # }
15
-
16
- # def predict_text(text, model_name):
17
- # tokenizer = AutoTokenizer.from_pretrained(model_name)
18
- # model = pipeline("text-classification", model=model_name, return_all_scores=True)
19
-
20
- # # Tokenize and truncate the text
21
- # tokens = tokenizer(text, truncation=True, max_length=512)
22
- # truncated_text = tokenizer.decode(tokens['input_ids'], skip_special_tokens=True)
23
-
24
- # results = model(truncated_text)
25
- # final_scores = {key: 0.0 for key in label_mapping}
26
-
27
- # for result in results[0]:
28
- # for key, value in label_mapping.items():
29
- # if result['label'] == value[1]:
30
- # final_scores[key] = result['score']
31
- # break
32
-
33
- # return final_scores
34
-
35
-
36
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
37
  import torch
38
 
39
  label_mapping = {
@@ -49,35 +15,69 @@ label_mapping = {
49
 
50
  def predict_text(text, model_name):
51
  tokenizer = AutoTokenizer.from_pretrained(model_name)
52
- model = AutoModelForSequenceClassification.from_pretrained(model_name, output_attentions=True)
53
 
54
- inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
55
- outputs = model(**inputs)
56
- predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
57
 
 
58
  final_scores = {key: 0.0 for key in label_mapping}
59
- for i, score in enumerate(predictions[0]):
 
60
  for key, value in label_mapping.items():
61
- if i == value[0]:
62
- final_scores[key] = score.item()
63
  break
64
 
65
- # Calculate average attention
66
- attentions = outputs.attentions
67
- avg_attentions = torch.mean(torch.stack(attentions), dim=1) # Average over all layers
68
- avg_attentions = avg_attentions.mean(dim=1)[0] # Average over heads
69
- token_importance = avg_attentions.mean(dim=0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
- # Decode tokens and highlight important ones
72
- tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
73
- highlighted_text = []
74
- for token, importance in zip(tokens, token_importance):
75
- if importance > token_importance.mean():
76
- highlighted_text.append(f"<b>{token}</b>") #
77
- else:
78
- highlighted_text.append(token)
79
 
80
- highlighted_text = " ".join(highlighted_text)
81
- highlighted_text = highlighted_text.replace("##", "")
82
 
83
- return final_scores, highlighted_text
 
1
  # Use a text-classification pipeline to score the input text
2
+ from transformers import pipeline, AutoTokenizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import torch
4
 
5
  label_mapping = {
 
15
 
16
def predict_text(text, model_name):
    """Score ``text`` for every label in the module-level ``label_mapping``.

    Args:
        text: Raw input text to classify.
        model_name: Model identifier or path passed to ``transformers.pipeline``.

    Returns:
        A dict with one entry per key of ``label_mapping``. Each value is the
        score the pipeline reported for that key's label; keys whose label the
        pipeline did not report stay at 0.0.
    """
    # return_all_scores=True makes the pipeline report a score for every
    # label instead of only the top one.
    classifier = pipeline("text-classification", model=model_name, return_all_scores=True)

    # Truncate inside the pipeline's own tokenizer call. Tokenizing, decoding
    # back to a string and letting the pipeline tokenize again is lossy and
    # does not guarantee the re-encoded text still fits in 512 tokens.
    results = classifier(text, truncation=True, max_length=512)

    # Each label_mapping value carries the pipeline label string at index 1.
    # Build the reverse lookup once; setdefault keeps the first key seen for a
    # label, matching the first-match behaviour of a linear scan.
    label_to_key = {}
    for key, value in label_mapping.items():
        label_to_key.setdefault(value[1], key)

    final_scores = {key: 0.0 for key in label_mapping}
    # results[0] holds the per-label score dicts for the single input text.
    for result in results[0]:
        key = label_to_key.get(result['label'])
        if key is not None:
            final_scores[key] = result['score']

    return final_scores
34
+
35
+
36
+ # from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
37
+ # import torch
38
+
39
+ # label_mapping = {
40
+ # 'delete': [0, 'LABEL_0'],
41
+ # 'keep': [1, 'LABEL_1'],
42
+ # 'merge': [2, 'LABEL_2'],
43
+ # 'no consensus': [3, 'LABEL_3'],
44
+ # 'speedy keep': [4, 'LABEL_4'],
45
+ # 'speedy delete': [5, 'LABEL_5'],
46
+ # 'redirect': [6, 'LABEL_6'],
47
+ # 'withdrawn': [7, 'LABEL_7']
48
+ # }
49
+
50
+ # def predict_text(text, model_name):
51
+ # tokenizer = AutoTokenizer.from_pretrained(model_name)
52
+ # model = AutoModelForSequenceClassification.from_pretrained(model_name, output_attentions=True)
53
+
54
+ # inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
55
+ # outputs = model(**inputs)
56
+ # predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
57
+
58
+ # final_scores = {key: 0.0 for key in label_mapping}
59
+ # for i, score in enumerate(predictions[0]):
60
+ # for key, value in label_mapping.items():
61
+ # if i == value[0]:
62
+ # final_scores[key] = score.item()
63
+ # break
64
+
65
+ # # Calculate average attention
66
+ # attentions = outputs.attentions
67
+ # avg_attentions = torch.mean(torch.stack(attentions), dim=1) # Average over all layers
68
+ # avg_attentions = avg_attentions.mean(dim=1)[0] # Average over heads
69
+ # token_importance = avg_attentions.mean(dim=0)
70
 
71
+ # # Decode tokens and highlight important ones
72
+ # tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
73
+ # highlighted_text = []
74
+ # for token, importance in zip(tokens, token_importance):
75
+ # if importance > token_importance.mean():
76
+ # highlighted_text.append(f"<b>{token}</b>") #
77
+ # else:
78
+ # highlighted_text.append(token)
79
 
80
+ # highlighted_text = " ".join(highlighted_text)
81
+ # highlighted_text = highlighted_text.replace("##", "")
82
 
83
+ # return final_scores, highlighted_text