AhmedSSabir committed on
Commit 1a49712
1 Parent(s): b8bfd60

Update app.py

Files changed (1)
  1. app.py +77 -36
app.py CHANGED
@@ -12,6 +12,18 @@ import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import T5Tokenizer, AutoModelForCausalLM
import torch
+ from doctest import OutputChecker
+ import sys
+ import torch
+ import re
+ import os
+ import gradio as gr
+ import requests
+ import torch
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel
+ from torch.nn.functional import softmax
+ import numpy as np
+

from transformers import BertJapaneseTokenizer, BertModel
import torch
@@ -88,6 +100,31 @@ def softmax(x):
tokenizer = T5Tokenizer.from_pretrained("rinna/japanese-gpt-1b")
model = AutoModelForCausalLM.from_pretrained("rinna/japanese-gpt-1b")

+ def sentence_prob_mean(text):
+     # Tokenize the input text and add special tokens
+     input_ids = tokenizer.encode(text, return_tensors='pt')
+
+     # Obtain model outputs
+     with torch.no_grad():
+         outputs = model(input_ids, labels=input_ids)
+         logits = outputs.logits  # logits are the model outputs before applying softmax
+
+     # Shift logits and labels so that tokens are aligned:
+     shift_logits = logits[..., :-1, :].contiguous()
+     shift_labels = input_ids[..., 1:].contiguous()
+
+     # Calculate the softmax probabilities
+     probs = softmax(shift_logits, dim=-1)
+
+     # Gather the probabilities of the actual token IDs
+     gathered_probs = torch.gather(probs, 2, shift_labels.unsqueeze(-1)).squeeze(-1)
+
+     # Compute the mean probability across the tokens
+     mean_prob = torch.mean(gathered_probs).item()
+
+     return mean_prob
+
+
#model = gr.Interface.load('huggingface/distilgpt2', output_hidden_states = True, output_attentions = True)

#model.eval()
@@ -98,45 +135,45 @@ model = AutoModelForCausalLM.from_pretrained("rinna/japanese-gpt-1b")
#tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')


- def cloze_prob(text):
+ # def cloze_prob(text):

-     whole_text_encoding = tokenizer.encode(text)
-     # Parse out the stem of the whole sentence (i.e., the part leading up to but not including the critical word)
-     text_list = text.split()
-     stem = ' '.join(text_list[:-1])
-     stem_encoding = tokenizer.encode(stem)
-     # cw_encoding is just the difference between whole_text_encoding and stem_encoding
-     # note: this might not correspond exactly to the word itself
-     cw_encoding = whole_text_encoding[len(stem_encoding):]
-     # Run the entire sentence through the model. Then go "back in time" to look at what the model predicted for each token, starting at the stem.
-     # Put the whole text encoding into a tensor, and get the model's comprehensive output
-     tokens_tensor = torch.tensor([whole_text_encoding])
+ # whole_text_encoding = tokenizer.encode(text)
+ # # Parse out the stem of the whole sentence (i.e., the part leading up to but not including the critical word)
+ # text_list = text.split()
+ # stem = ' '.join(text_list[:-1])
+ # stem_encoding = tokenizer.encode(stem)
+ # # cw_encoding is just the difference between whole_text_encoding and stem_encoding
+ # # note: this might not correspond exactly to the word itself
+ # cw_encoding = whole_text_encoding[len(stem_encoding):]
+ # # Run the entire sentence through the model. Then go "back in time" to look at what the model predicted for each token, starting at the stem.
+ # # Put the whole text encoding into a tensor, and get the model's comprehensive output
+ # tokens_tensor = torch.tensor([whole_text_encoding])

-     with torch.no_grad():
-         outputs = model(tokens_tensor)
-         predictions = outputs[0]
-
-     logprobs = []
-     # start at the stem and get downstream probabilities incrementally from the model(see above)
-     start = -1-len(cw_encoding)
-     for j in range(start,-1,1):
-         raw_output = []
-         for i in predictions[-1][j]:
-             raw_output.append(i.item())
+ # with torch.no_grad():
+ # outputs = model(tokens_tensor)
+ # predictions = outputs[0]
+
+ # logprobs = []
+ # # start at the stem and get downstream probabilities incrementally from the model(see above)
+ # start = -1-len(cw_encoding)
+ # for j in range(start,-1,1):
+ # raw_output = []
+ # for i in predictions[-1][j]:
+ # raw_output.append(i.item())

-         logprobs.append(np.log(softmax(raw_output)))
+ # logprobs.append(np.log(softmax(raw_output)))

-     # if the critical word is three tokens long, the raw_probabilities should look something like this:
-     # [ [0.412, 0.001, ... ] ,[0.213, 0.004, ...], [0.002,0.001, 0.93 ...]]
-     # Then for the i'th token we want to find its associated probability
-     # this is just: raw_probabilities[i][token_index]
-     conditional_probs = []
-     for cw,prob in zip(cw_encoding,logprobs):
-         conditional_probs.append(prob[cw])
-     # now that you have all the relevant probabilities, return their product.
-     # This is the probability of the critical word given the context before it.
+ # # if the critical word is three tokens long, the raw_probabilities should look something like this:
+ # # [ [0.412, 0.001, ... ] ,[0.213, 0.004, ...], [0.002,0.001, 0.93 ...]]
+ # # Then for the i'th token we want to find its associated probability
+ # # this is just: raw_probabilities[i][token_index]
+ # conditional_probs = []
+ # for cw,prob in zip(cw_encoding,logprobs):
+ # conditional_probs.append(prob[cw])
+ # # now that you have all the relevant probabilities, return their product.
+ # # This is the probability of the critical word given the context before it.

-     return np.exp(np.sum(conditional_probs))
+ # return np.exp(np.sum(conditional_probs))



@@ -172,8 +209,12 @@ def Visual_re_ranker(sentence_man, sentence_woman, context_label, context_prob):
    sim_w = get_sim(sim_w)


-     LM_man = cloze_prob(sentence_man)
-     LM_woman = cloze_prob(sentence_woman)
+     LM_man = sentence_prob_mean(sentence_man)
+     LM_woman = sentence_prob_mean(sentence_woman)
+     #LM_man = cloze_prob(sentence_man)
+     #LM_woman = cloze_prob(sentence_woman)
+
+
    score_man = pow(float(LM_man),pow((1-float(sim_m))/(1+ float(sim_m)),1-float(context_prob)))
    score_woman = pow(float(LM_woman),pow((1-float(sim_w))/(1+ float(sim_w)),1-float(context_prob)))

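
Note on the new helper: sentence_prob_mean scores a sentence by the mean probability the causal LM assigns to each actual next token. A minimal, self-contained sketch of the same shift-and-gather pattern on toy tensors (the shapes and values below are invented for illustration and are not part of the commit):

import torch
from torch.nn.functional import softmax

# Toy stand-ins: one sequence of 4 token ids over a 10-token vocabulary.
input_ids = torch.tensor([[5, 2, 7, 3]])
logits = torch.randn(1, 4, 10)  # pretend model output: one logit row per position

# Position i predicts token i+1, so drop the last logit row and the first label,
# mirroring shift_logits / shift_labels in sentence_prob_mean.
shift_logits = logits[..., :-1, :].contiguous()   # (1, 3, 10)
shift_labels = input_ids[..., 1:].contiguous()    # (1, 3)

probs = softmax(shift_logits, dim=-1)
# Pick out the probability assigned to the token that actually occurs at each position.
token_probs = torch.gather(probs, 2, shift_labels.unsqueeze(-1)).squeeze(-1)
mean_prob = token_probs.mean().item()
print(mean_prob)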
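
The commented-out cloze_prob took a narrower view: it scored only the final ("critical") word, multiplying the model's conditional probabilities of that word's tokens given the preceding stem. A compact sketch of that idea, kept here only for comparison (critical_word_prob is a hypothetical name, not part of the commit; it assumes the same module-level tokenizer and model as app.py):

import torch
from torch.nn.functional import softmax

def critical_word_prob(text):
    # Probability of the last whitespace-separated word given everything before it.
    whole = tokenizer.encode(text)
    stem = tokenizer.encode(' '.join(text.split()[:-1]))
    cw = whole[len(stem):]  # token ids belonging to the critical word
    with torch.no_grad():
        logits = model(torch.tensor([whole])).logits[0]
    prob = 1.0
    # logits[pos] predicts the token at pos + 1, so the critical word's tokens
    # are predicted from positions len(stem) - 1 .. len(whole) - 2.
    for pos, tok in zip(range(len(stem) - 1, len(whole) - 1), cw):
        prob *= softmax(logits[pos], dim=-1)[tok].item()
    return prob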
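
Finally, the unchanged scoring lines raise the language-model probability to an exponent built from the visual similarity sim and the context confidence context_prob: score = LM ** (((1 - sim) / (1 + sim)) ** (1 - context_prob)). A small sketch of that arithmetic with purely hypothetical numbers:

def visual_rerank_score(lm_prob, sim, context_prob):
    # Same expression as score_man / score_woman in Visual_re_ranker.
    exponent = ((1 - sim) / (1 + sim)) ** (1 - context_prob)
    return lm_prob ** exponent

# Illustrative values only: a higher visual similarity pushes the score toward 1,
# while a low similarity leaves it close to the raw LM probability.
print(visual_rerank_score(lm_prob=0.02, sim=0.8, context_prob=0.6))
print(visual_rerank_score(lm_prob=0.02, sim=0.1, context_prob=0.6))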