patent committed on
Commit
1efaf3d
1 Parent(s): 2d41018
app.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import time
3
+ import requests
4
+
5
+ import os
6
+ import json
7
+ import glob
8
+ import re
9
+ import random
10
+ import difflib
11
+
12
+ from random import randrange
13
+
14
# When True, main() renders a "Show Summary" button that aggregates the
# statistics of every selected claim for every model.
enable_summary_button = False

# File-name prefixes of the evaluated models, ordered from the largest model
# to the smallest (see model_names for the display names).
prefix_lst = [
    "pgj_d_4096",
    "pgj_d_2048",
    "pgj_d_1024_v2",
    "pgj_d_1024_layer_14",
    "pgj_d_1024_layer_7",
    "pgj_d_1024_layer_2",
    "pgj_d_1024_layer_1",
]

# Display name of each model, keyed by its file-name prefix (same order as
# prefix_lst).
model_names = dict(zip(prefix_lst, [
    'PatentGPT-J-6B',
    'PatentGPT-J-1.6B',
    'PatentGPT-J-456M',
    'PatentGPT-J-279M',
    'PatentGPT-J-191M',
    'PatentGPT-J-128M',
    'PatentGPT-J-115M',
]))

# --- experiment 3 (active) ---------------------------------------------
folder = os.path.join('experiments', 'non_patent')
id_to_scroll = 1  # which of the above to scroll through
first_claim_only = True

# --- experiment 2 (disabled) -------------------------------------------
# folder = os.path.join('experiments', 'ipg20220104_500')
# #folder = "device_serve_results"
# id_to_scroll = 1 # which of the above to scroll through
# first_claim_only = False
#
# prefix_lst = ["my_gptj_6b_tpu_size_8", "pgj_d_4096", "pgj_d_2048", "pgj_d_1024_layer_14", "pgj_d_1024_layer_7", "pgj_d_1024_layer_2", "pgj_d_1024_layer_1"]
# #, "pgj_large", "pgj_medium", "pgj_small", ]
# # "pgj_d_1024_layer_14"

# --- experiment 1 (disabled) -------------------------------------------
# folder = os.path.join('experiments', 'ipg22_500')
# # (previous) folder = "eval_ipg22_500"
# id_to_scroll = 1 # which of the above to scroll through
# first_claim_only = True

# When True, tokens whose zero-based rank is >= 10 (outside the top 10) are
# counted as typed by hand instead of picked from the suggestion list.
ignore_outscope = True
56
+
57
def handle_char_return(text):
    """Normalise a token's text before it is counted or displayed.

    The literal string '(none)' is a placeholder rather than real text (the
    original comment associates it with the "unicorn" sample), so it is
    mapped to an empty string. Any other text is returned unchanged.

    The previous implementation used a comparison (``text == ''``) where an
    assignment was intended, so the placeholder was never replaced.
    """
    if text == '(none)':  # unicorn text
        return ''

    return text
62
+
63
def calc_details(base_fn):
    """Load one forward-pass result file and compute keystroke statistics.

    Args:
        base_fn: File name (relative to the module-level ``folder``) of a
            JSON file holding an 'output' list of per-token records.

    Returns:
        An 11-tuple ``(result, avg_pick, avg_prob, token_count, sum_pick,
        sum_prob, sum_outscope_count, sum_outscope_len, sum_hit_1,
        sum_top_10_len, full_text)``.

        If the file does not exist, ``result`` is None, both averages are
        -1, every counter is 0 and ``full_text`` is ''. The tuple has the
        same length as on success, so callers can always unpack 11 values
        (the previous 8-value return made every caller raise ValueError).
    """
    full_fn = os.path.join(folder, base_fn)
    if not os.path.exists(full_fn):
        return None, -1, -1, 0, 0, 0, 0, 0, 0, 0, ''

    with open(full_fn, encoding='utf-8') as f:
        result = json.load(f)
    print("Loaded: %s" % full_fn)

    sum_pick = 0             # keystrokes spent on tokens taken from the list
    sum_prob = 0             # summed probability of those tokens
    sum_outscope_count = 0   # tokens ranked outside the top 10
    sum_outscope_len = 0     # characters of those tokens (typed by hand)
    sum_hit_1 = 0            # tokens that were the top suggestion
    sum_top_10_len = 0       # characters of tokens inside the top 10
    text_parts = []
    token_count = 0

    # The last record of 'output' is deliberately skipped, as in the
    # original code (presumably it has no real "next token" -- confirm).
    for tk in result['output'][:-1]:
        token_text = handle_char_return(tk['actual_next_token_text'])
        next_top_seq = int(tk['actual_next_token_top_seq'])
        next_top_prob = float(tk['actual_next_token_top_prob'])

        text_parts.append(token_text)
        if next_top_seq == 0:
            sum_hit_1 += 1  # press "tab" for the top pick

        if ignore_outscope and next_top_seq >= 10:
            sum_outscope_count += 1
            sum_outscope_len += len(token_text)  # use length as keystrokes
        else:
            # Never charge more keystrokes than typing the token itself.
            sum_pick += min(next_top_seq + 1, len(token_text))
            sum_prob += next_top_prob
            sum_top_10_len += len(token_text)

        # NOTE(review): counts every scored token, including out-of-scope
        # ones -- confirm against the original indentation.
        token_count += 1

    full_text = ''.join(text_parts)

    if token_count == 0:
        # Guard both configurations against division by zero.
        avg_pick = 0
        avg_prob = 0
    else:
        avg_pick = float(sum_pick) / token_count
        avg_prob = float(sum_prob) / token_count

    return (result, avg_pick, avg_prob, token_count, sum_pick, sum_prob,
            sum_outscope_count, sum_outscope_len, sum_hit_1, sum_top_10_len,
            full_text)
115
+
116
def show_avg(base_fn, model_name, patent_claim_num, show_pick=False):
    """Render the per-token autocomplete result of one model for one claim.

    Writes a text summary (keystrokes saved), a colour legend and the claim
    text with one coloured span per token to the Streamlit page.

    Args:
        base_fn: Result file name, passed to ``calc_details``.
        model_name: Display name shown in the summary.
        patent_claim_num: Unused; kept so existing callers keep working.
        show_pick: When True, append each token's rank as ``[rank]``.

    Returns:
        ``[top-1 count, rank 2-10 count, out-of-top-10 count]`` over the
        rendered tokens, or None when the result file is missing.
    """
    import html  # local import: only needed to escape token text below

    details = calc_details(base_fn)
    # Check before unpacking so a missing file is handled regardless of how
    # many placeholder values calc_details returns for it.
    if details[0] is None:
        return None
    (result, _avg_pick, _avg_prob, _token_count, sum_pick, _sum_prob,
     _sum_outscope_count, sum_outscope_len, sum_hit_1, _sum_top_10_len,
     full_text) = details

    # (background colour, text colour, legend label) per rank bucket.
    colors = [
        ['00ff00', '000000', '1'],
        ['008800', 'ffffff', '2-10'],
        ['ff0000', 'ffffff', 'out of top 10'],
    ]
    # Initialised once, before the loop; resetting the counters per token
    # (as before) left only the last token counted.
    sum_all = {label: 0 for _bg, _fg, label in colors}

    spans = []
    for tk in result['output']:
        raw_text = tk['actual_next_token_text']
        # Stop at the end-of-claim marker or the '(none)' placeholder. The
        # raw text is tested so this does not depend on what
        # handle_char_return maps the placeholder to.
        if raw_text in ('<|end_of_claim|>', '(none)'):
            break
        token_text = handle_char_return(raw_text)

        rank = int(tk['actual_next_token_top_seq'])
        if rank == 0:
            bucket = colors[0]
        elif 1 <= rank < 10:
            bucket = colors[1]
        else:  # rank >= 10
            bucket = colors[2]
        bg_color, fg_color, tag = bucket
        sum_all[tag] += 1

        rank_suffix = '[%s]' % rank if show_pick else ''

        # Escape the token so characters such as '<' or '&' are displayed
        # literally instead of being parsed as HTML.
        spans.append(
            "<span style=background-color:#%s;color:#%s;border-radius:5px;>%s%s</span> "
            % (bg_color, fg_color, html.escape(token_text, quote=False), rank_suffix))
    rendered = ''.join(spans)

    color_msg = ''.join(
        "<span style=background-color:#%s;color:#%s;border-radius:5px;>&nbsp;%s&nbsp;</span> "
        % (bg, fg, label)
        for bg, fg, label in colors)

    # sum_pick as top 1~10
    keys_with_auto = sum_pick + sum_outscope_len
    keys_without_auto = len(full_text)
    if keys_without_auto:
        saved_ratio = float(keys_without_auto - keys_with_auto) / keys_without_auto * 100
    else:
        saved_ratio = 0.0  # no text at all: nothing to save, avoid dividing by zero

    s = ('model: %s\n'
         'Autocomplete Effectiveness: %.1f%% (keystrokes saved)\n'
         'Total keystrokes: %s (with autocomplete), %s (without autocomplete)\n'
         'Keystroke distribution: top 1~10: %s (top 1: %s), out of top 10: %s'
         % (model_name, saved_ratio, keys_with_auto, keys_without_auto,
            sum_pick, sum_hit_1, sum_outscope_len))
    st.text(s)

    st.markdown(color_msg, unsafe_allow_html=True)
    st.markdown(rendered, unsafe_allow_html=True)

    return [sum_all['1'], sum_all['2-10'], sum_all['out of top 10']]
189
+
190
def show_overall_summary(prefix_lst, select_lst):
    """Print aggregated keystroke statistics for every model.

    For each prefix, the statistics of all claims in ``select_lst`` are
    summed and written to the Streamlit page twice: as a readable summary
    and as a LaTeX table row.

    Args:
        prefix_lst: Model file-name prefixes; each must be a key of the
            module-level ``model_names``.
        select_lst: Claim identifiers used to build the result file names.
    """
    for prefix in prefix_lst:
        acc_token_count = 0
        acc_sum_pick = 0
        acc_sum_prob = 0
        acc_sum_outscope_len = 0
        acc_sum_hit_1 = 0
        acc_sum_top_10_len = 0
        acc_full_text_len = 0

        for num in select_lst:
            base_fn = '%s_%s_forward.json' % (prefix, num)
            details = calc_details(base_fn)
            # Skip missing files instead of failing while unpacking or
            # adding their placeholder values.
            if details[0] is None:
                continue
            (_result, _avg_pick, _avg_prob, token_count, sum_pick, sum_prob,
             _sum_outscope_count, sum_outscope_len, sum_hit_1, sum_top_10_len,
             full_text) = details

            acc_token_count += token_count
            acc_sum_pick += sum_pick
            acc_sum_prob += sum_prob
            acc_sum_outscope_len += sum_outscope_len
            acc_sum_hit_1 += sum_hit_1
            acc_sum_top_10_len += sum_top_10_len
            acc_full_text_len += len(full_text)

        if acc_token_count <= 0:
            continue

        # acc_sum_pick --> top 1~10
        keys_with_auto = acc_sum_pick + acc_sum_outscope_len
        keys_without_auto = acc_full_text_len
        if keys_without_auto:
            saved_ratio = float(keys_without_auto - keys_with_auto) / keys_without_auto * 100
        else:
            saved_ratio = 0.0  # no text accumulated: avoid dividing by zero

        keys_with_auto_s = '{:,}'.format(keys_with_auto)
        top_10_keys_s = '{:,}'.format(acc_sum_pick)
        outscope_s = '{:,}'.format(acc_sum_outscope_len)
        hit_1_s = '{:,}'.format(acc_sum_hit_1)
        keys_without_auto_s = '{:,}'.format(keys_without_auto)
        top_10_len_s = '{:,}'.format(acc_sum_top_10_len)

        st.text(('[ %s ]\n'
                 'Autocomplete Effectiveness: %.1f%% (ratio of saving keystroke)\n'
                 '(sum) keys_with_auto: %s, top_10_keys: %s, out_of_scope: %s, sum_hit_1: %s\n'
                 'keys_without_auto: %s, top_10_len: %s, prob: %.2f') % (
            model_names[prefix], saved_ratio,
            keys_with_auto_s,
            top_10_keys_s,
            outscope_s,
            hit_1_s,
            keys_without_auto_s,
            top_10_len_s,
            acc_sum_prob,
        ))

        # The same figures as one LaTeX table row.
        st.text('%s & %.1f\\%% & %s & %s & %s & %s & %s \\\\' % (
            model_names[prefix], saved_ratio, keys_with_auto_s, top_10_keys_s,
            outscope_s, hit_1_s, keys_without_auto_s))
243
+
244
def main():
    """Streamlit entry point of the autocomplete-effectiveness demo.

    Scans the module-level ``folder`` for result files, keeps the claims for
    which every model in ``prefix_lst`` has a ``*_forward.json`` file, lets
    the user choose one of them and renders each model's result for it.
    """
    st.set_page_config(  # Alternate names: setup_page, page, layout
        layout="wide",  # Can be "centered" or "wide". In the future also "dashboard", etc.
        initial_sidebar_state="auto",  # Can be "auto", "expanded", "collapsed"
        page_title="Patent-GPT-J demo",  # String or None. Strings get appended with "• Streamlit".
        page_icon=None,  # String, anything supported by st.image, or None.
    )
    st.subheader("PatentGPT-J Demo 1 (Autocomplete Effectiveness)")
    st.text("Data coverage: ipg22 (in 2022)")

    # Per prefix: first try a purely numeric "<digits>_<digits>" id, then a
    # more general "<word>_<digits>" id. Raw strings avoid invalid escape
    # sequences and re.escape keeps the prefix literal.
    patterns = [
        (re.compile(r'%s_(\d+_\d+)_' % re.escape(prefix)),
         re.compile(r'%s_(\w+_\d+)_' % re.escape(prefix)))
        for prefix in prefix_lst
    ]

    num_set = set()
    for fn in glob.glob(os.path.join(folder, '*')):
        for numeric_re, general_re in patterns:
            match = numeric_re.search(fn) or general_re.search(fn)
            if match is None:
                continue

            claim_id = match.group(1)
            # With first_claim_only, keep only ids ending in "_1".
            if not first_claim_only or claim_id.endswith('_1'):
                num_set.add(claim_id)

    # Keep only the claims that every model has a result file for.
    select_lst = sorted(
        num for num in num_set
        if all(os.path.exists(os.path.join(folder, '%s_%s_forward.json' % (prefix, num)))
               for prefix in prefix_lst)
    )

    if not select_lst:
        st.text('select_lst is empty')
        return

    if enable_summary_button:
        if st.button('Show Summary'):
            st.text('len(select_lst) = %s' % len(select_lst))
            show_overall_summary(prefix_lst, select_lst)

    # Human-readable labels, e.g. "unicorn2_1" -> "unicorn2 (claim 1)". Only
    # the last underscore is split, so ids containing further underscores
    # still get a well-formed label; the dict maps a label back to its id.
    show_patent_lst = []
    label_to_num = {}
    for num in select_lst:
        head, _sep, claim = num.rpartition('_')
        label = '%s (claim %s)' % (head, claim)
        show_patent_lst.append(label)
        label_to_num[label] = num

    # Streamlit re-runs the script on every interaction, so pressing the
    # button is enough to draw a new random default below.
    # NOTE(review): the default index is re-drawn on every run, so a manual
    # selection may not persist across re-runs -- confirm, and consider
    # keeping the index in st.session_state.
    st.button('Random pick')
    pick = random.randrange(len(select_lst))

    selected = st.selectbox("Choose a patent claim", show_patent_lst, index=pick)
    num = label_to_num[selected]

    for prefix in prefix_lst:
        base_fn = '%s_%s_forward.json' % (prefix, num)
        show_avg(base_fn, model_names[prefix], num)


if __name__ == "__main__":
    main()
experiments/non_patent/pgj_d_1024_layer_14_unicorn2_1_forward.json ADDED
The diff for this file is too large to render. See raw diff
 
experiments/non_patent/pgj_d_1024_layer_1_unicorn2_1_forward.json ADDED
The diff for this file is too large to render. See raw diff
 
experiments/non_patent/pgj_d_1024_layer_2_unicorn2_1_forward.json ADDED
The diff for this file is too large to render. See raw diff
 
experiments/non_patent/pgj_d_1024_layer_7_unicorn2_1_forward.json ADDED
The diff for this file is too large to render. See raw diff
 
experiments/non_patent/pgj_d_1024_v2_unicorn2_1_forward.json ADDED
The diff for this file is too large to render. See raw diff
 
experiments/non_patent/pgj_d_2048_unicorn2_1_forward.json ADDED
The diff for this file is too large to render. See raw diff
 
experiments/non_patent/pgj_d_4096_unicorn2_1_forward.json ADDED
The diff for this file is too large to render. See raw diff