dhuynh95 commited on
Commit
09caaea
1 Parent(s): 8bca454

Upload 3 files

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. app.py +223 -0
  3. requirements.txt +5 -0
  4. samples.csv +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ samples.csv filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import os
4
+ from huggingface_hub import InferenceClient, login
5
+ from transformers import AutoTokenizer
6
+ import evaluate
7
+
8
# --- One-time start-up: metric, inference client, tokenizer, sample data ---

# BLEU metric used to score how closely StarCoder's completion matches the
# original snippet (high BLEU => the snippet was likely memorized).
bleu = evaluate.load("bleu")

# API token comes from the environment (Space secret); may be None.
HF_TOKEN = os.environ.get("HF_TOKEN", None)
# BUG FIX(review): the original did `print(HF_TOKEN)` here, writing the
# secret token to the app logs. Never log credentials — removed.
client = InferenceClient(model="bigcode/starcoder", token=HF_TOKEN)

login(token=HF_TOKEN)
checkpoint = "bigcode/starcoder"
# `use_auth_token=True` is deprecated in transformers 4.34; `token` is the
# supported replacement, and we pass the same credential used by login().
tokenizer = AutoTokenizer.from_pretrained(checkpoint, token=HF_TOKEN)

# Pre-computed samples shipped with the Space; keep only rows that actually
# have a 50-token prediction.
df = pd.read_csv("samples.csv")
sample_df = df.loc[~df.prediction_50.isna()]

# Page title rendered via gr.Markdown. FIX: the original left the <h1>
# unclosed; close the tag so the rest of the page isn't nested inside it.
description = "<h1 style='text-align: center; color: #333333; font-size: 40px;'>StarCoder Memorization Verifier</h1>"
22
# Snippets whose StarCoder completion reproduces the original almost verbatim
# (high BLEU) — i.e. likely memorized training data. Keys are the labels shown
# in the Gradio examples widget; values are the raw source snippets.
# NOTE(review): inner indentation of the snippets was reconstructed from the
# scraped diff — confirm against the original file.
high_bleu_examples = {
    "Example 1": """from django.contrib import admin
from .models import SearchResult

# Register your models here.
class SearchResultAdmin(admin.ModelAdmin):
    fields = ["query", "heading", "url", "text"]

admin.site.register(SearchResult, SearchResultAdmin)""",

    "Example 2": """class Solution:
    def finalPrices(self, prices: List[int]) -> List[int]:
        res = []
        for i in range(len(prices)):
            for j in range(i+1,len(prices)):
                if prices[j]<=prices[i]:
                    res.append(prices[i]-prices[j])
                    break
                if j==len(prices)-1:
                    res.append(prices[i])
        res.append(prices[-1])
        return res""",
    "Example 3": """from data_collection.management.commands import BaseXpressDemocracyClubCsvImporter

class Command(BaseXpressDemocracyClubCsvImporter):
    council_id = 'E06000027'
    addresses_name = 'parl.2017-06-08/Version 1/Torbay Democracy_Club__08June2017.tsv'
    stations_name = 'parl.2017-06-08/Version 1/Torbay Democracy_Club__08June2017.tsv'
    elections = ['parl.2017-06-08']
    csv_delimiter = '\t'
    """
}
54
+
55
# Snippets whose completion diverges from the original (low BLEU) — i.e. not
# detectably memorized. Same key/value layout as high_bleu_examples. Inner
# triple quotes are escaped (\""") because each snippet is itself a
# triple-quoted literal.
# NOTE(review): inner indentation of the snippets was reconstructed from the
# scraped diff — confirm against the original file.
low_bleu_examples = {
    "Example 1": """from zeit.cms.i18n import MessageFactory as _
import zope.interface
import zope.schema


class IGlobalSettings(zope.interface.Interface):
    \"""Global CMS settings.\"""

    default_year = zope.schema.Int(
        title=_("Default year"),
        min=1900,
        max=2100)

    default_volume = zope.schema.Int(
        title=_("Default volume"),
        min=1,
        max=54)

    def get_working_directory(template):
        \"""Return the collection which is the main working directory.

        template:
            Template which will be filled with year and volume. In
            ``template`` the placeholders $year and $volume will be replaced.
            Example: 'online/$year/$volume/foo'

        If the respective collection does not exist, it will be created before
        returning it.

        \"""
    """,
    "Example 2": """# -*- coding: utf-8 -*-

\"""Context managers implemented for (mostly) internal use\"""

import contextlib
import functools
from io import UnsupportedOperation
import os
import sys


__all__ = ["RedirectStdout", "RedirectStderr"]


@contextlib.contextmanager
def _stdchannel_redirected(stdchannel, dest_filename, mode="w"):
    \"""
    A context manager to temporarily redirect stdout or stderr

    Originally by Marc Abramowitz, 2013
    (http://marc-abramowitz.com/archives/2013/07/19/python-context-manager-for-redirected-stdout-and-stderr/)
    \"""

    oldstdchannel = None
    dest_file = None
    try:
        if stdchannel is None:
            yield iter([None])
        else:
            oldstdchannel = os.dup(stdchannel.fileno())
            dest_file = open(dest_filename, mode)
            os.dup2(dest_file.fileno(), stdchannel.fileno())
            yield
    except (UnsupportedOperation, AttributeError):
        yield iter([None])
    finally:
        if oldstdchannel is not None:
            os.dup2(oldstdchannel, stdchannel.fileno())
        if dest_file is not None:
            dest_file.close()


RedirectStdout = functools.partial(_stdchannel_redirected, sys.stdout)
RedirectStderr = functools.partial(_stdchannel_redirected, sys.stderr)
RedirectNoOp = functools.partial(_stdchannel_redirected, None, "")
    """,
    "Example 3": """\"""Utils for criterion.\"""
import torch
import torch.nn.functional as F


def normalize(x, axis=-1):
    \"""Performs L2-Norm.\"""
    num = x
    denom = torch.norm(x, 2, axis, keepdim=True).expand_as(x) + 1e-12
    return num / denom


# Source : https://github.com/earhian/Humpback-Whale-Identification-1st-/blob/master/models/triplet_loss.py
def euclidean_dist(x, y):
    \"""Computes Euclidean distance.\"""
    m, n = x.size(0), y.size(0)
    xx = torch.pow(x, 2).sum(1, keepdim=True).expand(m, n)
    yy = torch.pow(x, 2).sum(1, keepdim=True).expand(m, m).t()
    dist = xx + yy - 2 * torch.matmul(x, y.t())

    dist = dist.clamp(min=1e-12).sqrt()

    return dist


def cosine_dist(x, y):
    \"""Computes Cosine Distance.\"""
    x = F.normalize(x, dim=1)
    y = F.normalize(y, dim=1)
    dist = 2 - 2 * torch.mm(x, y.t())
    return dist
    """
}
166
+
167
def complete(sample, k):
    """Stream a greedy StarCoder completion of the first `k` tokens of `sample`.

    Generator used as a Gradio event handler: yields ``(text_so_far,
    label_update)`` pairs so the UI live-updates the completion box and the
    BLEU similarity label after every streamed token.

    Parameters
    ----------
    sample : str
        Original source snippet being tested for memorization.
    k : int
        Number of leading tokens of `sample` used as the generation prompt.
    """

    def _score(text):
        # BLEU of the generated text against the full original sample.
        # NOTE(review): predictions/references orientation kept exactly as in
        # the original code; BLEU is not symmetric — confirm it is intended.
        return {"BLEU": bleu.compute(predictions=[sample],
                                     references=[text])["bleu"]}

    prefix_tokens = tokenizer(sample)["input_ids"][:k]
    prefix = tokenizer.decode(prefix_tokens)

    output = prefix
    for token in client.text_generation(prefix, do_sample=False,
                                        max_new_tokens=512, stream=True):
        # Stop cleanly at end-of-text; the sentinel is not part of the output.
        if token == "<|endoftext|>":
            break
        output += token
        yield output, gr.Label.update(value=_score(output))
    # BUG FIX: the original ended with `return output, gr.Label.update(...)`.
    # Inside a generator a return VALUE is discarded by Gradio (only yielded
    # values reach the UI), so the final state was never displayed — and if
    # the first streamed token was the sentinel, nothing was shown at all.
    # Yield the final state instead.
    yield output, gr.Label.update(value=_score(output))
184
+
185
def high_bleu_mirror(x):
    """Return the high-memorization snippet registered under example key `x`."""
    return high_bleu_examples[x]
188
+
189
def low_bleu_mirror(x):
    """Return the low-memorization snippet registered under example key `x`."""
    return low_bleu_examples[x]
192
+
193
# ---------------------------------------------------------------------------
# Gradio UI: snippet input + example pickers on the left, streamed completion
# and BLEU score on the right.
# NOTE: `high_bleu_examples` / `low_bleu_examples` are REBOUND below from the
# dicts to gr.Examples widgets; the dict keys are captured via list(...) and
# the mirror functions are evaluated (cache_examples=True) before rebinding,
# so the order of these statements matters.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown(description)
        with gr.Row():
            with gr.Column():
                # Snippet whose memorization we want to verify.
                instruction = gr.Textbox(
                    placeholder="Enter your code here",
                    lines=5,
                    label="Original",
                )

                with gr.Accordion("Advanced parameters", open=False):
                    # k = number of prompt tokens taken from the snippet.
                    k = gr.Slider(minimum=1, maximum=250, value=50)
                submit = gr.Button("Check", variant="primary")
                high_bleu_examples = gr.Examples(list(high_bleu_examples.keys()), label="High memorization samples",
                                                 inputs=instruction, outputs=instruction,
                                                 fn=high_bleu_mirror, cache_examples=True)
                low_bleu_examples = gr.Examples(list(low_bleu_examples.keys()), label="Low memorization samples",
                                                inputs=instruction, outputs=instruction,
                                                fn=low_bleu_mirror, cache_examples=True)
            with gr.Column():
                # Streamed model completion (read-only) and similarity label.
                output = gr.Textbox(lines=5,
                                    label="Completion", interactive=False)
                label = gr.Label(value={"BLEU": 0},
                                 label="Similarity score (BLEU)")
    # `complete` is a generator, so outputs stream as tokens arrive.
    submit.click(
        complete,
        inputs=[instruction, k],
        outputs=[output, label],
    )
# queue() is required for streamed (generator) handlers.
demo.queue(concurrency_count=16).launch(debug=True)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ evaluate==0.4.0
2
+ gradio==3.47.1
3
+ huggingface_hub==0.14.1
4
+ pandas==2.0.1
5
+ transformers==4.34.0
samples.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e350eaf65087c4eee5db1305170e86ea2417bff2459a097d7c3169cf31251f9
3
+ size 46909037