NamCyan committed on
Commit
ca8e721
·
1 Parent(s): 7c1ff62

Delete test.ipynb

Browse files
Files changed (1) hide show
  1. test.ipynb +0 -186
test.ipynb DELETED
@@ -1,186 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 2,
6
- "metadata": {},
7
- "outputs": [],
8
- "source": [
9
- "import os \n",
10
- "from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding"
11
- ]
12
- },
13
- {
14
- "cell_type": "code",
15
- "execution_count": 3,
16
- "metadata": {},
17
- "outputs": [],
18
- "source": [
19
- "model_name_or_path = \"/datadrive/namlh31/codebridge/Codebert-docstring-inconsistency\"\n",
20
- "config = AutoConfig.from_pretrained(\n",
21
- " model_name_or_path,\n",
22
- ")\n",
23
- "tokenizer = AutoTokenizer.from_pretrained(\n",
24
- " model_name_or_path\n",
25
- ")\n",
26
- "model = AutoModelForSequenceClassification.from_pretrained(\n",
27
- "model_name_or_path,\n",
28
- "config=config,\n",
29
- ")"
30
- ]
31
- },
32
- {
33
- "cell_type": "code",
34
- "execution_count": 5,
35
- "metadata": {},
36
- "outputs": [],
37
- "source": [
38
- "examples = {'code': \"function(str){\\r\\n var ret = new Array(str.length), len = str.length;\\r\\n while(len--) ret[len] = str.charCodeAt(len);\\r\\n return Uint8Array.from(ret);\\r\\n}\",\n",
39
- " 'docstring': 'we do not need Buffer pollyfill for now'}"
40
- ]
41
- },
42
- {
43
- "cell_type": "code",
44
- "execution_count": 17,
45
- "metadata": {},
46
- "outputs": [],
47
- "source": [
48
- "texts = (\n",
49
- " (examples['docstring'], examples['code'])\n",
50
- " )\n",
51
- "result = tokenizer(*texts, padding=\"max_length\", max_length=512, truncation=True, return_tensors= 'pt')"
52
- ]
53
- },
54
- {
55
- "cell_type": "code",
56
- "execution_count": 10,
57
- "metadata": {},
58
- "outputs": [
59
- {
60
- "name": "stdout",
61
- "output_type": "stream",
62
- "text": [
63
- "512\n"
64
- ]
65
- }
66
- ],
67
- "source": [
68
- "tokenizer.decode(result['input_ids'])\n",
69
- "print(len(result['input_ids']))"
70
- ]
71
- },
72
- {
73
- "cell_type": "code",
74
- "execution_count": 22,
75
- "metadata": {},
76
- "outputs": [],
77
- "source": [
78
- "input = \"\"\"we do not need Buffer pollyfill for now</s></s>function(str){\\r\\n var ret = new Array(str.length), len = str.length;\\r\\n while(len--) ret[len] = str.charCodeAt(len);\\r\\n return Uint8Array.from(ret);\\r\\n}\"\"\"\n",
79
- "rs_2 = tokenizer(input, padding=\"max_length\", max_length=512, truncation=True, return_tensors= 'pt')"
80
- ]
81
- },
82
- {
83
- "cell_type": "code",
84
- "execution_count": 23,
85
- "metadata": {},
86
- "outputs": [
87
- {
88
- "data": {
89
- "text/plain": [
90
- "SequenceClassifierOutput(loss=None, logits=tensor([[ 0.2598, -0.2636]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)"
91
- ]
92
- },
93
- "execution_count": 23,
94
- "metadata": {},
95
- "output_type": "execute_result"
96
- }
97
- ],
98
- "source": [
99
- "model(**rs_2)"
100
- ]
101
- },
102
- {
103
- "cell_type": "code",
104
- "execution_count": 24,
105
- "metadata": {},
106
- "outputs": [
107
- {
108
- "name": "stdout",
109
- "output_type": "stream",
110
- "text": [
111
- "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
112
- "To disable this warning, you can either:\n",
113
- "\t- Avoid using `tokenizers` before the fork if possible\n",
114
- "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
115
- "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
116
- "To disable this warning, you can either:\n",
117
- "\t- Avoid using `tokenizers` before the fork if possible\n",
118
- "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
119
- "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
120
- "To disable this warning, you can either:\n",
121
- "\t- Avoid using `tokenizers` before the fork if possible\n",
122
- "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
123
- ]
124
- }
125
- ],
126
- "source": [
127
- "from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline\n",
128
- "import torch\n",
129
- "device = 0 if torch.cuda.is_available() else -1\n",
130
- "pipeline = pipeline(\"text-classification\", model=model, tokenizer=tokenizer, device=device)"
131
- ]
132
- },
133
- {
134
- "cell_type": "code",
135
- "execution_count": 28,
136
- "metadata": {},
137
- "outputs": [
138
- {
139
- "name": "stdout",
140
- "output_type": "stream",
141
- "text": [
142
- "[{'label': 'Inconsistency', 'score': 0.5601343512535095}]\n"
143
- ]
144
- }
145
- ],
146
- "source": [
147
- "inputs = \"\"\"we do not need Buffer pollyfill for now</s></s>function(str){\n",
148
- " var ret = new Array(str.length), len = str.length;\n",
149
- " while(len--) ret[len] = str.charCodeAt(len);\n",
150
- " return Uint8Array.from(ret);\n",
151
- "}\"\"\"\n",
152
- "prediction = pipeline(inputs)\n",
153
- "print(prediction)"
154
- ]
155
- },
156
- {
157
- "cell_type": "code",
158
- "execution_count": null,
159
- "metadata": {},
160
- "outputs": [],
161
- "source": []
162
- }
163
- ],
164
- "metadata": {
165
- "kernelspec": {
166
- "display_name": "namlh31",
167
- "language": "python",
168
- "name": "python3"
169
- },
170
- "language_info": {
171
- "codemirror_mode": {
172
- "name": "ipython",
173
- "version": 3
174
- },
175
- "file_extension": ".py",
176
- "mimetype": "text/x-python",
177
- "name": "python",
178
- "nbconvert_exporter": "python",
179
- "pygments_lexer": "ipython3",
180
- "version": "3.11.2"
181
- },
182
- "orig_nbformat": 4
183
- },
184
- "nbformat": 4,
185
- "nbformat_minor": 2
186
- }