hyomin committed
Commit c26657b
1 Parent(s): 7cd5399

Upload 16 files

app.py ADDED
@@ -0,0 +1,391 @@
import re
import unicodedata
from string import whitespace, punctuation

import pandas as pd
import numpy as np
import torch
import pytorch_lightning as pl
import gradio as gr

from konlpy.tag import Okt
from sentence_transformers import SentenceTransformer, util
from torch.utils.data import DataLoader, Dataset
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration
from transformers.optimization import get_cosine_schedule_with_warmup

# classification


def CleanEnd(text):
    # Strip e-mail addresses, URLs, trailing byline/copyright sentences
    # (기자 = reporter, 특파원 = correspondent, 무단 전재/재배포 금지 = "unauthorized
    # reproduction prohibited", ...), and leading bracketed tags.
    email = re.compile(
        r'[-_0-9a-z]+@[-_0-9a-z]+(?:\.[0-9a-z]+)+', flags=re.IGNORECASE)
    url = re.compile(
        r'(?:https?:\/\/)?[-_0-9a-z]+(?:\.[-_0-9a-z]+)+', flags=re.IGNORECASE)
    etc = re.compile(
        r'\.([^\.]*(?:기자|특파원|교수|작가|대표|논설|고문|주필|부문장|팀장|장관|원장|연구원|이사장|위원|실장|차장|부장|에세이|화백|사설|소장|단장|과장|기획자|큐레이터|저작권|평론가|©|©|ⓒ|\@|\/|=|▶|무단|전재|재배포|금지|\[|\]|\(\))[^\.]*)$')
    bracket = re.compile(r'^((?:\[.+\])|(?:【.+】)|(?:<.+>)|(?:◆.+◆)\s)')

    result = email.sub('', text)
    result = url.sub('', result)
    result = etc.sub('.', result)
    result = bracket.sub('', result).strip()
    return result


def TextFilter(text):
    # Collapse whitespace/punctuation (keeping '%') and drop everything that is
    # not Hangul, '%', or a space.
    punct = ''.join([ch for ch in punctuation if ch != '%'])
    filtering = re.compile(f'[{whitespace}{punct}]+')
    onlyText = re.compile(r'[^\% ㄱ-ㅣ가-힣]+')
    result = filtering.sub(' ', text)
    result = onlyText.sub(' ', result).strip()
    result = filtering.sub(' ', result)
    return result


def is_clickbait(title, content, threshold=0.815):
    # Returns (0, similarity) for clickbait, (1, similarity) otherwise.
    model = SentenceTransformer(
        './model/onlineContrastive')

    pattern_whitespace = re.compile(f'[{whitespace}]+')
    title = unicodedata.normalize('NFC', re.sub(
        pattern_whitespace, ' ', title)).strip()
    title = CleanEnd(title)
    title = TextFilter(title)

    content = unicodedata.normalize('NFC', re.sub(
        pattern_whitespace, ' ', content)).strip()
    content = CleanEnd(content)
    content = TextFilter(content)

    # Noun extraction
    okt = Okt()
    title = ' '.join(okt.nouns(title))
    content = ' '.join(okt.nouns(content))

    # Compute embeddings
    embeddings1 = model.encode(title, convert_to_tensor=True)
    embeddings2 = model.encode(content, convert_to_tensor=True)

    # Compute cosine similarity between title and body
    cosine_score = util.cos_sim(embeddings1, embeddings2)
    similarity = cosine_score.numpy()[0][0]

    if similarity < threshold:
        return 0, similarity  # clickbait
    else:
        return 1, similarity  # non-clickbait

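
# Example use of the classifier above (hypothetical strings, not part of this repo):
#   label, sim = is_clickbait("기사 제목", "기사 본문 ...")
#   label == 0  ->  noun-level title/body similarity fell below 0.815 (clickbait)
#   label == 1  ->  title and body agree (not clickbait)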

# Generation


# df_train is a two-row dummy frame: the Lightning Trainer in generation() runs
# with max_epochs=0, so this exists only to satisfy the DataModule/fit() API and
# no actual fine-tuning happens at inference time.
df_train = pd.DataFrame()
df_train['input_text'] = ['1', '2']
df_train['target_text'] = ['1', '2']


def CleanEnd_g(text):
    email = re.compile(
        r'[-_0-9a-z]+@[-_0-9a-z]+(?:\.[0-9a-z]+)+', flags=re.IGNORECASE)
    # url = re.compile(r'(?:https?:\/\/)?[-_0-9a-z]+(?:\.[-_0-9a-z]+)+', flags=re.IGNORECASE)
    # etc = re.compile(r'\.([^\.]*(?:기자|특파원|교수|작가|대표|논설|고문|주필|부문장|팀장|장관|원장|연구원|이사장|위원|실장|차장|부장|에세이|화백|사설|소장|단장|과장|기획자|큐레이터|저작권|평론가|©|©|ⓒ|\@|\/|=|▶|무단|전재|재배포|금지|\[|\]|\(\))[^\.]*)$')
    # bracket = re.compile(r'^((?:\[.+\])|(?:【.+】)|(?:<.+>)|(?:◆.+◆)\s)')

    result = email.sub('', text)
    # result = url.sub('', result)
    # result = etc.sub('.', result)
    # result = bracket.sub('', result).strip()
    return result


class DatasetFromDataframe(Dataset):
    def __init__(self, df, dataset_args):
        self.data = df
        self.max_length = dataset_args['max_length']
        self.tokenizer = dataset_args['tokenizer']
        self.start_token = '<s>'
        self.end_token = '</s>'

    def __len__(self):
        return len(self.data)

    def create_tokens(self, text):
        tokens = self.tokenizer.encode(
            self.start_token + text + self.end_token)

        tokenLength = len(tokens)
        remain = self.max_length - tokenLength

        if remain >= 0:
            tokens = tokens + [self.tokenizer.pad_token_id] * remain
            attention_mask = [1] * tokenLength + [0] * remain
        else:
            tokens = tokens[: self.max_length - 1] + \
                self.tokenizer.encode(self.end_token)
            attention_mask = [1] * self.max_length

        return tokens, attention_mask

    def __getitem__(self, index):
        record = self.data.iloc[index]

        question, answer = record['input_text'], record['target_text']

        input_id, input_mask = self.create_tokens(question)
        output_id, output_mask = self.create_tokens(answer)

        label = output_id[1:(self.max_length + 1)]
        label = label + (self.max_length - len(label)) * [-100]

        return {
            'input_ids': torch.LongTensor(input_id),
            'attention_mask': torch.LongTensor(input_mask),
            'decoder_input_ids': torch.LongTensor(output_id),
            'decoder_attention_mask': torch.LongTensor(output_mask),
            'labels': torch.LongTensor(label)
        }


class OneSourceDataModule(pl.LightningDataModule):
    def __init__(
        self,
        **kwargs
    ):
        super().__init__()

        self.data = kwargs.get('data')
        self.dataset_args = kwargs.get('dataset_args')
        self.batch_size = kwargs.get('batch_size') or 32
        self.train_size = kwargs.get('train_size') or 0.9

    def setup(self, stage=""):
        # trainset, testset = train_test_split(df_train, train_size=self.train_size, shuffle=True)
        self.trainset = DatasetFromDataframe(df_train, self.dataset_args)
        self.testset = DatasetFromDataframe(df_train, self.dataset_args)

    def train_dataloader(self):
        return DataLoader(
            self.trainset,
            batch_size=self.batch_size
        )

    def val_dataloader(self):
        return DataLoader(
            self.testset,
            batch_size=self.batch_size
        )

    def test_dataloader(self):
        return DataLoader(
            self.testset,
            batch_size=self.batch_size
        )


class KoBARTConditionalGeneration(pl.LightningModule):
    def __init__(self, hparams, **kwargs):
        super(KoBARTConditionalGeneration, self).__init__()
        self.hparams.update(hparams)

        self.model = kwargs['model']
        self.tokenizer = kwargs['tokenizer']

        self.model.train()

    def configure_optimizers(self):
        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.01
        }, {
            'params': [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        }]

        optimizer = torch.optim.AdamW(
            optimizer_grouped_parameters,
            lr=self.hparams.lr
        )

        # num_workers = gpus * num_nodes
        data_len = len(self.train_dataloader().dataset)
        print(f'Training examples: {data_len}')

        num_train_steps = int(
            data_len / self.hparams.batch_size * self.hparams.max_epochs)
        print(f'Training steps: {num_train_steps}')

        num_warmup_steps = int(num_train_steps * self.hparams.warmup_ratio)
        print(f'Warmup steps: {num_warmup_steps}')

        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_train_steps
        )

        lr_scheduler = {
            'scheduler': scheduler,
            'monitor': 'loss',
            'interval': 'step',
            'frequency': 1
        }

        return [optimizer], [lr_scheduler]

    def forward(self, inputs):
        return self.model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            decoder_input_ids=inputs['decoder_input_ids'],
            decoder_attention_mask=inputs['decoder_attention_mask'],
            labels=inputs['labels'],
            return_dict=True
        )

    def training_step(self, batch, batch_idx):
        loss = self(batch).loss
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self(batch).loss
        return loss

    def test(self, text):
        # Generate a headline from the summary text with beam search.
        tokens = self.tokenizer.encode("<s>" + text + "</s>")

        tokenLength = len(tokens)
        remain = self.hparams.max_length - tokenLength

        if remain >= 0:
            tokens = tokens + [self.tokenizer.pad_token_id] * remain
            attention_mask = [1] * tokenLength + [0] * remain
        else:
            tokens = tokens[: self.hparams.max_length - 1] + \
                self.tokenizer.encode("</s>")
            attention_mask = [1] * self.hparams.max_length

        tokens = torch.LongTensor([tokens])
        attention_mask = torch.LongTensor([attention_mask])

        result = self.model.generate(
            tokens,
            max_length=self.hparams.max_length,
            attention_mask=attention_mask,
            num_beams=10
        )[0]

        return self.tokenizer.decode(result)


def generation(szContent):
    # Stage 1: summarize the article body with the base KoBART summarizer.
    tokenizer = PreTrainedTokenizerFast.from_pretrained(
        "gogamza/kobart-summarization")
    model1 = BartForConditionalGeneration.from_pretrained(
        "gogamza/kobart-summarization")
    # Only the first 500 characters are fed to the summarizer.
    input_ids = tokenizer.encode(szContent[:500], return_tensors="pt")

    summary = model1.generate(
        input_ids=input_ids,
        bos_token_id=model1.config.bos_token_id,
        eos_token_id=model1.config.eos_token_id,
        length_penalty=.3,  # >1 favors longer, <1 favors shorter summaries
        max_length=35,
        min_length=25,
        num_beams=5)
    szSummary = tokenizer.decode(summary[0], skip_special_tokens=True)
    print(szSummary)

    # Stage 2: turn the summary into a headline with the fine-tuned KoBART
    # checkpoint (a Hugging Face model directory, despite the ".h5" name).
    KoBARTModel = BartForConditionalGeneration.from_pretrained(
        './model/final2.h5')
    BATCH_SIZE = 32
    MAX_LENGTH = 128
    EPOCHS = 0
    model2 = KoBARTConditionalGeneration(
        {
            "lr": 5e-6,
            "warmup_ratio": 0.1,
            "batch_size": BATCH_SIZE,
            "max_length": MAX_LENGTH,
            "max_epochs": EPOCHS
        },
        tokenizer=tokenizer,
        model=KoBARTModel
    )
    dm = OneSourceDataModule(
        data=df_train,
        batch_size=BATCH_SIZE,
        train_size=0.9,
        dataset_args={
            "tokenizer": tokenizer,
            "max_length": MAX_LENGTH,
        }
    )
    # max_epochs=0: fit() only wires up the dummy datamodule, no training runs.
    trainer = pl.Trainer(
        max_epochs=EPOCHS,
        gpus=0
    )

    trainer.fit(model2, dm)
    szTitle = model2.test(szSummary)
    df = pd.DataFrame()
    df['newTitle'] = [szTitle]
    df['content'] = [szContent]
    # Whitespace and punctuation removal
    pattern_whitespace = re.compile(f'[{whitespace}]+')
    df['newTitle'] = df.newTitle.fillna('').str.replace(
        pattern_whitespace, ' ', regex=True).map(
        lambda x: unicodedata.normalize('NFC', x)).str.strip()
    df['newTitle'] = df.newTitle.map(CleanEnd_g)
    df['newTitle'] = df.newTitle.map(TextFilter)
    return df.newTitle[0]


def new_headline(title, content):
    label = is_clickbait(title, content)
    if label[0] == 0:
        return generation(content)
    elif label[0] == 1:
        # "This is not a clickbait article."
        return '낚시성 기사가 아닙니다.'


# gradio
with gr.Blocks() as demo1:
    gr.Markdown(
        """
        <h1 align="center">
        Clickbait News Classifier and New Headline Generator
        </h1>
        """)

    # Korean description: "Enter a news headline and body; the app classifies
    # whether the article is clickbait and, if it is, generates a new headline."
    gr.Markdown(
        """
        뉴스 기사 제목과 본문을 입력하면 낚시성 기사인지 분류하고,
        낚시성 기사이면 새로운 제목을 생성해주는 프로그램입니다.
        """)

    with gr.Row():
        with gr.Column():
            inputs = [gr.Textbox(placeholder="뉴스기사 제목을 입력해주세요", label='headline'),
                      gr.Textbox(
                          lines=10, placeholder="뉴스기사 본문을 입력해주세요", label='content')]
            with gr.Row():
                btn = gr.Button("결과 출력")
        with gr.Column():
            output = gr.Text(label='Result')
    btn.click(fn=new_headline, inputs=inputs, outputs=output)

if __name__ == "__main__":
    demo1.launch()
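
The Gradio UI above is only a thin wrapper around `new_headline`, which chains `is_clickbait` and `generation`. A minimal command-line smoke test might look like the following sketch (hypothetical, not part of this upload; it assumes the script runs from the repo root so `./model/onlineContrastive` and `./model/final2.h5` resolve, and that konlpy's Java backend is installed):

```python
# Hypothetical smoke test: call the pipeline directly, bypassing Gradio.
from app import is_clickbait, new_headline

title = "예시 제목"        # placeholder headline
content = "예시 본문 ..."   # placeholder article body

label, similarity = is_clickbait(title, content)
print(label, similarity)              # 0 = clickbait, 1 = not clickbait
print(new_headline(title, content))   # new headline, or "낚시성 기사가 아닙니다."
```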
model/final2.h5/config.json ADDED
@@ -0,0 +1,56 @@
{
  "_name_or_path": "/content/drive/My Drive/23 U 4-1/\ud14d\uc2a4\ud2b8\ub9c8\uc774\ub2dd/\uae30\ub9d0\ud504\ub85c\uc81d\ud2b8/final2.h5",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "author": "Heewon Jeon(madjakarta@gmail.com)",
  "bos_token_id": 1,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.1,
  "d_model": 768,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 1,
  "do_blenderbot_90_layernorm": false,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 1,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": false,
  "forced_eos_token_id": 1,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "kobart_version": 2.0,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 1026,
  "model_type": "bart",
  "normalize_before": false,
  "normalize_embedding": true,
  "num_hidden_layers": 6,
  "pad_token_id": 3,
  "scale_embedding": false,
  "static_position_embeddings": false,
  "tokenizer_class": "PreTrainedTokenizerFast",
  "torch_dtype": "float32",
  "transformers_version": "4.30.1",
  "use_cache": true,
  "vocab_size": 30000
}
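
Despite the `.h5` suffix, `model/final2.h5` is a regular Hugging Face checkpoint directory (this `config.json`, `generation_config.json`, and `pytorch_model.bin`), which is why `app.py` loads it with `from_pretrained`. A minimal loading sketch (assuming the repo root as working directory; the summary string is a placeholder):

```python
# Illustrative only: load the fine-tuned KoBART headline model the way app.py does.
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained("gogamza/kobart-summarization")
model = BartForConditionalGeneration.from_pretrained("./model/final2.h5")

ids = tokenizer.encode("<s>" + "요약문 예시" + "</s>", return_tensors="pt")
out = model.generate(ids, max_length=128, num_beams=10)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```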
model/final2.h5/generation_config.json ADDED
@@ -0,0 +1,9 @@
{
  "_from_model_config": true,
  "bos_token_id": 1,
  "decoder_start_token_id": 1,
  "eos_token_id": 1,
  "forced_eos_token_id": 1,
  "pad_token_id": 3,
  "transformers_version": "4.30.1"
}
model/final2.h5/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bf59473330d28a08bc91af6a2aadca7ffdfc67aabe5af8a0e337532744d491dd
size 495644701
model/onlineContrastive/1_Pooling/config.json ADDED
@@ -0,0 +1,7 @@
{
  "word_embedding_dimension": 768,
  "pooling_mode_cls_token": false,
  "pooling_mode_mean_tokens": true,
  "pooling_mode_max_tokens": false,
  "pooling_mode_mean_sqrt_len_tokens": false
}
model/onlineContrastive/README.md ADDED
@@ -0,0 +1,126 @@
---
pipeline_tag: sentence-similarity
tags:
- sentence-transformers
- feature-extraction
- sentence-similarity
- transformers

---

# {MODEL_NAME}

This is a [sentence-transformers](https://www.SBERT.net) model: it maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for tasks like clustering or semantic search.

<!--- Describe your model here -->

## Usage (Sentence-Transformers)

Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:

```
pip install -U sentence-transformers
```

Then you can use the model like this:

```python
from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted"]

model = SentenceTransformer('{MODEL_NAME}')
embeddings = model.encode(sentences)
print(embeddings)
```


## Usage (HuggingFace Transformers)
Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: first, you pass your input through the transformer model, then you apply the right pooling operation on top of the contextualized word embeddings.

```python
from transformers import AutoTokenizer, AutoModel
import torch


# Mean pooling - take the attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Sentences we want sentence embeddings for
sentences = ['This is an example sentence', 'Each sentence is converted']

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('{MODEL_NAME}')
model = AutoModel.from_pretrained('{MODEL_NAME}')

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

print("Sentence embeddings:")
print(sentence_embeddings)
```


## Evaluation Results

<!--- Describe how your model was evaluated -->

For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME})


## Training
The model was trained with the parameters:

**DataLoader**:

`torch.utils.data.dataloader.DataLoader` of length 1822 with parameters:
```
{'batch_size': 128, 'sampler': 'torch.utils.data.sampler.RandomSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
```

**Loss**:

`sentence_transformers.losses.OnlineContrastiveLoss.OnlineContrastiveLoss`

Parameters of the fit()-Method:
```
{
    "epochs": 5,
    "evaluation_steps": 182,
    "evaluator": "sentence_transformers.evaluation.BinaryClassificationEvaluator.BinaryClassificationEvaluator",
    "max_grad_norm": 1,
    "optimizer_class": "<class 'torch.optim.adamw.AdamW'>",
    "optimizer_params": {
        "lr": 2e-05
    },
    "scheduler": "WarmupLinear",
    "steps_per_epoch": null,
    "warmup_steps": 911,
    "weight_decay": 0.01
}
```

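The original training script is not part of this upload; as a rough reconstruction, the parameters above correspond to a sentence-transformers run along these lines (the pair data and paths are placeholders, and `klue/roberta-base` is taken from this model's `config.json`):

```python
# Sketch reconstructed from the hyperparameters above; dataset and paths are hypothetical.
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer("klue/roberta-base")

# Labeled sentence pairs (1 = matching title/body, 0 = mismatch) - placeholder example.
train_examples = [InputExample(texts=["제목 명사열", "본문 명사열"], label=1)]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=128)
train_loss = losses.OnlineContrastiveLoss(model)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=5,
    evaluation_steps=182,
    warmup_steps=911,
    optimizer_params={"lr": 2e-05},
    weight_decay=0.01,
    output_path="./model/onlineContrastive",
)
```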

## Full Model Architecture
```
SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': True}) with Transformer model: RobertaModel
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)
```

## Citing & Authors

<!--- Describe where people can find more information -->
model/onlineContrastive/config.json ADDED
@@ -0,0 +1,29 @@
{
  "_name_or_path": "klue/roberta-base",
  "architectures": [
    "RobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertTokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.29.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 32000
}
model/onlineContrastive/config_sentence_transformers.json ADDED
@@ -0,0 +1,7 @@
{
  "__version__": {
    "sentence_transformers": "2.2.2",
    "transformers": "4.29.2",
    "pytorch": "2.0.1+cu118"
  }
}
model/onlineContrastive/modules.json ADDED
@@ -0,0 +1,14 @@
[
  {
    "idx": 0,
    "name": "0",
    "path": "",
    "type": "sentence_transformers.models.Transformer"
  },
  {
    "idx": 1,
    "name": "1",
    "path": "1_Pooling",
    "type": "sentence_transformers.models.Pooling"
  }
]
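
`modules.json` tells sentence-transformers to stack a Transformer encoder (stored at the root of this model folder) with the mean-pooling module configured in `1_Pooling/`. Loading `SentenceTransformer('./model/onlineContrastive')`, as `app.py` does, rebuilds exactly this stack; an equivalent manual assembly would look roughly like the sketch below (illustrative only):

```python
# Illustrative manual assembly of the two modules listed above.
from sentence_transformers import SentenceTransformer, models

word_embedding_model = models.Transformer("./model/onlineContrastive", max_seq_length=128)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),  # 768, mean pooling per 1_Pooling/config.json
    pooling_mode_mean_tokens=True,
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
```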
model/onlineContrastive/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9f33199a31e10b0c6bf79b4b624ad62a9759e9684df10242be30e675f1c6967e
size 442543661
model/onlineContrastive/sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
{
  "max_seq_length": 128,
  "do_lower_case": true
}
model/onlineContrastive/special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
{
  "bos_token": "[CLS]",
  "cls_token": "[CLS]",
  "eos_token": "[SEP]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
model/onlineContrastive/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
model/onlineContrastive/tokenizer_config.json ADDED
@@ -0,0 +1,17 @@
{
  "bos_token": "[CLS]",
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_basic_tokenize": true,
  "do_lower_case": false,
  "eos_token": "[SEP]",
  "mask_token": "[MASK]",
  "model_max_length": 512,
  "never_split": null,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
}
model/onlineContrastive/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,11 @@
numpy
pandas
konlpy
sentence_transformers
transformers
pytorch_lightning==1.4.9
torchmetrics==0.6.0
torchtext==0.6.0
transformers[sentencepiece]
torch
tensorflow