hyomin committed
Commit c26657b
1 Parent(s): 7cd5399

Upload 16 files

app.py ADDED
@@ -0,0 +1,391 @@
import re
import unicodedata
from string import whitespace, punctuation

import pandas as pd
import numpy as np
import torch
import pytorch_lightning as pl
import gradio as gr

from konlpy.tag import Okt
from sentence_transformers import SentenceTransformer, util
from torch.utils.data import DataLoader, Dataset
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration
from transformers.optimization import get_cosine_schedule_with_warmup

# classification


def CleanEnd(text):
    # Strip e-mail addresses, URLs, trailing byline/copyright sentences
    # (기자 = reporter, 특파원 = correspondent, 무단 전재/재배포 금지 = "unauthorized
    # reproduction prohibited", ...), and leading bracketed tags.
    email = re.compile(
        r'[-_0-9a-z]+@[-_0-9a-z]+(?:\.[0-9a-z]+)+', flags=re.IGNORECASE)
    url = re.compile(
        r'(?:https?:\/\/)?[-_0-9a-z]+(?:\.[-_0-9a-z]+)+', flags=re.IGNORECASE)
    etc = re.compile(
        r'\.([^\.]*(?:기자|특파원|교수|작가|대표|논설|고문|주필|부문장|팀장|장관|원장|연구원|이사장|위원|실장|차장|부장|에세이|화백|사설|소장|단장|과장|기획자|큐레이터|저작권|평론가|©|©|ⓒ|\@|\/|=|▶|무단|전재|재배포|금지|\[|\]|\(\))[^\.]*)$')
    bracket = re.compile(r'^((?:\[.+\])|(?:【.+】)|(?:<.+>)|(?:◆.+◆)\s)')

    result = email.sub('', text)
    result = url.sub('', result)
    result = etc.sub('.', result)
    result = bracket.sub('', result).strip()
    return result


def TextFilter(text):
    # Collapse whitespace/punctuation (keeping '%') and drop everything that is
    # not Hangul, '%', or a space.
    punct = ''.join([ch for ch in punctuation if ch != '%'])
    filtering = re.compile(f'[{whitespace}{punct}]+')
    onlyText = re.compile(r'[^\% ㄱ-ㅣ가-힣]+')
    result = filtering.sub(' ', text)
    result = onlyText.sub(' ', result).strip()
    result = filtering.sub(' ', result)
    return result


def is_clickbait(title, content, threshold=0.815):
    # Returns (0, similarity) for clickbait, (1, similarity) otherwise.
    model = SentenceTransformer(
        './model/onlineContrastive')

    pattern_whitespace = re.compile(f'[{whitespace}]+')
    title = unicodedata.normalize('NFC', re.sub(
        pattern_whitespace, ' ', title)).strip()
    title = CleanEnd(title)
    title = TextFilter(title)

    content = unicodedata.normalize('NFC', re.sub(
        pattern_whitespace, ' ', content)).strip()
    content = CleanEnd(content)
    content = TextFilter(content)

    # Noun extraction
    okt = Okt()
    title = ' '.join(okt.nouns(title))
    content = ' '.join(okt.nouns(content))

    # Compute embeddings
    embeddings1 = model.encode(title, convert_to_tensor=True)
    embeddings2 = model.encode(content, convert_to_tensor=True)

    # Compute cosine similarity between title and body
    cosine_score = util.cos_sim(embeddings1, embeddings2)
    similarity = cosine_score.numpy()[0][0]

    if similarity < threshold:
        return 0, similarity  # clickbait
    else:
        return 1, similarity  # non-clickbait

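
# Example use of the classifier above (hypothetical strings, not part of this repo):
#   label, sim = is_clickbait("기사 제목", "기사 본문 ...")
#   label == 0  ->  noun-level title/body similarity fell below 0.815 (clickbait)
#   label == 1  ->  title and body agree (not clickbait)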

# Generation


# df_train is a two-row dummy frame: the Lightning Trainer in generation() runs
# with max_epochs=0, so this exists only to satisfy the DataModule/fit() API and
# no actual fine-tuning happens at inference time.
df_train = pd.DataFrame()
df_train['input_text'] = ['1', '2']
df_train['target_text'] = ['1', '2']


def CleanEnd_g(text):
    email = re.compile(
        r'[-_0-9a-z]+@[-_0-9a-z]+(?:\.[0-9a-z]+)+', flags=re.IGNORECASE)
    # url = re.compile(r'(?:https?:\/\/)?[-_0-9a-z]+(?:\.[-_0-9a-z]+)+', flags=re.IGNORECASE)
    # etc = re.compile(r'\.([^\.]*(?:기자|특파원|교수|작가|대표|논설|고문|주필|부문장|팀장|장관|원장|연구원|이사장|위원|실장|차장|부장|에세이|화백|사설|소장|단장|과장|기획자|큐레이터|저작권|평론가|©|©|ⓒ|\@|\/|=|▶|무단|전재|재배포|금지|\[|\]|\(\))[^\.]*)$')
    # bracket = re.compile(r'^((?:\[.+\])|(?:【.+】)|(?:<.+>)|(?:◆.+◆)\s)')

    result = email.sub('', text)
    # result = url.sub('', result)
    # result = etc.sub('.', result)
    # result = bracket.sub('', result).strip()
    return result


class DatasetFromDataframe(Dataset):
    def __init__(self, df, dataset_args):
        self.data = df
        self.max_length = dataset_args['max_length']
        self.tokenizer = dataset_args['tokenizer']
        self.start_token = '<s>'
        self.end_token = '</s>'

    def __len__(self):
        return len(self.data)

    def create_tokens(self, text):
        tokens = self.tokenizer.encode(
            self.start_token + text + self.end_token)

        tokenLength = len(tokens)
        remain = self.max_length - tokenLength

        if remain >= 0:
            tokens = tokens + [self.tokenizer.pad_token_id] * remain
            attention_mask = [1] * tokenLength + [0] * remain
        else:
            tokens = tokens[: self.max_length - 1] + \
                self.tokenizer.encode(self.end_token)
            attention_mask = [1] * self.max_length

        return tokens, attention_mask

    def __getitem__(self, index):
        record = self.data.iloc[index]

        question, answer = record['input_text'], record['target_text']

        input_id, input_mask = self.create_tokens(question)
        output_id, output_mask = self.create_tokens(answer)

        label = output_id[1:(self.max_length + 1)]
        label = label + (self.max_length - len(label)) * [-100]

        return {
            'input_ids': torch.LongTensor(input_id),
            'attention_mask': torch.LongTensor(input_mask),
            'decoder_input_ids': torch.LongTensor(output_id),
            'decoder_attention_mask': torch.LongTensor(output_mask),
            'labels': torch.LongTensor(label)
        }


class OneSourceDataModule(pl.LightningDataModule):
    def __init__(
        self,
        **kwargs
    ):
        super().__init__()

        self.data = kwargs.get('data')
        self.dataset_args = kwargs.get('dataset_args')
        self.batch_size = kwargs.get('batch_size') or 32
        self.train_size = kwargs.get('train_size') or 0.9

    def setup(self, stage=""):
        # trainset, testset = train_test_split(df_train, train_size=self.train_size, shuffle=True)
        self.trainset = DatasetFromDataframe(df_train, self.dataset_args)
        self.testset = DatasetFromDataframe(df_train, self.dataset_args)

    def train_dataloader(self):
        return DataLoader(
            self.trainset,
            batch_size=self.batch_size
        )

    def val_dataloader(self):
        return DataLoader(
            self.testset,
            batch_size=self.batch_size
        )

    def test_dataloader(self):
        return DataLoader(
            self.testset,
            batch_size=self.batch_size
        )


class KoBARTConditionalGeneration(pl.LightningModule):
    def __init__(self, hparams, **kwargs):
        super(KoBARTConditionalGeneration, self).__init__()
        self.hparams.update(hparams)

        self.model = kwargs['model']
        self.tokenizer = kwargs['tokenizer']

        self.model.train()

    def configure_optimizers(self):
        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.01
        }, {
            'params': [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        }]

        optimizer = torch.optim.AdamW(
            optimizer_grouped_parameters,
            lr=self.hparams.lr
        )

        # num_workers = gpus * num_nodes
        data_len = len(self.train_dataloader().dataset)
        print(f'Training examples: {data_len}')

        num_train_steps = int(
            data_len / self.hparams.batch_size * self.hparams.max_epochs)
        print(f'Training steps: {num_train_steps}')

        num_warmup_steps = int(num_train_steps * self.hparams.warmup_ratio)
        print(f'Warmup steps: {num_warmup_steps}')

        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_train_steps
        )

        lr_scheduler = {
            'scheduler': scheduler,
            'monitor': 'loss',
            'interval': 'step',
            'frequency': 1
        }

        return [optimizer], [lr_scheduler]

    def forward(self, inputs):
        return self.model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            decoder_input_ids=inputs['decoder_input_ids'],
            decoder_attention_mask=inputs['decoder_attention_mask'],
            labels=inputs['labels'],
            return_dict=True
        )

    def training_step(self, batch, batch_idx):
        loss = self(batch).loss
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self(batch).loss
        return loss

    def test(self, text):
        # Generate a headline from the summary text with beam search.
        tokens = self.tokenizer.encode("<s>" + text + "</s>")

        tokenLength = len(tokens)
        remain = self.hparams.max_length - tokenLength

        if remain >= 0:
            tokens = tokens + [self.tokenizer.pad_token_id] * remain
            attention_mask = [1] * tokenLength + [0] * remain
        else:
            tokens = tokens[: self.hparams.max_length - 1] + \
                self.tokenizer.encode("</s>")
            attention_mask = [1] * self.hparams.max_length

        tokens = torch.LongTensor([tokens])
        attention_mask = torch.LongTensor([attention_mask])

        result = self.model.generate(
            tokens,
            max_length=self.hparams.max_length,
            attention_mask=attention_mask,
            num_beams=10
        )[0]

        return self.tokenizer.decode(result)


def generation(szContent):
    # Stage 1: summarize the article body with the base KoBART summarizer.
    tokenizer = PreTrainedTokenizerFast.from_pretrained(
        "gogamza/kobart-summarization")
    model1 = BartForConditionalGeneration.from_pretrained(
        "gogamza/kobart-summarization")
    # Only the first 500 characters are fed to the summarizer.
    input_ids = tokenizer.encode(szContent[:500], return_tensors="pt")

    summary = model1.generate(
        input_ids=input_ids,
        bos_token_id=model1.config.bos_token_id,
        eos_token_id=model1.config.eos_token_id,
        length_penalty=.3,  # >1 favors longer, <1 favors shorter summaries
        max_length=35,
        min_length=25,
        num_beams=5)
    szSummary = tokenizer.decode(summary[0], skip_special_tokens=True)
    print(szSummary)

    # Stage 2: turn the summary into a headline with the fine-tuned KoBART
    # checkpoint (a Hugging Face model directory, despite the ".h5" name).
    KoBARTModel = BartForConditionalGeneration.from_pretrained(
        './model/final2.h5')
    BATCH_SIZE = 32
    MAX_LENGTH = 128
    EPOCHS = 0
    model2 = KoBARTConditionalGeneration(
        {
            "lr": 5e-6,
            "warmup_ratio": 0.1,
            "batch_size": BATCH_SIZE,
            "max_length": MAX_LENGTH,
            "max_epochs": EPOCHS
        },
        tokenizer=tokenizer,
        model=KoBARTModel
    )
    dm = OneSourceDataModule(
        data=df_train,
        batch_size=BATCH_SIZE,
        train_size=0.9,
        dataset_args={
            "tokenizer": tokenizer,
            "max_length": MAX_LENGTH,
        }
    )
    # max_epochs=0: fit() only wires up the dummy datamodule, no training runs.
    trainer = pl.Trainer(
        max_epochs=EPOCHS,
        gpus=0
    )

    trainer.fit(model2, dm)
    szTitle = model2.test(szSummary)
    df = pd.DataFrame()
    df['newTitle'] = [szTitle]
    df['content'] = [szContent]
    # Whitespace and punctuation removal
    pattern_whitespace = re.compile(f'[{whitespace}]+')
    df['newTitle'] = df.newTitle.fillna('').str.replace(
        pattern_whitespace, ' ', regex=True).map(
        lambda x: unicodedata.normalize('NFC', x)).str.strip()
    df['newTitle'] = df.newTitle.map(CleanEnd_g)
    df['newTitle'] = df.newTitle.map(TextFilter)
    return df.newTitle[0]


def new_headline(title, content):
    label = is_clickbait(title, content)
    if label[0] == 0:
        return generation(content)
    elif label[0] == 1:
        # "This is not a clickbait article."
        return '낚시성 기사가 아닙니다.'


# gradio
with gr.Blocks() as demo1:
    gr.Markdown(
        """
        <h1 align="center">
        Clickbait News Classifier and New Headline Generator
        </h1>
        """)

    # Korean description: "Enter a news headline and body; the app classifies
    # whether the article is clickbait and, if it is, generates a new headline."
    gr.Markdown(
        """
        뉴스 기사 제목과 본문을 입력하면 낚시성 기사인지 분류하고,
        낚시성 기사이면 새로운 제목을 생성해주는 프로그램입니다.
        """)

    with gr.Row():
        with gr.Column():
            inputs = [gr.Textbox(placeholder="뉴스기사 제목을 입력해주세요", label='headline'),
                      gr.Textbox(
                          lines=10, placeholder="뉴스기사 본문을 입력해주세요", label='content')]
            with gr.Row():
                btn = gr.Button("결과 출력")
        with gr.Column():
            output = gr.Text(label='Result')
    btn.click(fn=new_headline, inputs=inputs, outputs=output)

if __name__ == "__main__":
    demo1.launch()
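
The Gradio UI above is only a thin wrapper around `new_headline`, which chains `is_clickbait` and `generation`. A minimal command-line smoke test might look like the following sketch (hypothetical, not part of this upload; it assumes the script runs from the repo root so `./model/onlineContrastive` and `./model/final2.h5` resolve, and that konlpy's Java backend is installed):

```python
# Hypothetical smoke test: call the pipeline directly, bypassing Gradio.
from app import is_clickbait, new_headline

title = "예시 제목"        # placeholder headline
content = "예시 본문 ..."   # placeholder article body

label, similarity = is_clickbait(title, content)
print(label, similarity)              # 0 = clickbait, 1 = not clickbait
print(new_headline(title, content))   # new headline, or "낚시성 기사가 아닙니다."
```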
model/final2.h5/config.json ADDED
@@ -0,0 +1,56 @@
{
  "_name_or_path": "/content/drive/My Drive/23 U 4-1/\ud14d\uc2a4\ud2b8\ub9c8\uc774\ub2dd/\uae30\ub9d0\ud504\ub85c\uc81d\ud2b8/final2.h5",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "author": "Heewon Jeon(madjakarta@gmail.com)",
  "bos_token_id": 1,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.1,
  "d_model": 768,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 1,
  "do_blenderbot_90_layernorm": false,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 1,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": false,
  "forced_eos_token_id": 1,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "kobart_version": 2.0,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 1026,
  "model_type": "bart",
  "normalize_before": false,
  "normalize_embedding": true,
  "num_hidden_layers": 6,
  "pad_token_id": 3,
  "scale_embedding": false,
  "static_position_embeddings": false,
  "tokenizer_class": "PreTrainedTokenizerFast",
  "torch_dtype": "float32",
  "transformers_version": "4.30.1",
  "use_cache": true,
  "vocab_size": 30000
}
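
Despite the `.h5` suffix, `model/final2.h5` is a regular Hugging Face checkpoint directory (this `config.json`, `generation_config.json`, and `pytorch_model.bin`), which is why `app.py` loads it with `from_pretrained`. A minimal loading sketch (assuming the repo root as working directory; the summary string is a placeholder):

```python
# Illustrative only: load the fine-tuned KoBART headline model the way app.py does.
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained("gogamza/kobart-summarization")
model = BartForConditionalGeneration.from_pretrained("./model/final2.h5")

ids = tokenizer.encode("<s>" + "요약문 예시" + "</s>", return_tensors="pt")
out = model.generate(ids, max_length=128, num_beams=10)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```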
model/final2.h5/generation_config.json ADDED
@@ -0,0 +1,9 @@
{
  "_from_model_config": true,
  "bos_token_id": 1,
  "decoder_start_token_id": 1,
  "eos_token_id": 1,
  "forced_eos_token_id": 1,
  "pad_token_id": 3,
  "transformers_version": "4.30.1"
}
model/final2.h5/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bf59473330d28a08bc91af6a2aadca7ffdfc67aabe5af8a0e337532744d491dd
size 495644701
model/onlineContrastive/1_Pooling/config.json ADDED
@@ -0,0 +1,7 @@
{
  "word_embedding_dimension": 768,
  "pooling_mode_cls_token": false,
  "pooling_mode_mean_tokens": true,
  "pooling_mode_max_tokens": false,
  "pooling_mode_mean_sqrt_len_tokens": false
}
model/onlineContrastive/README.md ADDED
@@ -0,0 +1,126 @@
---
pipeline_tag: sentence-similarity
tags:
- sentence-transformers
- feature-extraction
- sentence-similarity
- transformers

---

# {MODEL_NAME}

This is a [sentence-transformers](https://www.SBERT.net) model: it maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for tasks like clustering or semantic search.

<!--- Describe your model here -->

## Usage (Sentence-Transformers)

Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:

```
pip install -U sentence-transformers
```

Then you can use the model like this:

```python
from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted"]

model = SentenceTransformer('{MODEL_NAME}')
embeddings = model.encode(sentences)
print(embeddings)
```


## Usage (HuggingFace Transformers)
Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: first, you pass your input through the transformer model, then you apply the right pooling operation on top of the contextualized word embeddings.

```python
from transformers import AutoTokenizer, AutoModel
import torch


# Mean pooling - take the attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Sentences we want sentence embeddings for
sentences = ['This is an example sentence', 'Each sentence is converted']

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('{MODEL_NAME}')
model = AutoModel.from_pretrained('{MODEL_NAME}')

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

print("Sentence embeddings:")
print(sentence_embeddings)
```


## Evaluation Results

<!--- Describe how your model was evaluated -->

For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME})


## Training
The model was trained with the parameters:

**DataLoader**:

`torch.utils.data.dataloader.DataLoader` of length 1822 with parameters:
```
{'batch_size': 128, 'sampler': 'torch.utils.data.sampler.RandomSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
```

**Loss**:

`sentence_transformers.losses.OnlineContrastiveLoss.OnlineContrastiveLoss`

Parameters of the fit()-Method:
```
{
    "epochs": 5,
    "evaluation_steps": 182,
    "evaluator": "sentence_transformers.evaluation.BinaryClassificationEvaluator.BinaryClassificationEvaluator",
    "max_grad_norm": 1,
    "optimizer_class": "<class 'torch.optim.adamw.AdamW'>",
    "optimizer_params": {
        "lr": 2e-05
    },
    "scheduler": "WarmupLinear",
    "steps_per_epoch": null,
    "warmup_steps": 911,
    "weight_decay": 0.01
}
```

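The original training script is not part of this upload; as a rough reconstruction, the parameters above correspond to a sentence-transformers run along these lines (the pair data and paths are placeholders, and `klue/roberta-base` is taken from this model's `config.json`):

```python
# Sketch reconstructed from the hyperparameters above; dataset and paths are hypothetical.
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer("klue/roberta-base")

# Labeled sentence pairs (1 = matching title/body, 0 = mismatch) - placeholder example.
train_examples = [InputExample(texts=["제목 명사열", "본문 명사열"], label=1)]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=128)
train_loss = losses.OnlineContrastiveLoss(model)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=5,
    evaluation_steps=182,
    warmup_steps=911,
    optimizer_params={"lr": 2e-05},
    weight_decay=0.01,
    output_path="./model/onlineContrastive",
)
```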

## Full Model Architecture
```
SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': True}) with Transformer model: RobertaModel
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)
```

## Citing & Authors

<!--- Describe where people can find more information -->
model/onlineContrastive/config.json ADDED
@@ -0,0 +1,29 @@
{
  "_name_or_path": "klue/roberta-base",
  "architectures": [
    "RobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertTokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.29.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 32000
}
model/onlineContrastive/config_sentence_transformers.json ADDED
@@ -0,0 +1,7 @@
{
  "__version__": {
    "sentence_transformers": "2.2.2",
    "transformers": "4.29.2",
    "pytorch": "2.0.1+cu118"
  }
}
model/onlineContrastive/modules.json ADDED
@@ -0,0 +1,14 @@
[
  {
    "idx": 0,
    "name": "0",
    "path": "",
    "type": "sentence_transformers.models.Transformer"
  },
  {
    "idx": 1,
    "name": "1",
    "path": "1_Pooling",
    "type": "sentence_transformers.models.Pooling"
  }
]
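
`modules.json` tells sentence-transformers to stack a Transformer encoder (stored at the root of this model folder) with the mean-pooling module configured in `1_Pooling/`. Loading `SentenceTransformer('./model/onlineContrastive')`, as `app.py` does, rebuilds exactly this stack; an equivalent manual assembly would look roughly like the sketch below (illustrative only):

```python
# Illustrative manual assembly of the two modules listed above.
from sentence_transformers import SentenceTransformer, models

word_embedding_model = models.Transformer("./model/onlineContrastive", max_seq_length=128)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),  # 768, mean pooling per 1_Pooling/config.json
    pooling_mode_mean_tokens=True,
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
```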
model/onlineContrastive/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9f33199a31e10b0c6bf79b4b624ad62a9759e9684df10242be30e675f1c6967e
size 442543661
model/onlineContrastive/sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
{
  "max_seq_length": 128,
  "do_lower_case": true
}
model/onlineContrastive/special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
{
  "bos_token": "[CLS]",
  "cls_token": "[CLS]",
  "eos_token": "[SEP]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
model/onlineContrastive/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
model/onlineContrastive/tokenizer_config.json ADDED
@@ -0,0 +1,17 @@
{
  "bos_token": "[CLS]",
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_basic_tokenize": true,
  "do_lower_case": false,
  "eos_token": "[SEP]",
  "mask_token": "[MASK]",
  "model_max_length": 512,
  "never_split": null,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
}
model/onlineContrastive/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,11 @@
numpy
pandas
konlpy
sentence_transformers
transformers
pytorch_lightning==1.4.9
torchmetrics==0.6.0
torchtext==0.6.0
transformers[sentencepiece]
torch
tensorflow