README.md · guialfaro/korean-paraphrasing at 4ce55a4bac22fa0dd59c94d154954d532a02c31f

metadata

license: mit

A Korean BART model fine-tuned for a paraphrasing task. The dataset used is available under the "Files and versions" tab as `dataset.csv`.

import torch
from transformers import BartForConditionalGeneration, AutoTokenizer

# Usage example: load the model and tokenizer, then paraphrase one sentence.

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BartForConditionalGeneration.from_pretrained('guialfaro/korean-paraphrasing').to(device)
tokenizer = AutoTokenizer.from_pretrained('guialfaro/korean-paraphrasing')

sentence = "7층 방문을 위해 방문록 작성이 필요합니다."
# The input is wrapped in a "paraphrase: " prefix with a trailing space
# (presumably the prompt format used during fine-tuning -- confirm before changing).
text = f"paraphrase: {sentence} "

# Call the tokenizer directly instead of `batch_encode_plus`, and rely on
# `padding="max_length"` alone: the old `pad_to_max_length=True` flag is
# deprecated and was redundant with `padding`.
encoding = tokenizer(
    [text],
    max_length=256,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)

# Move the token ids and the attention mask to the same device as the model.
source_ids = encoding["input_ids"].to(device, dtype=torch.long)
source_mask = encoding["attention_mask"].to(device, dtype=torch.long)

# Beam search with a repetition penalty; the attention mask is passed
# explicitly because the input is padded to `max_length`.
generated_ids = model.generate(
    input_ids=source_ids,
    attention_mask=source_mask,
    max_length=150,
    num_beams=2,
    repetition_penalty=2.5,
    length_penalty=1.0,
    early_stopping=True,
)

# Decode every generated sequence back to text, dropping special tokens.
preds = tokenizer.batch_decode(
    generated_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)

print(f"Original Sentence :: {sentence}")
print(f"Paraphrased Sentence :: {preds[0]}")