guialfaro committed on
Commit
af58900
1 Parent(s): b8fd596

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +43 -0
README.md CHANGED
@@ -1,3 +1,46 @@
1
  ---
2
  license: mit
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: mit
3
  ---
4
+
5
+ [Korean BART](https://huggingface.co/hyunwoongko/kobart) model fine-tuned for the paraphrasing task.
6
+ The dataset used for fine-tuning can be found on the *Files and versions* tab under the name *dataset.csv*.
7
+
8
+ ```python
9
+
10
+ import torch
11
+ from transformers import BartForConditionalGeneration, AutoTokenizer
12
+
13
+ model = BartForConditionalGeneration.from_pretrained('guialfaro/korean-paraphrasing')
14
+ tokenizer = AutoTokenizer.from_pretrained('guialfaro/korean-paraphrasing')
15
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
16
+ model = model.to(device)
17
+
18
+ sentence = "7층 방문을 위해 방문록 작성이 필요합니다."
19
+ text = f"paraphrase: {sentence} "
20
+
21
+ encoding = tokenizer.batch_encode_plus(
22
+ [text],
23
+ max_length=256,
24
+ pad_to_max_length=True,
25
+ truncation=True,
26
+ padding="max_length",
27
+ return_tensors="pt",)
28
+
29
+ source_ids = encoding["input_ids"].to(device, dtype=torch.long)
30
+ source_mask = encoding["attention_mask"].to(device, dtype=torch.long)
31
+
32
+ generated_ids = model.generate(
33
+ input_ids=source_ids,
34
+ attention_mask=source_mask,
35
+ max_length=150,
36
+ num_beams=2,
37
+ repetition_penalty=2.5,
38
+ length_penalty=1.0,
39
+ early_stopping=True)
40
+
41
+ preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
42
+
43
+ print (f"Original Sentence :: {sentence}")
44
+ print (f"Paraphrased Sentences :: {preds[0]}")
45
+
46
+ ```