README new sections
Browse files
README.md
CHANGED
@@ -28,22 +28,22 @@ tokenizer = MBartTokenizer.from_pretrained(model_name)
|
|
28 |
model = MBartForConditionalGeneration.from_pretrained(model_name)
|
29 |
|
30 |
input_ids = tokenizer.prepare_seq2seq_batch(
|
31 |
-
[
|
32 |
-
src_lang="en_XX",
|
33 |
return_tensors="pt",
|
34 |
padding="max_length",
|
35 |
truncation=True,
|
36 |
max_length=600
|
37 |
-
)["input_ids"]
|
38 |
|
39 |
output_ids = model.generate(
|
40 |
-
input_ids=input_ids
|
41 |
max_length=162,
|
42 |
no_repeat_ngram_size=3,
|
43 |
num_beams=5,
|
44 |
-
top_k=0
|
45 |
-
decoder_start_token_id=tokenizer.lang_code_to_id["ru_RU"]
|
46 |
)[0]
|
|
|
47 |
summary = tokenizer.decode(output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
|
48 |
print(summary)
|
49 |
```
|
@@ -64,21 +64,92 @@ print(summary)
|
|
64 |
|
65 |
## Eval results
|
66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
|
68 |
### BibTeX entry and citation info
|
69 |
|
70 |
```bibtex
|
71 |
@InProceedings{10.1007/978-3-030-59082-6_9,
|
72 |
-
author="Gusev, Ilya",
|
73 |
-
editor="Filchenkov, Andrey
|
74 |
-
|
75 |
-
and
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
pages="122--134",
|
82 |
-
isbn="978-3-030-59082-6"
|
83 |
}
|
84 |
```
|
|
|
28 |
model = MBartForConditionalGeneration.from_pretrained(model_name)
|
29 |
|
30 |
input_ids = tokenizer.prepare_seq2seq_batch(
|
31 |
+
[article_text],
|
32 |
+
src_lang="en_XX", # fairseq training artifact
|
33 |
return_tensors="pt",
|
34 |
padding="max_length",
|
35 |
truncation=True,
|
36 |
max_length=600
|
37 |
+
)["input_ids"]
|
38 |
|
39 |
output_ids = model.generate(
|
40 |
+
input_ids=input_ids,
|
41 |
max_length=162,
|
42 |
no_repeat_ngram_size=3,
|
43 |
num_beams=5,
|
44 |
+
top_k=0
|
|
|
45 |
)[0]
|
46 |
+
|
47 |
summary = tokenizer.decode(output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
|
48 |
print(summary)
|
49 |
```
|
|
|
64 |
|
65 |
## Eval results
|
66 |
|
67 |
+
| Model | R-1-f | R-2-f | R-L-f | METEOR | BLEU |
|
68 |
+
|:--------------------------|:------|:------|:------|:-------|:-----|
|
69 |
+
| gazeta_mbart | 32.6 | 14.6 | 28.2 | 25.7 | 49.8 |
|
70 |
+
|
71 |
+
Predicting all summaries:
|
72 |
+
```python
|
73 |
+
import json
|
74 |
+
import torch
|
75 |
+
from transformers import MBartTokenizer, MBartForConditionalGeneration
|
76 |
+
|
77 |
+
|
78 |
+
def gen_batch(inputs, batch_size):
    """Yield consecutive slices of ``inputs`` with at most ``batch_size`` items.

    The last slice is shorter when ``len(inputs)`` is not a multiple of
    ``batch_size``. Nothing is yielded when ``inputs`` is empty.

    Args:
        inputs: A sliceable sequence (anything supporting ``len`` and slicing).
        batch_size: Maximum number of items per yielded slice; must be positive.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if batch_size <= 0:
        # A non-positive step would never move the start index forward,
        # so the loop would never terminate.
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    for batch_start in range(0, len(inputs), batch_size):
        yield inputs[batch_start: batch_start + batch_size]
|
83 |
+
|
84 |
+
|
85 |
+
def predict(
    model_name,
    test_file,
    predictions_file,
    targets_file,
    max_source_tokens_count=600,
    max_target_tokens_count=160,
    use_cuda=True,
    batch_size=4
):
    """Summarize every record of a JSON-lines file and save the results.

    Each non-blank line of ``test_file`` is parsed as a JSON object; its
    ``"text"`` field is the model input and its ``"summary"`` field is the
    reference. Generated summaries are printed as they are produced, then
    written to ``predictions_file``; references go to ``targets_file``.
    Both output files hold one record per line, in input order.

    Args:
        model_name: Name or path passed to ``from_pretrained`` for both the
            tokenizer and the model.
        test_file: Path to the JSON-lines input file.
        predictions_file: Output path for generated summaries.
        targets_file: Output path for reference summaries.
        max_source_tokens_count: Length every input is padded/truncated to.
        max_target_tokens_count: Target summary length; generation is run
            with ``max_length`` set to this value plus 2.
        use_cuda: Run on ``cuda:0`` when CUDA is available, otherwise on CPU.
        batch_size: Number of texts per generation call.

    Raises:
        KeyError: If a record lacks ``"text"`` or ``"summary"``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    inputs = []
    targets = []
    # Explicit UTF-8: the platform default encoding is not guaranteed to
    # decode non-ASCII text.
    with open(test_file, "r", encoding="utf-8") as r:
        for line in r:
            if not line.strip():
                # Tolerate blank lines, e.g. a trailing empty line.
                continue
            record = json.loads(line)
            inputs.append(record["text"])
            targets.append(record["summary"])

    tokenizer = MBartTokenizer.from_pretrained(model_name)
    # Fall back to the CPU instead of failing when CUDA is requested
    # (the default) but not available on this machine.
    if use_cuda and torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")
    model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)

    predictions = []
    for batch in gen_batch(inputs, batch_size):
        input_ids = tokenizer.prepare_seq2seq_batch(
            batch,
            src_lang="en_XX",  # fairseq training artifact
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=max_source_tokens_count
        )["input_ids"].to(device)
        output_ids = model.generate(
            input_ids=input_ids,
            # NOTE(review): the +2 presumably leaves room for special
            # tokens around the summary - confirm.
            max_length=max_target_tokens_count + 2,
            no_repeat_ngram_size=3,
            num_beams=5,
            top_k=0
        )
        summaries = tokenizer.batch_decode(
            output_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )
        for s in summaries:
            print(s)
        predictions.extend(summaries)

    _write_lines(predictions_file, predictions)
    _write_lines(targets_file, targets)


def _write_lines(path, texts):
    """Write each text in ``texts`` on its own line of the UTF-8 file ``path``."""
    with open(path, "w", encoding="utf-8") as w:
        for text in texts:
            # Replace embedded line breaks with spaces so that line i of the
            # predictions file always pairs with line i of the targets file.
            w.write(" ".join(text.strip().splitlines()) + "\n")
|
133 |
+
|
134 |
+
# Summarize gazeta_test.jsonl with the IlyaGusev/mbart_ru_sum_gazeta model,
# writing the outputs to predictions.txt and the references to targets.txt.
predict("IlyaGusev/mbart_ru_sum_gazeta", "gazeta_test.jsonl", "predictions.txt", "targets.txt")
|
135 |
+
```
|
136 |
+
|
137 |
+
Evaluation script: https://github.com/IlyaGusev/summarus/blob/master/evaluate.py
|
138 |
+
|
139 |
+
Flags passed to the evaluation script: `--language ru --tokenize-after --lower`
|
140 |
|
141 |
### BibTeX entry and citation info
|
142 |
|
143 |
```bibtex
|
144 |
@InProceedings{10.1007/978-3-030-59082-6_9,
|
145 |
+
author="Gusev, Ilya",
|
146 |
+
editor="Filchenkov, Andrey and Kauttonen, Janne and Pivovarova, Lidia",
|
147 |
+
title="Dataset for Automatic Summarization of Russian News",
|
148 |
+
booktitle="Artificial Intelligence and Natural Language",
|
149 |
+
year="2020",
|
150 |
+
publisher="Springer International Publishing",
|
151 |
+
address="Cham",
|
152 |
+
pages="122--134",
|
153 |
+
isbn="978-3-030-59082-6"
|
|
|
|
|
154 |
}
|
155 |
```
|