macedonizer
committed on
Commit
•
2953b64
1
Parent(s):
40a41ee
Update README.md
Browse files
README.md
CHANGED
@@ -1,18 +1,14 @@
|
|
1 |
---
|
2 |
language:
|
3 |
- mk
|
4 |
-
thumbnail: https://huggingface.co/macedonizer/
|
5 |
license: Apache 2.0
|
6 |
datasets:
|
7 |
- wiki-mk
|
8 |
-
- time-mk-news-2010-2015
|
9 |
---
|
10 |
|
11 |
-
#
|
12 |
-
|
13 |
-
Pretrained model on English language using a causal language modeling (CLM) objective. It was introduced in
|
14 |
-
[this paper](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)
|
15 |
-
and first released at [this page](https://openai.com/blog/better-language-models/).
|
16 |
|
17 |
## Model description
|
18 |
mk-gpt2 is a transformers model pretrained on a very large corpus of Macedonian data in a self-supervised fashion. This
|
@@ -32,35 +28,12 @@ Here is how to use this model to get the features of a given text in PyTorch:
|
|
32 |
import random
|
33 |
from transformers import AutoTokenizer, AutoModelWithLMHead
|
34 |
|
35 |
-
tokenizer = AutoTokenizer.from_pretrained('macedonizer/
|
36 |
-
model = AutoModelWithLMHead.from_pretrained('macedonizer/mk-gpt2')
|
37 |
|
38 |
-
input_text = '
|
39 |
|
40 |
-
if len(input_text) == 0:
|
41 |
-
encoded_input = tokenizer(input_text, return_tensors="pt") \
|
42 |
-
output = model.generate( \
|
43 |
-
bos_token_id=random.randint(1, 50000), \
|
44 |
-
do_sample=True, \
|
45 |
-
top_k=50, \
|
46 |
-
max_length=1024, \
|
47 |
-
top_p=0.95, \
|
48 |
-
num_return_sequences=1, \
|
49 |
-
) \
|
50 |
-
else: \
|
51 |
-
encoded_input = tokenizer(input_text, return_tensors="pt") \
|
52 |
-
output = model.generate( \
|
53 |
-
**encoded_input, \
|
54 |
-
bos_token_id=random.randint(1, 50000), \
|
55 |
-
do_sample=True, \
|
56 |
-
top_k=50, \
|
57 |
-
max_length=1024, \
|
58 |
-
top_p=0.95, \
|
59 |
-
num_return_sequences=1, \
|
60 |
-
)
|
61 |
|
62 |
-
decoded_output = []
|
63 |
-
for sample in output: \
|
64 |
-
decoded_output.append(tokenizer.decode(sample, skip_special_tokens=True))
|
65 |
|
66 |
print(decoded_output)
|
|
|
1 |
---
|
2 |
language:
|
3 |
- mk
|
4 |
+
thumbnail: https://huggingface.co/macedonizer/blaze-koneski/blaze-koneski.jpg
|
5 |
license: Apache 2.0
|
6 |
datasets:
|
7 |
- wiki-mk
|
|
|
8 |
---
|
9 |
|
10 |
+
# blaze-koneski
|
11 |
+
A GPT-2-type model. We fine-tuned macedonizer/mk-gpt-2 on Blaze Koneski's poetry.
|
|
|
|
|
|
|
12 |
|
13 |
## Model description
|
14 |
mk-gpt2 is a transformers model pretrained on a very large corpus of Macedonian data in a self-supervised fashion. This
|
|
|
28 |
import random
|
29 |
from transformers import AutoTokenizer, AutoModelWithLMHead
|
30 |
|
31 |
+
tokenizer = AutoTokenizer.from_pretrained('macedonizer/blaze-koneski') \\nmodel = AutoModelWithLMHead.from_pretrained('macedonizer/blaze-koneski')
|
|
|
32 |
|
33 |
+
input_text = 'Моска '
|
34 |
|
35 |
+
if len(input_text) == 0: \\n encoded_input = tokenizer(input_text, return_tensors="pt") \\n output = model.generate( \\n bos_token_id=random.randint(1, 50000), \\n do_sample=True, \\n top_k=50, \\n max_length=1024, \\n top_p=0.95, \\n num_return_sequences=1, \\n ) \\nelse: \\n encoded_input = tokenizer(input_text, return_tensors="pt") \\n output = model.generate( \\n **encoded_input, \\n bos_token_id=random.randint(1, 50000), \\n do_sample=True, \\n top_k=50, \\n max_length=1024, \\n top_p=0.95, \\n num_return_sequences=1, \\n )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
+
decoded_output = [] \\nfor sample in output: \\n decoded_output.append(tokenizer.decode(sample, skip_special_tokens=True))
|
|
|
|
|
38 |
|
39 |
print(decoded_output)
|