Łukasz Augustyniak
committed on
Commit
•
2c5669c
1
Parent(s):
6245c65
first version of clarin polish roberta based on kgr10 dataset
Browse files
- .idea/.gitignore +10 -0
- .idea/deployment.xml +112 -0
- .idea/inspectionProfiles/Project_Default.xml +88 -0
- .idea/inspectionProfiles/profiles_settings.xml +6 -0
- .idea/misc.xml +4 -0
- .idea/modules.xml +8 -0
- .idea/roberta-polish-v1.iml +8 -0
- .idea/vcs.xml +6 -0
- README.md +13 -0
- config.json +23 -0
- merges.txt +0 -0
- optimizer.pt +3 -0
- pytorch_model.bin +3 -0
- scheduler.pt +3 -0
- special_tokens_map.json +1 -0
- tokenizer_config.json +1 -0
- trainer_state.json +0 -0
- training_args.bin +3 -0
- vocab.json +0 -0
.idea/.gitignore
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Default ignored files
|
2 |
+
/shelf/
|
3 |
+
/workspace.xml
|
4 |
+
# Datasource local storage ignored files
|
5 |
+
/../../../../../:\github\clarin\roberta-polish-v1\.idea/dataSources/
|
6 |
+
/dataSources.local.xml
|
7 |
+
# Editor-based HTTP Client requests
|
8 |
+
/httpRequests/
|
9 |
+
# Zeppelin ignored files
|
10 |
+
/ZeppelinRemoteNotebooks/
|
.idea/deployment.xml
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="PublishConfigData" remoteFilesAllowedToDisappearOnAutoupload="false">
|
4 |
+
<serverData>
|
5 |
+
<paths name="4320">
|
6 |
+
<serverdata>
|
7 |
+
<mappings>
|
8 |
+
<mapping local="$PROJECT_DIR$" web="/" />
|
9 |
+
</mappings>
|
10 |
+
</serverdata>
|
11 |
+
</paths>
|
12 |
+
<paths name="aci-user@jenkins-devbox-feature-ACI-ML-4468.spokenlabs.pl:22">
|
13 |
+
<serverdata>
|
14 |
+
<mappings>
|
15 |
+
<mapping local="$PROJECT_DIR$" web="/" />
|
16 |
+
</mappings>
|
17 |
+
</serverdata>
|
18 |
+
</paths>
|
19 |
+
<paths name="aci-user@jenkins-devbox-feature-aci-ml-3506-test.spokenlabs.pl:22">
|
20 |
+
<serverdata>
|
21 |
+
<mappings>
|
22 |
+
<mapping local="$PROJECT_DIR$" web="/" />
|
23 |
+
</mappings>
|
24 |
+
</serverdata>
|
25 |
+
</paths>
|
26 |
+
<paths name="aci-user@jenkins-devbox-feature-aci-ml-3506.spokenlabs.pl:22">
|
27 |
+
<serverdata>
|
28 |
+
<mappings>
|
29 |
+
<mapping local="$PROJECT_DIR$" web="/" />
|
30 |
+
</mappings>
|
31 |
+
</serverdata>
|
32 |
+
</paths>
|
33 |
+
<paths name="aci-user@jenkins-devbox-feature-aci-ml-4235.spokenlabs.pl:22">
|
34 |
+
<serverdata>
|
35 |
+
<mappings>
|
36 |
+
<mapping local="$PROJECT_DIR$" web="/" />
|
37 |
+
</mappings>
|
38 |
+
</serverdata>
|
39 |
+
</paths>
|
40 |
+
<paths name="aci-user@jenkins-devbox-feature-aci-ml-4320.spokenlabs.pl:22">
|
41 |
+
<serverdata>
|
42 |
+
<mappings>
|
43 |
+
<mapping local="$PROJECT_DIR$" web="/" />
|
44 |
+
</mappings>
|
45 |
+
</serverdata>
|
46 |
+
</paths>
|
47 |
+
<paths name="aci-user@jenkins-devbox-feature-aci-ml-4329.spokenlabs.pl:22">
|
48 |
+
<serverdata>
|
49 |
+
<mappings>
|
50 |
+
<mapping local="$PROJECT_DIR$" web="/" />
|
51 |
+
</mappings>
|
52 |
+
</serverdata>
|
53 |
+
</paths>
|
54 |
+
<paths name="aci-user@jenkins-devbox-feature-laugustyniak-2.spokenlabs.pl:22">
|
55 |
+
<serverdata>
|
56 |
+
<mappings>
|
57 |
+
<mapping local="$PROJECT_DIR$" web="/" />
|
58 |
+
</mappings>
|
59 |
+
</serverdata>
|
60 |
+
</paths>
|
61 |
+
<paths name="aci-user@jenkins-devbox-feature-laugustyniak.spokenlabs.pl:22">
|
62 |
+
<serverdata>
|
63 |
+
<mappings>
|
64 |
+
<mapping local="$PROJECT_DIR$" web="/" />
|
65 |
+
</mappings>
|
66 |
+
</serverdata>
|
67 |
+
</paths>
|
68 |
+
<paths name="aci-user@jenkins-devbox-niedakh2.spokenlabs.pl:22">
|
69 |
+
<serverdata>
|
70 |
+
<mappings>
|
71 |
+
<mapping local="$PROJECT_DIR$" web="/" />
|
72 |
+
</mappings>
|
73 |
+
</serverdata>
|
74 |
+
</paths>
|
75 |
+
<paths name="aihub">
|
76 |
+
<serverdata>
|
77 |
+
<mappings>
|
78 |
+
<mapping local="$PROJECT_DIR$" web="/" />
|
79 |
+
</mappings>
|
80 |
+
</serverdata>
|
81 |
+
</paths>
|
82 |
+
<paths name="giustu@192.168.1.45:22">
|
83 |
+
<serverdata>
|
84 |
+
<mappings>
|
85 |
+
<mapping local="$PROJECT_DIR$" web="/" />
|
86 |
+
</mappings>
|
87 |
+
</serverdata>
|
88 |
+
</paths>
|
89 |
+
<paths name="laugustyniak@oxygen.engine.kdm.wcss.pl:22 key">
|
90 |
+
<serverdata>
|
91 |
+
<mappings>
|
92 |
+
<mapping local="$PROJECT_DIR$" web="/" />
|
93 |
+
</mappings>
|
94 |
+
</serverdata>
|
95 |
+
</paths>
|
96 |
+
<paths name="laugustyniak@thorium.engine.kdm.wcss.pl:22">
|
97 |
+
<serverdata>
|
98 |
+
<mappings>
|
99 |
+
<mapping local="$PROJECT_DIR$" web="/" />
|
100 |
+
</mappings>
|
101 |
+
</serverdata>
|
102 |
+
</paths>
|
103 |
+
<paths name="laugustyniak@thorium.engine.kdm.wcss.pl:22 key">
|
104 |
+
<serverdata>
|
105 |
+
<mappings>
|
106 |
+
<mapping local="$PROJECT_DIR$" web="/" />
|
107 |
+
</mappings>
|
108 |
+
</serverdata>
|
109 |
+
</paths>
|
110 |
+
</serverData>
|
111 |
+
</component>
|
112 |
+
</project>
|
.idea/inspectionProfiles/Project_Default.xml
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<component name="InspectionProjectProfileManager">
|
2 |
+
<profile version="1.0">
|
3 |
+
<option name="myName" value="Project Default" />
|
4 |
+
<inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
|
5 |
+
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
|
6 |
+
<option name="ignoredPackages">
|
7 |
+
<value>
|
8 |
+
<list size="52">
|
9 |
+
<item index="0" class="java.lang.String" itemvalue="scipy" />
|
10 |
+
<item index="1" class="java.lang.String" itemvalue="tornado" />
|
11 |
+
<item index="2" class="java.lang.String" itemvalue="joblib" />
|
12 |
+
<item index="3" class="java.lang.String" itemvalue="scikit-learn" />
|
13 |
+
<item index="4" class="java.lang.String" itemvalue="nltk" />
|
14 |
+
<item index="5" class="java.lang.String" itemvalue="more-itertools" />
|
15 |
+
<item index="6" class="java.lang.String" itemvalue="streamlit" />
|
16 |
+
<item index="7" class="java.lang.String" itemvalue="numpy" />
|
17 |
+
<item index="8" class="java.lang.String" itemvalue="requests" />
|
18 |
+
<item index="9" class="java.lang.String" itemvalue="srsly" />
|
19 |
+
<item index="10" class="java.lang.String" itemvalue="pandas" />
|
20 |
+
<item index="11" class="java.lang.String" itemvalue="tqdm" />
|
21 |
+
<item index="12" class="java.lang.String" itemvalue="fastapi" />
|
22 |
+
<item index="13" class="java.lang.String" itemvalue="spacy" />
|
23 |
+
<item index="14" class="java.lang.String" itemvalue="seaborn" />
|
24 |
+
<item index="15" class="java.lang.String" itemvalue="matplotlib" />
|
25 |
+
<item index="16" class="java.lang.String" itemvalue="bokeh" />
|
26 |
+
<item index="17" class="java.lang.String" itemvalue="uvicorn" />
|
27 |
+
<item index="18" class="java.lang.String" itemvalue="xlrd" />
|
28 |
+
<item index="19" class="java.lang.String" itemvalue="gensim" />
|
29 |
+
<item index="20" class="java.lang.String" itemvalue="protobuf" />
|
30 |
+
<item index="21" class="java.lang.String" itemvalue="py2cytoscape" />
|
31 |
+
<item index="22" class="java.lang.String" itemvalue="pydot" />
|
32 |
+
<item index="23" class="java.lang.String" itemvalue="jellyfish" />
|
33 |
+
<item index="24" class="java.lang.String" itemvalue="visJS2jupyter" />
|
34 |
+
<item index="25" class="java.lang.String" itemvalue="preshed" />
|
35 |
+
<item index="26" class="java.lang.String" itemvalue="ntwulf" />
|
36 |
+
<item index="27" class="java.lang.String" itemvalue="tensorflow" />
|
37 |
+
<item index="28" class="java.lang.String" itemvalue="jupyter" />
|
38 |
+
<item index="29" class="java.lang.String" itemvalue="pyvis" />
|
39 |
+
<item index="30" class="java.lang.String" itemvalue="summa" />
|
40 |
+
<item index="31" class="java.lang.String" itemvalue="sentencepiece" />
|
41 |
+
<item index="32" class="java.lang.String" itemvalue="smart-open" />
|
42 |
+
<item index="33" class="java.lang.String" itemvalue="Flask" />
|
43 |
+
<item index="34" class="java.lang.String" itemvalue="python-rake" />
|
44 |
+
<item index="35" class="java.lang.String" itemvalue="pytest" />
|
45 |
+
<item index="36" class="java.lang.String" itemvalue="backoff" />
|
46 |
+
<item index="37" class="java.lang.String" itemvalue="PyHamcrest" />
|
47 |
+
<item index="38" class="java.lang.String" itemvalue="plac" />
|
48 |
+
<item index="39" class="java.lang.String" itemvalue="plotly" />
|
49 |
+
<item index="40" class="java.lang.String" itemvalue="altair" />
|
50 |
+
<item index="41" class="java.lang.String" itemvalue="tensorboard" />
|
51 |
+
<item index="42" class="java.lang.String" itemvalue="toolz" />
|
52 |
+
<item index="43" class="java.lang.String" itemvalue="murmurhash" />
|
53 |
+
<item index="44" class="java.lang.String" itemvalue="textblob" />
|
54 |
+
<item index="45" class="java.lang.String" itemvalue="attrs" />
|
55 |
+
<item index="46" class="java.lang.String" itemvalue="transformers" />
|
56 |
+
<item index="47" class="java.lang.String" itemvalue="torch-1.6.0" />
|
57 |
+
<item index="48" class="java.lang.String" itemvalue="networkx" />
|
58 |
+
<item index="49" class="java.lang.String" itemvalue="mlflow" />
|
59 |
+
<item index="50" class="java.lang.String" itemvalue="pygraphviz" />
|
60 |
+
<item index="51" class="java.lang.String" itemvalue="wordcloud" />
|
61 |
+
</list>
|
62 |
+
</value>
|
63 |
+
</option>
|
64 |
+
</inspection_tool>
|
65 |
+
<inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
|
66 |
+
<option name="ignoredErrors">
|
67 |
+
<list>
|
68 |
+
<option value="E402" />
|
69 |
+
</list>
|
70 |
+
</option>
|
71 |
+
</inspection_tool>
|
72 |
+
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
|
73 |
+
<option name="ignoredErrors">
|
74 |
+
<list>
|
75 |
+
<option value="N801" />
|
76 |
+
</list>
|
77 |
+
</option>
|
78 |
+
</inspection_tool>
|
79 |
+
<inspection_tool class="PyStubPackagesAdvertiser" enabled="true" level="WARNING" enabled_by_default="true">
|
80 |
+
<option name="ignoredPackages">
|
81 |
+
<list>
|
82 |
+
<option value="pyspark-stubs==3.0.0.post1" />
|
83 |
+
</list>
|
84 |
+
</option>
|
85 |
+
</inspection_tool>
|
86 |
+
<inspection_tool class="TsLint" enabled="true" level="WARNING" enabled_by_default="true" />
|
87 |
+
</profile>
|
88 |
+
</component>
|
.idea/inspectionProfiles/profiles_settings.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<component name="InspectionProjectProfileManager">
|
2 |
+
<settings>
|
3 |
+
<option name="USE_PROJECT_PROFILE" value="false" />
|
4 |
+
<version value="1.0" />
|
5 |
+
</settings>
|
6 |
+
</component>
|
.idea/misc.xml
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
|
4 |
+
</project>
|
.idea/modules.xml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="ProjectModuleManager">
|
4 |
+
<modules>
|
5 |
+
<module fileurl="file://$PROJECT_DIR$/.idea/roberta-polish-v1.iml" filepath="$PROJECT_DIR$/.idea/roberta-polish-v1.iml" />
|
6 |
+
</modules>
|
7 |
+
</component>
|
8 |
+
</project>
|
.idea/roberta-polish-v1.iml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<module type="PYTHON_MODULE" version="4">
|
3 |
+
<component name="NewModuleRootManager">
|
4 |
+
<content url="file://$MODULE_DIR$" />
|
5 |
+
<orderEntry type="inheritedJdk" />
|
6 |
+
<orderEntry type="sourceFolder" forTests="false" />
|
7 |
+
</component>
|
8 |
+
</module>
|
.idea/vcs.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="VcsDirectoryMappings">
|
4 |
+
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
5 |
+
</component>
|
6 |
+
</project>
|
README.md
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Polish RoBERTa
|
2 |
+
|
3 |
+
The model was pre-trained on the KGR10 corpora.
|
4 |
+
|
5 |
+
More about the model at [CLARIN-dspace](https://huggingface.co/clarin/roberta-polish-v1)
|
6 |
+
|
7 |
+
## Usage
|
8 |
+
|
9 |
+
## Huggingface model hub
|
10 |
+
|
11 |
+
## Acknowledgments
|
12 |
+
|
13 |
+
[CLARIN-PL and CLARIN-BIZ project](https://clarin-pl.eu/)
|
config.json
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"RobertaForMaskedLM"
|
4 |
+
],
|
5 |
+
"attention_probs_dropout_prob": 0.1,
|
6 |
+
"bos_token_id": 0,
|
7 |
+
"eos_token_id": 2,
|
8 |
+
"gradient_checkpointing": false,
|
9 |
+
"hidden_act": "gelu",
|
10 |
+
"hidden_dropout_prob": 0.1,
|
11 |
+
"hidden_size": 768,
|
12 |
+
"initializer_range": 0.02,
|
13 |
+
"intermediate_size": 3072,
|
14 |
+
"layer_norm_eps": 1e-12,
|
15 |
+
"max_position_embeddings": 514,
|
16 |
+
"model_type": "roberta",
|
17 |
+
"num_attention_heads": 12,
|
18 |
+
"num_hidden_layers": 12,
|
19 |
+
"pad_token_id": 1,
|
20 |
+
"position_embedding_type": "relative_key_query",
|
21 |
+
"type_vocab_size": 1,
|
22 |
+
"vocab_size": 52000
|
23 |
+
}
|
merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e23207c71c1f2a059cd1a793100ad09635b346336df7c77bfd41aa71f4cb9350
|
3 |
+
size 1014694460
|
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c8b60322deba22185b84526317ee882cbebe8bcef59004247d8eb167c1777d23
|
3 |
+
size 507376066
|
scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5d0a696788deedd759a347b260c92c0c4099e4a6ec6763dce132057154354c93
|
3 |
+
size 623
|
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}}
|
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "errors": "replace", "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "max_len": 512, "special_tokens_map_file": null, "name_or_path": "/home/clarin/workspace/tokenizer"}
|
trainer_state.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:daabfc4379e9b766a7342f91f3d30f870a9eaa6fa302fd58371e3465f3cdea3f
|
3 |
+
size 1903
|
vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|