Łukasz Augustyniak committed on
Commit
2c5669c
1 Parent(s): 6245c65

First version of CLARIN Polish RoBERTa based on the KGR10 dataset

Browse files
.idea/.gitignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
4
+ # Datasource local storage ignored files
5
+ /../../../../../:\github\clarin\roberta-polish-v1\.idea/dataSources/
6
+ /dataSources.local.xml
7
+ # Editor-based HTTP Client requests
8
+ /httpRequests/
9
+ # Zeppelin ignored files
10
+ /ZeppelinRemoteNotebooks/
.idea/deployment.xml ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="PublishConfigData" remoteFilesAllowedToDisappearOnAutoupload="false">
4
+ <serverData>
5
+ <paths name="4320">
6
+ <serverdata>
7
+ <mappings>
8
+ <mapping local="$PROJECT_DIR$" web="/" />
9
+ </mappings>
10
+ </serverdata>
11
+ </paths>
12
+ <paths name="aci-user@jenkins-devbox-feature-ACI-ML-4468.spokenlabs.pl:22">
13
+ <serverdata>
14
+ <mappings>
15
+ <mapping local="$PROJECT_DIR$" web="/" />
16
+ </mappings>
17
+ </serverdata>
18
+ </paths>
19
+ <paths name="aci-user@jenkins-devbox-feature-aci-ml-3506-test.spokenlabs.pl:22">
20
+ <serverdata>
21
+ <mappings>
22
+ <mapping local="$PROJECT_DIR$" web="/" />
23
+ </mappings>
24
+ </serverdata>
25
+ </paths>
26
+ <paths name="aci-user@jenkins-devbox-feature-aci-ml-3506.spokenlabs.pl:22">
27
+ <serverdata>
28
+ <mappings>
29
+ <mapping local="$PROJECT_DIR$" web="/" />
30
+ </mappings>
31
+ </serverdata>
32
+ </paths>
33
+ <paths name="aci-user@jenkins-devbox-feature-aci-ml-4235.spokenlabs.pl:22">
34
+ <serverdata>
35
+ <mappings>
36
+ <mapping local="$PROJECT_DIR$" web="/" />
37
+ </mappings>
38
+ </serverdata>
39
+ </paths>
40
+ <paths name="aci-user@jenkins-devbox-feature-aci-ml-4320.spokenlabs.pl:22">
41
+ <serverdata>
42
+ <mappings>
43
+ <mapping local="$PROJECT_DIR$" web="/" />
44
+ </mappings>
45
+ </serverdata>
46
+ </paths>
47
+ <paths name="aci-user@jenkins-devbox-feature-aci-ml-4329.spokenlabs.pl:22">
48
+ <serverdata>
49
+ <mappings>
50
+ <mapping local="$PROJECT_DIR$" web="/" />
51
+ </mappings>
52
+ </serverdata>
53
+ </paths>
54
+ <paths name="aci-user@jenkins-devbox-feature-laugustyniak-2.spokenlabs.pl:22">
55
+ <serverdata>
56
+ <mappings>
57
+ <mapping local="$PROJECT_DIR$" web="/" />
58
+ </mappings>
59
+ </serverdata>
60
+ </paths>
61
+ <paths name="aci-user@jenkins-devbox-feature-laugustyniak.spokenlabs.pl:22">
62
+ <serverdata>
63
+ <mappings>
64
+ <mapping local="$PROJECT_DIR$" web="/" />
65
+ </mappings>
66
+ </serverdata>
67
+ </paths>
68
+ <paths name="aci-user@jenkins-devbox-niedakh2.spokenlabs.pl:22">
69
+ <serverdata>
70
+ <mappings>
71
+ <mapping local="$PROJECT_DIR$" web="/" />
72
+ </mappings>
73
+ </serverdata>
74
+ </paths>
75
+ <paths name="aihub">
76
+ <serverdata>
77
+ <mappings>
78
+ <mapping local="$PROJECT_DIR$" web="/" />
79
+ </mappings>
80
+ </serverdata>
81
+ </paths>
82
+ <paths name="giustu@192.168.1.45:22">
83
+ <serverdata>
84
+ <mappings>
85
+ <mapping local="$PROJECT_DIR$" web="/" />
86
+ </mappings>
87
+ </serverdata>
88
+ </paths>
89
+ <paths name="laugustyniak@oxygen.engine.kdm.wcss.pl:22 key">
90
+ <serverdata>
91
+ <mappings>
92
+ <mapping local="$PROJECT_DIR$" web="/" />
93
+ </mappings>
94
+ </serverdata>
95
+ </paths>
96
+ <paths name="laugustyniak@thorium.engine.kdm.wcss.pl:22">
97
+ <serverdata>
98
+ <mappings>
99
+ <mapping local="$PROJECT_DIR$" web="/" />
100
+ </mappings>
101
+ </serverdata>
102
+ </paths>
103
+ <paths name="laugustyniak@thorium.engine.kdm.wcss.pl:22 key">
104
+ <serverdata>
105
+ <mappings>
106
+ <mapping local="$PROJECT_DIR$" web="/" />
107
+ </mappings>
108
+ </serverdata>
109
+ </paths>
110
+ </serverData>
111
+ </component>
112
+ </project>
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
5
+ <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
6
+ <option name="ignoredPackages">
7
+ <value>
8
+ <list size="52">
9
+ <item index="0" class="java.lang.String" itemvalue="scipy" />
10
+ <item index="1" class="java.lang.String" itemvalue="tornado" />
11
+ <item index="2" class="java.lang.String" itemvalue="joblib" />
12
+ <item index="3" class="java.lang.String" itemvalue="scikit-learn" />
13
+ <item index="4" class="java.lang.String" itemvalue="nltk" />
14
+ <item index="5" class="java.lang.String" itemvalue="more-itertools" />
15
+ <item index="6" class="java.lang.String" itemvalue="streamlit" />
16
+ <item index="7" class="java.lang.String" itemvalue="numpy" />
17
+ <item index="8" class="java.lang.String" itemvalue="requests" />
18
+ <item index="9" class="java.lang.String" itemvalue="srsly" />
19
+ <item index="10" class="java.lang.String" itemvalue="pandas" />
20
+ <item index="11" class="java.lang.String" itemvalue="tqdm" />
21
+ <item index="12" class="java.lang.String" itemvalue="fastapi" />
22
+ <item index="13" class="java.lang.String" itemvalue="spacy" />
23
+ <item index="14" class="java.lang.String" itemvalue="seaborn" />
24
+ <item index="15" class="java.lang.String" itemvalue="matplotlib" />
25
+ <item index="16" class="java.lang.String" itemvalue="bokeh" />
26
+ <item index="17" class="java.lang.String" itemvalue="uvicorn" />
27
+ <item index="18" class="java.lang.String" itemvalue="xlrd" />
28
+ <item index="19" class="java.lang.String" itemvalue="gensim" />
29
+ <item index="20" class="java.lang.String" itemvalue="protobuf" />
30
+ <item index="21" class="java.lang.String" itemvalue="py2cytoscape" />
31
+ <item index="22" class="java.lang.String" itemvalue="pydot" />
32
+ <item index="23" class="java.lang.String" itemvalue="jellyfish" />
33
+ <item index="24" class="java.lang.String" itemvalue="visJS2jupyter" />
34
+ <item index="25" class="java.lang.String" itemvalue="preshed" />
35
+ <item index="26" class="java.lang.String" itemvalue="ntwulf" />
36
+ <item index="27" class="java.lang.String" itemvalue="tensorflow" />
37
+ <item index="28" class="java.lang.String" itemvalue="jupyter" />
38
+ <item index="29" class="java.lang.String" itemvalue="pyvis" />
39
+ <item index="30" class="java.lang.String" itemvalue="summa" />
40
+ <item index="31" class="java.lang.String" itemvalue="sentencepiece" />
41
+ <item index="32" class="java.lang.String" itemvalue="smart-open" />
42
+ <item index="33" class="java.lang.String" itemvalue="Flask" />
43
+ <item index="34" class="java.lang.String" itemvalue="python-rake" />
44
+ <item index="35" class="java.lang.String" itemvalue="pytest" />
45
+ <item index="36" class="java.lang.String" itemvalue="backoff" />
46
+ <item index="37" class="java.lang.String" itemvalue="PyHamcrest" />
47
+ <item index="38" class="java.lang.String" itemvalue="plac" />
48
+ <item index="39" class="java.lang.String" itemvalue="plotly" />
49
+ <item index="40" class="java.lang.String" itemvalue="altair" />
50
+ <item index="41" class="java.lang.String" itemvalue="tensorboard" />
51
+ <item index="42" class="java.lang.String" itemvalue="toolz" />
52
+ <item index="43" class="java.lang.String" itemvalue="murmurhash" />
53
+ <item index="44" class="java.lang.String" itemvalue="textblob" />
54
+ <item index="45" class="java.lang.String" itemvalue="attrs" />
55
+ <item index="46" class="java.lang.String" itemvalue="transformers" />
56
+ <item index="47" class="java.lang.String" itemvalue="torch-1.6.0" />
57
+ <item index="48" class="java.lang.String" itemvalue="networkx" />
58
+ <item index="49" class="java.lang.String" itemvalue="mlflow" />
59
+ <item index="50" class="java.lang.String" itemvalue="pygraphviz" />
60
+ <item index="51" class="java.lang.String" itemvalue="wordcloud" />
61
+ </list>
62
+ </value>
63
+ </option>
64
+ </inspection_tool>
65
+ <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
66
+ <option name="ignoredErrors">
67
+ <list>
68
+ <option value="E402" />
69
+ </list>
70
+ </option>
71
+ </inspection_tool>
72
+ <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
73
+ <option name="ignoredErrors">
74
+ <list>
75
+ <option value="N801" />
76
+ </list>
77
+ </option>
78
+ </inspection_tool>
79
+ <inspection_tool class="PyStubPackagesAdvertiser" enabled="true" level="WARNING" enabled_by_default="true">
80
+ <option name="ignoredPackages">
81
+ <list>
82
+ <option value="pyspark-stubs==3.0.0.post1" />
83
+ </list>
84
+ </option>
85
+ </inspection_tool>
86
+ <inspection_tool class="TsLint" enabled="true" level="WARNING" enabled_by_default="true" />
87
+ </profile>
88
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
4
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/roberta-polish-v1.iml" filepath="$PROJECT_DIR$/.idea/roberta-polish-v1.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/roberta-polish-v1.iml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$" />
5
+ <orderEntry type="inheritedJdk" />
6
+ <orderEntry type="sourceFolder" forTests="false" />
7
+ </component>
8
+ </module>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="$PROJECT_DIR$" vcs="Git" />
5
+ </component>
6
+ </project>
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Polish RoBERTa
2
+
3
+ The model was pre-trained on the KGR10 corpora.
4
+
5
+ More about the model at [CLARIN-dspace](https://huggingface.co/clarin/roberta-polish-v1)
6
+
7
+ ## Usage
8
+
9
+ ## Huggingface model hub
10
+
11
+ ## Acknowledgments
12
+
13
+ [CLARIN-PL and CLARIN-BIZ project](https://clarin-pl.eu/)
config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "relative_key_query",
21
+ "type_vocab_size": 1,
22
+ "vocab_size": 52000
23
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e23207c71c1f2a059cd1a793100ad09635b346336df7c77bfd41aa71f4cb9350
3
+ size 1014694460
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8b60322deba22185b84526317ee882cbebe8bcef59004247d8eb167c1777d23
3
+ size 507376066
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d0a696788deedd759a347b260c92c0c4099e4a6ec6763dce132057154354c93
3
+ size 623
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "errors": "replace", "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "max_len": 512, "special_tokens_map_file": null, "name_or_path": "/home/clarin/workspace/tokenizer"}
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:daabfc4379e9b766a7342f91f3d30f870a9eaa6fa302fd58371e3465f3cdea3f
3
+ size 1903
vocab.json ADDED
The diff for this file is too large to render. See raw diff