First commit
- 1_Pooling/config.json +7 -0
- config.json +47 -0
- config_sentence_transformers.json +7 -0
- eval/similarity_evaluation_sts-dev_results.csv +12 -0
- get-pip.py +0 -0
- modules.json +14 -0
- pytorch_model.bin +3 -0
- sentence_bert_config.json +4 -0
- similarity.py +26 -0
- similarity_evaluation_sts-test_results.csv +4 -0
- special_tokens_map.json +7 -0
- tokenizer.json +0 -0
- tokenizer_config.json +16 -0
- vocab.txt +0 -0
1_Pooling/config.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "word_embedding_dimension": 768,
+  "pooling_mode_cls_token": false,
+  "pooling_mode_mean_tokens": true,
+  "pooling_mode_max_tokens": false,
+  "pooling_mode_mean_sqrt_len_tokens": false
+}
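The pooling config above enables mean pooling over 768-dimensional token embeddings (CLS, max, and sqrt-length pooling are disabled). A minimal sketch of what that mean pooling does, using transformers directly; it assumes the committed files sit in the current working directory and the input sentence is illustrative:

import torch
from transformers import AutoModel, AutoTokenizer

# Assumption: the files from this commit live in the current directory.
tokenizer = AutoTokenizer.from_pretrained(".")
encoder = AutoModel.from_pretrained(".")

encoded = tokenizer(["contoh kalimat"], padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    token_embeddings = encoder(**encoded).last_hidden_state  # shape (batch, seq_len, 768)

# Mean pooling: average the token embeddings, ignoring padding positions.
mask = encoded["attention_mask"].unsqueeze(-1).float()
sentence_embedding = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1)
print(sentence_embedding.shape)  # torch.Size([1, 768])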
config.json
ADDED
@@ -0,0 +1,47 @@
+{
+  "_name_or_path": "indobenchmark/indobert-base-p2",
+  "_num_labels": 5,
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "directionality": "bidi",
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2",
+    "3": "LABEL_3",
+    "4": "LABEL_4"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_2": 2,
+    "LABEL_3": 3,
+    "LABEL_4": 4
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "output_past": true,
+  "pad_token_id": 0,
+  "pooler_fc_size": 768,
+  "pooler_num_attention_heads": 12,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.25.1",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 50000
+}
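config.json describes the underlying IndoBERT encoder: a BertModel with 12 layers, 12 attention heads, hidden size 768, and a 50,000-token vocabulary. A small sketch for inspecting it with transformers, again assuming the repository root as the path:

from transformers import AutoConfig, AutoModel

# Assumption: run from the repository root that contains config.json.
config = AutoConfig.from_pretrained(".")
print(config.model_type, config.num_hidden_layers, config.hidden_size, config.vocab_size)

# Load the bare encoder (no sentence-transformers pooling on top yet).
encoder = AutoModel.from_pretrained(".")
print(sum(p.numel() for p in encoder.parameters()))  # roughly 124M float32 parameters (~498 MB)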
config_sentence_transformers.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "__version__": {
+    "sentence_transformers": "2.2.2",
+    "transformers": "4.25.1",
+    "pytorch": "1.13.1+cu116"
+  }
+}
eval/similarity_evaluation_sts-dev_results.csv
ADDED
@@ -0,0 +1,12 @@
+epoch,steps,cosine_pearson,cosine_spearman,euclidean_pearson,euclidean_spearman,manhattan_pearson,manhattan_spearman,dot_pearson,dot_spearman
+0,64,0.01889159788729281,0.016624999289029295,0.03469793378231501,0.017171827743118543,0.036318400873366054,0.018583970685317548,0.004526911551087426,0.0032431555058285238
+0,128,0.022875577846926422,0.02174170637990998,0.03954375146428044,0.022328548982320333,0.04091356539768457,0.023862140128058987,0.009440222942678963,0.009304684394661674
+0,192,0.01387889539267916,0.011667832952411703,0.032260116008321994,0.014945492615462254,0.033935165768639024,0.01645172093143081,-0.002182766960806109,-0.0017655002863984455
+0,256,0.01828523647538354,0.016341884358734068,0.036781238263772824,0.02040151949756355,0.03792863232775403,0.021215023956803376,0.00035982769085127144,-5.889061530307879e-05
+0,320,0.012641365631522607,0.011684845924611569,0.032758269647456324,0.01603228731714641,0.03382800484767004,0.016835889943854928,-0.0026152727867795895,-0.0012867775061407426
+0,384,0.014237791942630909,0.011397548550683043,0.032379164868574596,0.015113215221115445,0.03339659541400877,0.01596477329797923,-1.451163034025292e-05,0.0001286957913938935
+0,448,0.013316602487679833,0.010643519895732825,0.031116715761429405,0.014039553163540055,0.032519563320689275,0.01518211692168955,-0.0006667334600426175,-0.00047201737871873685
+0,512,0.012129810457018518,0.00915920347695669,0.030580314866465282,0.013329280142078249,0.031828637208115206,0.01423172931123735,-0.0019343597440829874,-0.002409726630392591
+0,576,0.012643408389483751,0.0099651818342994,0.03083155144046526,0.013689347205845947,0.03205055544312211,0.014569118519958601,-0.0014216084984591919,-0.0022294500998564252
+0,640,0.012239800823974158,0.009318939736809147,0.030432664638052977,0.013133601134857677,0.03166092534492111,0.014084817348849227,-0.001762226715672962,-0.0025439156403049275
+0,-1,0.012225819579360945,0.009298663370442524,0.030418787308967545,0.013091582733741967,0.031646576568198,0.014064586263107763,-0.0017720743898674263,-0.0025531051846128
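The columns report Pearson and Spearman correlations between the model's similarity scores and gold scores under cosine, Euclidean, Manhattan, and dot-product measures, logged every 64 training steps (steps = -1 marks the end of the epoch). A sketch of how sentence-transformers' EmbeddingSimilarityEvaluator produces a file like this; the sentence pairs and gold scores below are placeholders, not the actual evaluation data:

import os
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

model = SentenceTransformer(".")  # assumption: model files in the current directory

# Placeholder STS-style data: sentence pairs with gold similarity scores in [0, 1].
sentences1 = ["Seorang pria sedang makan", "Anak-anak bermain bola"]
sentences2 = ["Seseorang sedang makan nasi", "Dua ekor kucing tidur"]
gold_scores = [0.8, 0.1]

evaluator = EmbeddingSimilarityEvaluator(sentences1, sentences2, gold_scores, name="sts-dev")

# Writes eval/similarity_evaluation_sts-dev_results.csv and returns the main score.
os.makedirs("eval", exist_ok=True)
print(evaluator(model, output_path="eval"))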
get-pip.py
ADDED
The diff for this file is too large to render. See raw diff.
modules.json
ADDED
@@ -0,0 +1,14 @@
+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.models.Pooling"
+  }
+]
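modules.json wires the pipeline together: module 0 is the Transformer encoder at the repository root and module 1 is the mean-pooling layer configured in 1_Pooling/. A sketch of how an equivalent model could be assembled from scratch with sentence-transformers; the base checkpoint name and max_seq_length are taken from the configs in this commit:

from sentence_transformers import SentenceTransformer, models

# Module 0: the transformer encoder (see config.json and sentence_bert_config.json).
word_embedding_model = models.Transformer("indobenchmark/indobert-base-p2", max_seq_length=75)

# Module 1: mean pooling over token embeddings (see 1_Pooling/config.json).
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
print(model)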
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad1c6cf505822592f24f9736e75c48a0323cbc1d2233aa154dbf7380ce5c8f98
+size 497836589
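pytorch_model.bin is stored through Git LFS, so the committed file is only a pointer recording the SHA-256 and size (about 498 MB) of the real weights. A small sketch for checking a downloaded copy against that pointer, assuming the weights have already been pulled into the current directory:

import hashlib
import os

expected_oid = "ad1c6cf505822592f24f9736e75c48a0323cbc1d2233aa154dbf7380ce5c8f98"
expected_size = 497836589

# Assumption: the real weights have been fetched (e.g. via Git LFS) as pytorch_model.bin.
sha256 = hashlib.sha256()
with open("pytorch_model.bin", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha256.update(chunk)

print(os.path.getsize("pytorch_model.bin") == expected_size)
print(sha256.hexdigest() == expected_oid)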
sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
+{
+  "max_seq_length": 75,
+  "do_lower_case": false
+}
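sentence_bert_config.json caps inputs at 75 tokens and leaves lower-casing to the tokenizer. The limit is exposed, and adjustable, on the loaded model; a short sketch, assuming the files are in the current directory:

from sentence_transformers import SentenceTransformer

model = SentenceTransformer(".")   # assumption: repository root
print(model.max_seq_length)        # 75, from sentence_bert_config.json
model.max_seq_length = 128         # may be raised up to the encoder's 512-position limit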
similarity.py
ADDED
@@ -0,0 +1,26 @@
+from sentence_transformers import SentenceTransformer, util
+
+# Path to the directory that contains the model files
+# (pytorch_model.bin, config.json, modules.json, tokenizer.json, ...)
+model_path = "."
+
+# Load the SentenceTransformer model; the tokenizer is read
+# automatically from the same directory, so it does not need
+# to be loaded separately
+model = SentenceTransformer(model_path)
+
+# Your sentences
+sentences = ["This is an example sentence", "Each sentence is converted"]
+
+# Encode the sentences into embeddings
+# (tokenization and mean pooling happen inside encode)
+embeddings = model.encode(sentences, convert_to_tensor=True)
+
+# Compute the cosine similarity between the two sentence embeddings
+similarity = util.cos_sim(embeddings[0], embeddings[1])
+
+# Print the embeddings
+print(embeddings)
+
+# Print the similarity score
+print(similarity)
similarity_evaluation_sts-test_results.csv
ADDED
@@ -0,0 +1,4 @@
+epoch,steps,cosine_pearson,cosine_spearman,euclidean_pearson,euclidean_spearman,manhattan_pearson,manhattan_spearman,dot_pearson,dot_spearman
+-1,-1,0.012225819579360945,0.009298663370442524,0.030418787308967545,0.013091582733741967,0.031646576568198,0.014064586263107763,-0.0017720743898674263,-0.0025531051846128
+-1,-1,0.012225819579360945,0.009298663370442524,0.030418787308967545,0.013091582733741967,0.031646576568198,0.014064586263107763,-0.0017720743898674263,-0.0025531051846128
+-1,-1,0.022875577846926422,0.02174170637990998,0.03954375146428044,0.022328548982320333,0.04091356539768457,0.023862140128058987,0.009440222942678963,0.009304684394661674
special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}
tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json
ADDED
@@ -0,0 +1,16 @@
+{
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "name_or_path": "indobenchmark/indobert-base-p2",
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "special_tokens_map_file": "/root/.cache/huggingface/hub/models--indobenchmark--indobert-base-p2/snapshots/94b4e0a82081fa57f227fcc2024d1ea89b57ac1f/special_tokens_map.json",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}
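tokenizer_config.json configures a lower-casing BertTokenizer over the 50,000-entry vocab.txt, inherited from indobenchmark/indobert-base-p2. A sketch for loading and inspecting it with transformers, assuming the repository root as the path and an illustrative input sentence:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(".")  # assumption: repository root
encoded = tokenizer("Ini adalah contoh kalimat", return_tensors="pt")
print(tokenizer.convert_ids_to_tokens(encoded["input_ids"][0]))
print(encoded["input_ids"].shape)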
vocab.txt
ADDED
The diff for this file is too large to render. See raw diff.