m3hrdadfi committed
Commit eb3d3e8
1 Parent(s): 4424aa1
1_Pooling/config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "word_embedding_dimension": 768,
+   "pooling_mode_cls_token": false,
+   "pooling_mode_mean_tokens": true,
+   "pooling_mode_max_tokens": false,
+   "pooling_mode_mean_sqrt_len_tokens": false
+ }
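
An added aside (not part of the commit): this configuration enables mean pooling only, so a sentence embedding is the attention-masked average of the token embeddings. A minimal sketch of constructing the equivalent module directly:

```python
from sentence_transformers.models import Pooling

# Mean-pooling module matching 1_Pooling/config.json
pooling = Pooling(word_embedding_dimension=768, pooling_mode_mean_tokens=True)
```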
README.md ADDED
@@ -0,0 +1,129 @@
+ ---
+ pipeline_tag: sentence-similarity
+ tags:
+ - sentence-transformers
+ - feature-extraction
+ - sentence-similarity
+ - transformers
+
+ ---
+
+ # {MODEL_NAME}
+
+ This is a [sentence-transformers](https://www.SBERT.net) model: it maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for tasks like clustering or semantic search.
+
+ <!--- Describe your model here -->
+
+ ## Usage (Sentence-Transformers)
+
+ Using this model is easy once you have [sentence-transformers](https://www.SBERT.net) installed:
+
+ ```
+ pip install -U sentence-transformers
+ ```
+
+ Then you can use the model like this:
+
+ ```python
+ from sentence_transformers import SentenceTransformer
+
+ sentences = ["This is an example sentence", "Each sentence is converted"]
+
+ model = SentenceTransformer('{MODEL_NAME}')
+ embeddings = model.encode(sentences)
+ print(embeddings)
+ ```
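+
+ As a quick follow-up (an added sketch, not part of the original card): the returned embeddings can be scored with cosine similarity via the `util` module that ships with sentence-transformers:
+
+ ```python
+ from sentence_transformers import util
+
+ # Cosine similarity between the two example sentences above
+ # (reuses `embeddings` from the previous snippet).
+ score = util.cos_sim(embeddings[0], embeddings[1])
+ print(score)
+ ```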
+
+
+ ## Usage (HuggingFace Transformers)
+
+ Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: first, you pass your input through the transformer model, then you have to apply the right pooling operation on top of the contextualized word embeddings.
+
+ ```python
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+
+
+ # Mean pooling - take the attention mask into account for correct averaging
+ def mean_pooling(model_output, attention_mask):
+     token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
+     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+
+ # Sentences we want sentence embeddings for
+ sentences = ['This is an example sentence', 'Each sentence is converted']
+
+ # Load model from HuggingFace Hub
+ tokenizer = AutoTokenizer.from_pretrained('{MODEL_NAME}')
+ model = AutoModel.from_pretrained('{MODEL_NAME}')
+
+ # Tokenize sentences
+ encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+
+ # Compute token embeddings
+ with torch.no_grad():
+     model_output = model(**encoded_input)
+
+ # Perform pooling. In this case, mean pooling.
+ sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+
+ print("Sentence embeddings:")
+ print(sentence_embeddings)
+ ```
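+
+ One optional addition (a sketch, not in the original card): because this model is scored with cosine similarity, it can be convenient to L2-normalize the pooled embeddings so that plain dot products between them are cosine similarities:
+
+ ```python
+ import torch.nn.functional as F
+
+ # L2-normalize so a dot product equals cosine similarity
+ # (reuses `sentence_embeddings` from the snippet above).
+ sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+ print(sentence_embeddings @ sentence_embeddings.T)
+ ```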
+
+
+ ## Evaluation Results
+
+ <!--- Describe how your model was evaluated -->
+
+ For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME})
+
+
+ ## Training
+
+ The model was trained with the parameters:
+
+ **DataLoader**:
+
+ `torch.utils.data.dataloader.DataLoader` of length 13069 with parameters:
+
+ ```
+ {'batch_size': 32, 'sampler': 'torch.utils.data.sampler.RandomSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
+ ```
+
+ **Loss**:
+
+ `sentence_transformers.losses.ContrastiveLoss.ContrastiveLoss` with parameters:
+
+ ```
+ {'distance_metric': 'SiameseDistanceMetric.COSINE_DISTANCE', 'margin': 0.5, 'size_average': True}
+ ```
+
+ Parameters of the fit() method:
+
+ ```
+ {
+     "epochs": 1,
+     "evaluation_steps": 1000,
+     "evaluator": "sentence_transformers.evaluation.BinaryClassificationEvaluator.BinaryClassificationEvaluator",
+     "max_grad_norm": 1,
+     "optimizer_class": "<class 'torch.optim.adamw.AdamW'>",
+     "optimizer_params": {
+         "lr": 2e-05
+     },
+     "scheduler": "WarmupLinear",
+     "steps_per_epoch": null,
+     "warmup_steps": 100,
+     "weight_decay": 0.01
+ }
+ ```
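+
+ Taken together (an added sketch under the parameters above, not the author's actual training script), the listed DataLoader, loss, and fit() settings correspond to a sentence-transformers training loop like this, with hypothetical labeled pairs standing in for the real dataset:
+
+ ```python
+ from torch.utils.data import DataLoader
+ from sentence_transformers import InputExample, SentenceTransformer, losses
+
+ # Base checkpoint as recorded in this repository's config.json
+ model = SentenceTransformer('pritamdeka/S-PubMedBert-MS-MARCO')
+
+ # Hypothetical labeled pairs: label 1 = similar, 0 = dissimilar
+ train_examples = [
+     InputExample(texts=['Gene knockout from cell', 'CRISPR-mediated gene deletion'], label=1),
+     InputExample(texts=['Gene knockout from cell', 'An unrelated sentence'], label=0),
+ ]
+
+ train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
+ train_loss = losses.ContrastiveLoss(model=model, margin=0.5)  # cosine distance by default
+
+ model.fit(
+     train_objectives=[(train_dataloader, train_loss)],
+     epochs=1,
+     warmup_steps=100,
+     optimizer_params={'lr': 2e-05},
+     weight_decay=0.01,  # evaluator / evaluation_steps omitted in this sketch
+ )
+ ```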
+
+
+ ## Full Model Architecture
+
+ ```
+ SentenceTransformer(
+   (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel
+   (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
+ )
+ ```
+
+ ## Citing & Authors
+
+ <!--- Describe where people can find more information -->
config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "_name_or_path": "pritamdeka/S-PubMedBert-MS-MARCO",
+   "architectures": [
+     "BertModel"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.28.0",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 30522
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "__version__": {
+     "sentence_transformers": "2.2.2",
+     "transformers": "4.28.0",
+     "pytorch": "1.12.1"
+   }
+ }
embeddings/test_embeddings.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0040f61e1eb17648ca22e90c0542472fb8eb83598ecbb9a90efe4ad5f6b00e02
+ size 462671600
embeddings/train_embeddings.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6fbacff5b7eafbd05c2b36de0388aa8ad51dcec91a2e3f17a30480407627b5ff
+ size 107131502
eval/binary_classification_evaluation_results.csv ADDED
@@ -0,0 +1,15 @@
+ epoch,steps,cossim_accuracy,cossim_accuracy_threshold,cossim_f1,cossim_precision,cossim_recall,cossim_f1_threshold,cossim_ap,manhattan_accuracy,manhattan_accuracy_threshold,manhattan_f1,manhattan_precision,manhattan_recall,manhattan_f1_threshold,manhattan_ap,euclidean_accuracy,euclidean_accuracy_threshold,euclidean_f1,euclidean_precision,euclidean_recall,euclidean_f1_threshold,euclidean_ap,dot_accuracy,dot_accuracy_threshold,dot_f1,dot_precision,dot_recall,dot_f1_threshold,dot_ap
+ 0,1000,0.9959898418892938,0.8138036131858826,0.7075564278704614,0.6578467153284672,0.7653927813163482,0.7513059973716736,0.7341425309445677,0.9959970156962897,206.19004821777344,0.708124373119358,0.6711026615969582,0.7494692144373672,234.30462646484375,0.7343640062968387,0.995982668082298,9.600546836853027,0.7077856420626895,0.6756756756756757,0.7430997876857749,10.721283912658691,0.7343343474969306,0.995982668082298,195.89633178710938,0.708086785009862,0.6611418047882136,0.7622080679405521,185.87686157226562,0.7307107673692264
+ 0,2000,0.9964130965020517,0.7086958885192871,0.7343991748323879,0.7141424272818455,0.7558386411889597,0.6719770431518555,0.7704567943717955,0.9964417917300353,252.49545288085938,0.7380339680905815,0.7162837162837162,0.7611464968152867,274.32293701171875,0.7695558903687251,0.9964130965020517,11.899690628051758,0.7347781217750258,0.714859437751004,0.7558386411889597,12.669260025024414,0.7706420762152746,0.9964130965020517,170.9678192138672,0.7385250128932439,0.7181544633901705,0.7600849256900213,163.7506103515625,0.7683834051174947
+ 0,3000,0.9963413584320927,0.723868727684021,0.727364693980779,0.6946859903381642,0.7632696390658175,0.6570333242416382,0.7613772606209377,0.9963270108181009,255.80694580078125,0.7232985593641331,0.6797385620915033,0.772823779193206,286.2681884765625,0.7568305543156388,0.9963341846250968,11.572385787963867,0.7247106190236539,0.6889952153110048,0.7643312101910829,13.055124282836914,0.7602415339756503,0.9963485322390886,177.50393676757812,0.7294956698930208,0.7012732615083251,0.7600849256900213,161.96173095703125,0.7613816674179076
+ 0,4000,0.9964489655370312,0.7342211008071899,0.7357894736842107,0.7296450939457203,0.7420382165605095,0.6717803478240967,0.7695260651531055,0.9964274441160434,259.0006103515625,0.7328405491024287,0.7289915966386554,0.7367303609341825,284.18011474609375,0.7697369150423352,0.996463313151023,11.66153621673584,0.7363972530375065,0.732912723449001,0.7399150743099787,12.891582489013672,0.7693784186219063,0.9964202703090476,187.724609375,0.7358987875593042,0.7308900523560209,0.7409766454352441,171.3360595703125,0.766883691420442
+ 0,5000,0.9963987488880599,0.8664075136184692,0.7335747542679772,0.715438950554995,0.7526539278131635,0.8242816925048828,0.7735817047183088,0.996391575081064,185.98974609375,0.7345309381237526,0.6930320150659134,0.7813163481953291,223.56268310546875,0.773388317040438,0.9964059226950558,8.383909225463867,0.7339632023868721,0.6903648269410664,0.7834394904458599,10.310904502868652,0.7742759409863931,0.9963054893971133,224.58929443359375,0.7315403422982885,0.6781504986400725,0.7940552016985138,202.34378051757812,0.767881563554214
+ 0,6000,0.9964991821860024,0.7059401273727417,0.7367357251136938,0.7029893924783028,0.7738853503184714,0.6457751989364624,0.7733515138935646,0.9965063559929984,272.0797119140625,0.7391304347826088,0.7055984555984556,0.7760084925690022,297.36431884765625,0.7736052471730748,0.9965063559929984,12.334430694580078,0.7384926656550329,0.7053140096618358,0.7749469214437368,13.425590515136719,0.7740391212270996,0.9964489655370312,180.75814819335938,0.735868448098664,0.7131474103585658,0.7600849256900213,167.7847900390625,0.7664293324829508
+ 0,7000,0.996535051220982,0.7890695929527283,0.7413280475718533,0.6951672862453532,0.7940552016985138,0.7123516798019409,0.7841125045958415,0.9965278774139861,226.22021484375,0.7426871591472484,0.6967441860465117,0.7951167728237792,269.11865234375,0.7838650351275516,0.9965278774139861,10.408506393432617,0.7424015944195317,0.6995305164319249,0.7908704883227177,12.124641418457031,0.784355194377901,0.9964991821860024,202.1366424560547,0.7413526071244192,0.721608040201005,0.7622080679405521,192.69488525390625,0.7797219258535517
+ 0,8000,0.9964059226950558,0.8952984809875488,0.7305389221556886,0.6892655367231638,0.7770700636942676,0.8116875886917114,0.7704874758131416,0.9964202703090476,163.6337890625,0.7302697302697304,0.689622641509434,0.7760084925690022,215.971923828125,0.7709341668481307,0.9964130965020517,7.347511291503906,0.7310756972111554,0.6885553470919324,0.7791932059447984,9.950638771057129,0.7708456778392645,0.9963772274670722,231.33460998535156,0.7294233612617054,0.6807727690892365,0.7855626326963907,207.47158813476562,0.763934166072386
+ 0,9000,0.9966354845189245,0.7133514881134033,0.7461773700305812,0.7176470588235294,0.7770700636942676,0.635808527469635,0.7876558208456006,0.9966139630979368,265.9307861328125,0.7446592065106816,0.71484375,0.7770700636942676,304.4968566894531,0.7860638579563115,0.9966211369049327,12.278663635253906,0.7447570332480818,0.7186574531095755,0.772823779193206,13.809059143066406,0.7869908698982819,0.9966139630979368,187.80516052246094,0.746055979643766,0.7165200391006843,0.778131634819533,166.61741638183594,0.7862214926422617
+ 0,10000,0.9965709202559614,0.8336483836174011,0.7468160978094752,0.7179236043095005,0.778131634819533,0.7230831384658813,0.784005730743692,0.9965565726419696,189.58778381347656,0.7469387755102042,0.7190569744597249,0.7770700636942676,262.7763366699219,0.7827262424160313,0.9965709202559614,9.668187141418457,0.7465578786333504,0.718351324828263,0.7770700636942676,11.99598503112793,0.7836456150853454,0.9965852678699533,219.69346618652344,0.748347737671581,0.7180487804878048,0.7813163481953291,188.4782257080078,0.7823056568481427
+ 0,11000,0.996599615483945,0.8404154777526855,0.7460159362549802,0.702626641651032,0.7951167728237792,0.7394824028015137,0.7891535629187694,0.9965924416769492,201.5096893310547,0.7464162135442413,0.6984273820536541,0.8014861995753716,258.68951416015625,0.7892201646297157,0.9966067892909409,9.137676239013672,0.7464019851116626,0.700838769804287,0.7983014861995754,11.708337783813477,0.7896685781188241,0.996599615483945,219.41256713867188,0.7466266866566718,0.7053824362606232,0.7929936305732485,194.33685302734375,0.7847708090846266
+ 0,12000,0.9966641797469081,0.7714364528656006,0.7470817120622569,0.6894075403949731,0.8152866242038217,0.6613130569458008,0.7971383527901929,0.996671353553904,238.64959716796875,0.7465753424657534,0.6923774954627949,0.8099787685774947,292.0980224609375,0.7963038566318145,0.996671353553904,10.962160110473633,0.7463414634146343,0.6904332129963899,0.8121019108280255,13.26413345336914,0.7973728787437269,0.9966570059399122,201.703369140625,0.747903305377405,0.6986175115207374,0.8046709129511678,175.73724365234375,0.7950255951425332
+ 0,13000,0.9966641797469081,0.7876771688461304,0.7506100536847242,0.6946702800361337,0.8163481953290871,0.6692169904708862,0.7981447335724123,0.9966641797469081,221.31605529785156,0.7484307098020281,0.6864481842338352,0.8227176220806794,295.13555908203125,0.7973206822930992,0.9966570059399122,10.126941680908203,0.7497584541062802,0.6879432624113475,0.8237791932059448,13.3380126953125,0.798292184445257,0.9966785273608999,205.54551696777344,0.7515861395802832,0.6955736224028907,0.8174097664543525,175.16177368164062,0.7963508054260641
+ 0,-1,0.9966641797469081,0.788578450679779,0.7506100536847242,0.6946702800361337,0.8163481953290871,0.6699733734130859,0.7981344463284198,0.9966641797469081,222.07940673828125,0.7485493230174081,0.6873889875666075,0.821656050955414,294.3916015625,0.7973874001838552,0.9966570059399122,10.110201835632324,0.7497584541062802,0.6879432624113475,0.8237791932059448,13.3286714553833,0.7982984487379056,0.9966785273608999,205.70175170898438,0.7515861395802832,0.6955736224028907,0.8174097664543525,175.29234313964844,0.7963482067955843
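
This CSV is the periodic output of sentence-transformers' `BinaryClassificationEvaluator` (accuracy, F1, precision, recall, and average precision under cosine, Manhattan, Euclidean, and dot-product scoring, logged every 1000 steps; the final `steps=-1` row is the end-of-epoch evaluation). A minimal sketch of how such a file is produced, with hypothetical sentence pairs standing in for the held-out data:

```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import BinaryClassificationEvaluator

model = SentenceTransformer('pritamdeka/S-PubMedBert-MS-MARCO')

# Hypothetical held-out pairs: 1 = matching, 0 = non-matching
sentences1 = ['Gene knockout from cell', 'Gene knockout from cell']
sentences2 = ['CRISPR-mediated gene deletion', 'An unrelated sentence']
labels = [1, 0]

evaluator = BinaryClassificationEvaluator(sentences1, sentences2, labels, write_csv=True)
evaluator(model, output_path='.')  # appends to binary_classification_evaluation_results.csv
```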
eval/scores/df_generic_eval_top_n.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:63f29cf8e6d6eb9ed1f0d1d964992170c872b8ff8d2abe0d856192129096afd7
+ size 462674027
eval/scores/scores_generic_eval_top_n.json ADDED
@@ -0,0 +1 @@
+ {"accuracy": 0.9974451807781612, "recall": 0.7957110609480813, "precision": 0.7957110609480813, "F1_score": 0.7957110609480813, "TP": 705, "FP": 181, "TN": 140626, "FN": 181}{"model": "/mimer/NOBACKUP/groups/addcell/biomedical-data-extraction/sentence_transformers/data/models/general_query_all/", "save_dir": "data/models", "test_dataset": "/mimer/NOBACKUP/groups/addcell/biomedical-data-extraction/sentence_transformers/data/test_20.pkl", "train_dataset": "/mimer/NOBACKUP/groups/addcell/biomedical-data-extraction/sentence_transformers/data/train_80.pkl", "query": "Gene knockout from cell", "tsne_samples": "-1", "eval_opt": "top_n", "verbose": "2"}
modules.json ADDED
@@ -0,0 +1,14 @@
+ [
+   {
+     "idx": 0,
+     "name": "0",
+     "path": "",
+     "type": "sentence_transformers.models.Transformer"
+   },
+   {
+     "idx": 1,
+     "name": "1",
+     "path": "1_Pooling",
+     "type": "sentence_transformers.models.Pooling"
+   }
+ ]
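
For reference (an added sketch, not part of the commit): modules.json wires the repository into a two-stage pipeline, a Transformer encoder at the root path followed by the Pooling module in `1_Pooling/`. Rebuilding the same pipeline by hand would look roughly like this:

```python
from sentence_transformers import SentenceTransformer, models

# Hypothetical manual assembly of the two modules listed above;
# the base checkpoint name comes from config.json in this repo.
word_embedding_model = models.Transformer('pritamdeka/S-PubMedBert-MS-MARCO', max_seq_length=256)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),  # 768
    pooling_mode_mean_tokens=True,
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
```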
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4d365451abc2ab4f3928d58f5054dbf7076ac7b069b1d2e3cd3c5daa40300d29
+ size 437998385
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "max_seq_length": 256,
+   "do_lower_case": false
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 1000000000000000019884624838656,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
training_arguments.json ADDED
@@ -0,0 +1 @@
+ {"model": "pritamdeka/S-PubMedBert-MS-MARCO", "dataset": "/mimer/NOBACKUP/groups/addcell/biomedical-data-extraction/sentence_transformers/data/prepared_data/Annotated_dataset/general_query/", "verbose": "2", "evaluator": "binary", "evaluation_steps": "1000", "batch_size": "32", "epochs": "1", "warmup": "100", "lr": "2e-05", "loss_function": "contrastive", "output_dir": "/mimer/NOBACKUP/groups/addcell/biomedical-data-extraction/sentence_transformers/data/models/general_query_all/", "token": "-1", "device_name": "NVIDIA A40", "training_time_seconds": 4652.426364183426}
vocab.txt ADDED
The diff for this file is too large to render. See raw diff