maneprajakta committed
Commit ba71f41 · 1 Parent(s): bacc348

Upload 11 files

1_Pooling/config.json ADDED
@@ -0,0 +1,7 @@
+ {
+     "word_embedding_dimension": 768,
+     "pooling_mode_cls_token": false,
+     "pooling_mode_mean_tokens": true,
+     "pooling_mode_max_tokens": false,
+     "pooling_mode_mean_sqrt_len_tokens": false
+ }
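The flags above map one-to-one onto the constructor arguments of `sentence_transformers.models.Pooling`: with only mean pooling enabled, token embeddings are averaged (attention-masked) into a single 768-dimensional sentence vector. A minimal sketch of the equivalent module, assuming the stock sentence-transformers API:

```python
from sentence_transformers import models

# Sketch only: reconstructs the Pooling module that 1_Pooling/config.json
# above describes. Mean pooling is the sole active mode.
pooling = models.Pooling(
    word_embedding_dimension=768,
    pooling_mode_cls_token=False,
    pooling_mode_mean_tokens=True,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_sqrt_len_tokens=False,
)
```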
README.md ADDED
@@ -0,0 +1,94 @@
+ ---
+ pipeline_tag: sentence-similarity
+ tags:
+ - sentence-transformers
+ - feature-extraction
+ - sentence-similarity
+ - transformers
+ ---
+
+ # {MODEL_NAME}
+
+ This is a [sentence-transformers](https://www.SBERT.net) model: it maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for tasks like clustering or semantic search.
+
+ <!--- Describe your model here -->
+
+ ## Usage (Sentence-Transformers)
+
+ Using this model is easy once you have [sentence-transformers](https://www.SBERT.net) installed:
+
+ ```
+ pip install -U sentence-transformers
+ ```
+
+ Then you can use the model like this:
+
+ ```python
+ from sentence_transformers import SentenceTransformer
+
+ sentences = ["This is an example sentence", "Each sentence is converted"]
+
+ model = SentenceTransformer('{MODEL_NAME}')
+ embeddings = model.encode(sentences)
+ print(embeddings)
+ ```
+
+ ## Usage (HuggingFace Transformers)
+
+ Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: first, pass your input through the transformer model, then apply the right pooling operation on top of the contextualized word embeddings.
+
+ ```python
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+
+
+ # Mean pooling: take the attention mask into account for correct averaging
+ def mean_pooling(model_output, attention_mask):
+     token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
+     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+
+ # Sentences we want sentence embeddings for
+ sentences = ['This is an example sentence', 'Each sentence is converted']
+
+ # Load model from HuggingFace Hub
+ tokenizer = AutoTokenizer.from_pretrained('{MODEL_NAME}')
+ model = AutoModel.from_pretrained('{MODEL_NAME}')
+
+ # Tokenize sentences
+ encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+
+ # Compute token embeddings
+ with torch.no_grad():
+     model_output = model(**encoded_input)
+
+ # Perform pooling. In this case, mean pooling.
+ sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+
+ print("Sentence embeddings:")
+ print(sentence_embeddings)
+ ```
+
+ ## Evaluation Results
+
+ <!--- Describe how your model was evaluated -->
+
+ For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME})
+
+ ## Full Model Architecture
+
+ ```
+ SentenceTransformer(
+   (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel
+   (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
+ )
+ ```
+
+ ## Citing & Authors
+
+ <!--- Describe where people can find more information -->
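Since config.json below shows the base checkpoint is sentence-transformers/multi-qa-mpnet-base-dot-v1, which was tuned for dot-product similarity, scoring pairs with `util.dot_score` seems a reasonable default. A hedged usage sketch; the query and document strings are illustrative only, and dot-product scoring is an assumption carried over from the base model, not something this commit states:

```python
from sentence_transformers import SentenceTransformer, util

# Illustrative sketch: embed a query and candidate passages, then rank
# them by dot product (the similarity the base checkpoint was tuned for).
model = SentenceTransformer('{MODEL_NAME}')
query_emb = model.encode("How do I restrict access to an s3 bucket?")
doc_embs = model.encode([
    "Attach an iam policy or a bucket acl.",
    "Enable cloudtrail logging for the account.",
])
print(util.dot_score(query_emb, doc_embs))
```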
added_tokens.json ADDED
@@ -0,0 +1,107 @@
+ {
+     "X509": 30611,
+     "aad": 30627,
+     "acl": 30559,
+     "acm": 30597,
+     "alpn": 30601,
+     "arn": 30591,
+     "asr": 30532,
+     "auc": 30562,
+     "auth": 30593,
+     "aws": 30547,
+     "bgp": 30622,
+     "cdk": 30545,
+     "cdn": 30542,
+     "chime": 30535,
+     "cidr": 30617,
+     "cidr ": 30603,
+     "cli": 30595,
+     "cloud9": 30539,
+     "cloudfront": 30620,
+     "cloudhsm": 30625,
+     "cloudquery": 30587,
+     "cloudtrail": 30546,
+     "cloudwatch": 30629,
+     "cmk": 30602,
+     "cognito": 30612,
+     "config": 30596,
+     "csv": 30581,
+     "devops": 30580,
+     "dhcp": 30623,
+     "dms": 30607,
+     "dns": 30530,
+     "docker": 30579,
+     "ebs": 30569,
+     "ec2": 30527,
+     "ecr": 30557,
+     "ecs": 30550,
+     "efs": 30564,
+     "eks": 30592,
+     "elb": 30574,
+     "emr": 30537,
+     "etl": 30567,
+     "git": 30570,
+     "github": 30621,
+     "gpu": 30568,
+     "gw": 30614,
+     "hdfs": 30552,
+     "hmac": 30558,
+     "hpc": 30531,
+     "hrnn": 30630,
+     "hsm": 30615,
+     "hvm": 30624,
+     "iam": 30541,
+     "iops": 30582,
+     "ipsec": 30565,
+     "ipv4": 30590,
+     "ipv6": 30589,
+     "isp": 30560,
+     "isps": 30563,
+     "itsm": 30585,
+     "kinesis": 30609,
+     "kms": 30584,
+     "kubernetes": 30594,
+     "mime": 30631,
+     "mysql": 30572,
+     "namespace": 30613,
+     "nif": 30604,
+     "nosql": 30577,
+     "oai": 30534,
+     "rds": 30544,
+     "redis": 30533,
+     "redshift": 30529,
+     "route53": 30619,
+     "s3": 30538,
+     "saas": 30551,
+     "saml": 30555,
+     "scp": 30543,
+     "sct": 30540,
+     "sdk": 30536,
+     "snapshot": 30606,
+     "sns": 30586,
+     "spf": 30578,
+     "sqs": 30571,
+     "sse": 30626,
+     "ssh": 30528,
+     "ssl": 30583,
+     "ssm": 30588,
+     "subnet": 30566,
+     "swf": 30561,
+     "tcp": 30610,
+     "tgw": 30616,
+     "throttling": 30554,
+     "tls": 30573,
+     "upt": 30600,
+     "uptycs": 30618,
+     "url": 30605,
+     "vgw": 30576,
+     "vm": 30575,
+     "vms": 30549,
+     "vmware": 30548,
+     "vpc": 30556,
+     "vpn": 30553,
+     "waf": 30598,
+     "wam": 30628,
+     "webacl": 30608,
+     "wsdl": 30599
+ }
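The ids start at 30527, the original MPNet vocabulary size, so these 105 cloud/networking terms sit at the end of the vocabulary, and config.json's vocab_size of 30632 (shown below) matches. A hedged sketch of how a file like this is typically produced; the exact token list passed in is abbreviated, and this commit does not include the script that did it:

```python
from transformers import AutoTokenizer, AutoModel

# Sketch: append domain terms to the tokenizer's vocabulary and resize
# the model's embedding matrix to match (30527 base + 105 added = 30632).
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/multi-qa-mpnet-base-dot-v1')
model = AutoModel.from_pretrained('sentence-transformers/multi-qa-mpnet-base-dot-v1')

num_added = tokenizer.add_tokens(["aws", "ec2", "s3", "iam"])  # ...and the rest of the list above
model.resize_token_embeddings(len(tokenizer))
```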
config.json ADDED
@@ -0,0 +1,24 @@
+ {
+     "_name_or_path": "/home/pmane/.cache/torch/sentence_transformers/sentence-transformers_multi-qa-mpnet-base-dot-v1/",
+     "architectures": [
+         "MPNetModel"
+     ],
+     "attention_probs_dropout_prob": 0.1,
+     "bos_token_id": 0,
+     "eos_token_id": 2,
+     "hidden_act": "gelu",
+     "hidden_dropout_prob": 0.1,
+     "hidden_size": 768,
+     "initializer_range": 0.02,
+     "intermediate_size": 3072,
+     "layer_norm_eps": 1e-05,
+     "max_position_embeddings": 514,
+     "model_type": "mpnet",
+     "num_attention_heads": 12,
+     "num_hidden_layers": 12,
+     "pad_token_id": 1,
+     "relative_attention_num_buckets": 32,
+     "torch_dtype": "float32",
+     "transformers_version": "4.29.2",
+     "vocab_size": 30632
+ }
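A quick sanity-check sketch, assuming the files are loaded from the published repo id (here written as the {MODEL_NAME} placeholder used in the README):

```python
from transformers import AutoConfig

# Sketch: the uploaded config should report the MPNet architecture and
# the expanded vocabulary (30527 base tokens + 105 added tokens).
config = AutoConfig.from_pretrained('{MODEL_NAME}')
assert config.model_type == "mpnet"
assert config.vocab_size == 30632
```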
config_sentence_transformers.json ADDED
@@ -0,0 +1,7 @@
+ {
+     "__version__": {
+         "sentence_transformers": "2.2.2",
+         "transformers": "4.29.2",
+         "pytorch": "2.0.1+cpu"
+     }
+ }
modules.json ADDED
@@ -0,0 +1,14 @@
+ [
+     {
+         "idx": 0,
+         "name": "0",
+         "path": "",
+         "type": "sentence_transformers.models.Transformer"
+     },
+     {
+         "idx": 1,
+         "name": "1",
+         "path": "1_Pooling",
+         "type": "sentence_transformers.models.Pooling"
+     }
+ ]
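modules.json declares the two-stage pipeline (a Transformer module at the repo root, a Pooling module in 1_Pooling/) that SentenceTransformer reassembles on load. A sketch of building the same pipeline by hand, assuming the base checkpoint named in config.json:

```python
from sentence_transformers import SentenceTransformer, models

# Sketch of the module composition that modules.json describes:
# module 0 encodes tokens, module 1 mean-pools them into sentence vectors.
word_embedding_model = models.Transformer(
    'sentence-transformers/multi-qa-mpnet-base-dot-v1', max_seq_length=512
)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
```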
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+     "max_seq_length": 512,
+     "do_lower_case": false
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "bos_token": "<s>",
+     "cls_token": "<s>",
+     "eos_token": "</s>",
+     "mask_token": {
+         "content": "<mask>",
+         "lstrip": true,
+         "normalized": false,
+         "rstrip": false,
+         "single_word": false
+     },
+     "pad_token": "<pad>",
+     "sep_token": "</s>",
+     "unk_token": "[UNK]"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "bos_token": "<s>",
+     "clean_up_tokenization_spaces": true,
+     "cls_token": "<s>",
+     "do_lower_case": true,
+     "eos_token": "</s>",
+     "mask_token": "<mask>",
+     "model_max_length": 512,
+     "pad_token": "<pad>",
+     "sep_token": "</s>",
+     "strip_accents": null,
+     "tokenizer_class": "MPNetTokenizer",
+     "tokenize_chinese_chars": true,
+     "unk_token": "[UNK]"
+ }
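With the tokenizer files above in place, the domain terms from added_tokens.json should come out as single tokens rather than subword pieces. A small check, again sketched against the {MODEL_NAME} placeholder:

```python
from transformers import AutoTokenizer

# Sketch: terms like "vpc", "subnet", and "ec2" should each appear as
# one token in the output, since they are in added_tokens.json.
tokenizer = AutoTokenizer.from_pretrained('{MODEL_NAME}')
print(tokenizer.tokenize("configure the vpc and subnet for the ec2 instance"))
```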
vocab.txt ADDED
The diff for this file is too large to render. See raw diff