Shuu12121 committed on
Commit
61496a9
·
verified ·
1 Parent(s): cf85670

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +90 -543
README.md CHANGED
@@ -1,577 +1,124 @@
1
  ---
2
  tags:
3
- - sentence-transformers
4
- - sentence-similarity
5
- - feature-extraction
6
- - generated_from_trainer
7
- - dataset_size:2732400
8
- - loss:MultipleNegativesRankingLoss
9
- base_model: Shuu12121/CodeModernBERT-Owl-2.0
10
- widget:
11
- - source_sentence: 'put string value.
12
-
13
-
14
- @param body _.
15
-
16
- @throws IllegalArgumentException thrown if parameters fail the validation.
17
-
18
- @throws HttpResponseException thrown if the service returns an error.
19
-
20
- @throws RuntimeException all other wrapped checked exceptions if the request fails
21
- to be sent.'
22
- sentences:
23
- - "func ComposeSSHCloneURL(doer *user_model.User, ownerName, repoName string) string\
24
- \ {\n\tsshUser := setting.SSH.User\n\tsshDomain := setting.SSH.Domain\n\n\tif\
25
- \ sshUser == \"(DOER_USERNAME)\" {\n\t\t// Some users use SSH reverse-proxy and\
26
- \ need to use the current signed-in username as the SSH user\n\t\t// to make the\
27
- \ SSH reverse-proxy could prepare the user's public keys ahead.\n\t\t// For most\
28
- \ cases we have the correct \"doer\", then use it as the SSH user.\n\t\t// If\
29
- \ we can't get the doer, then use the built-in SSH user.\n\t\tif doer != nil {\n\
30
- \t\t\tsshUser = doer.Name\n\t\t} else {\n\t\t\tsshUser = setting.SSH.BuiltinServerUser\n\
31
- \t\t}\n\t}\n\n\t// non-standard port, it must use full URI\n\tif setting.SSH.Port\
32
- \ != 22 {\n\t\tsshHost := net.JoinHostPort(sshDomain, strconv.Itoa(setting.SSH.Port))\n\
33
- \t\treturn fmt.Sprintf(\"ssh://%s@%s/%s/%s.git\", sshUser, sshHost, url.PathEscape(ownerName),\
34
- \ url.PathEscape(repoName))\n\t}\n\n\t// for standard port, it can use a shorter\
35
- \ URI (without the port)\n\tsshHost := sshDomain\n\tif ip := net.ParseIP(sshHost);\
36
- \ ip != nil && ip.To4() == nil {\n\t\tsshHost = \"[\" + sshHost + \"]\" // for\
37
- \ IPv6 address, wrap it with brackets\n\t}\n\tif setting.Repository.UseCompatSSHURI\
38
- \ {\n\t\treturn fmt.Sprintf(\"ssh://%s@%s/%s/%s.git\", sshUser, sshHost, url.PathEscape(ownerName),\
39
- \ url.PathEscape(repoName))\n\t}\n\treturn fmt.Sprintf(\"%s@%s:%s/%s.git\", sshUser,\
40
- \ sshHost, url.PathEscape(ownerName), url.PathEscape(repoName))\n}"
41
- - "@java.lang.Override\n public boolean hasFieldExtractionMetadata() {\n return\
42
- \ ((bitField0_ & 0x00000001) != 0);\n }"
43
- - "@Metadata(properties = { MetadataProperties.GENERATED })\n @ServiceMethod(returns\
44
- \ = ReturnType.SINGLE)\n public void put(String body) {\n this.serviceClient.put(body);\n\
45
- \ }"
46
- - source_sentence: 'Optional. User specified ID for the notebook runtime.
47
-
48
-
49
- Generated from protobuf field <code>string notebook_runtime_id = 4 [(.google.api.field_behavior)
50
- = OPTIONAL];</code>
51
-
52
- @return string'
53
- sentences:
54
- - "public function getNotebookRuntimeId()\n {\n return $this->notebook_runtime_id;\n\
55
- \ }"
56
- - "func (client *BlobContainersClient) BeginObjectLevelWorm(ctx context.Context,\
57
- \ resourceGroupName string, accountName string, containerName string, options\
58
- \ *BlobContainersClientBeginObjectLevelWormOptions) (*runtime.Poller[BlobContainersClientObjectLevelWormResponse],\
59
- \ error) {\n\tif options == nil || options.ResumeToken == \"\" {\n\t\tresp, err\
60
- \ := client.objectLevelWorm(ctx, resourceGroupName, accountName, containerName,\
61
- \ options)\n\t\tif err != nil {\n\t\t\treturn nil, err\n\t\t}\n\t\tpoller, err\
62
- \ := runtime.NewPoller(resp, client.internal.Pipeline(), &runtime.NewPollerOptions[BlobContainersClientObjectLevelWormResponse]{\n\
63
- \t\t\tFinalStateVia: runtime.FinalStateViaLocation,\n\t\t\tTracer: client.internal.Tracer(),\n\
64
- \t\t})\n\t\treturn poller, err\n\t} else {\n\t\treturn runtime.NewPollerFromResumeToken(options.ResumeToken,\
65
- \ client.internal.Pipeline(), &runtime.NewPollerFromResumeTokenOptions[BlobContainersClientObjectLevelWormResponse]{\n\
66
- \t\t\tTracer: client.internal.Tracer(),\n\t\t})\n\t}\n}"
67
- - "def version(self) -> Union[int, str]:\n \n if self._version is\
68
- \ None:\n self._version = self._get_next_version()\n return\
69
- \ self._version"
70
- - source_sentence: '<pre>
71
-
72
- Output only. An email message received in reply to the case.
73
-
74
- </pre>
75
-
76
-
77
- <code>
78
-
79
- .google.cloud.support.v2beta.EmailMessage email_message = 102 [(.google.api.field_behavior)
80
- = OUTPUT_ONLY];
81
-
82
- </code>
83
-
84
-
85
- @return The emailMessage.'
86
- sentences:
87
- - "@java.lang.Override\n public com.google.cloud.support.v2beta.EmailMessage getEmailMessage()\
88
- \ {\n if (eventObjectCase_ == 102) {\n return (com.google.cloud.support.v2beta.EmailMessage)\
89
- \ eventObject_;\n }\n return com.google.cloud.support.v2beta.EmailMessage.getDefaultInstance();\n\
90
- \ }"
91
- - "def df_isin(df, values):\n \n if is_list_like(values) and not isinstance(values,\
92
- \ dict):\n values = list(values)\n elif not isinstance(\n values,\
93
- \ (SERIES_TYPE, DATAFRAME_TYPE, TENSOR_TYPE, INDEX_TYPE, dict)\n ):\n \
94
- \ raise TypeError(\n \"only list-like objects or dict are allowed\
95
- \ to be passed to isin(), \"\n f\"you passed a [{type(values)}]\"\n\
96
- \ )\n op = DataFrameIsin(values=values)\n return op(df)"
97
- - "public function getModelDeploymentMonitoringJobs()\n {\n return $this->model_deployment_monitoring_jobs;\n\
98
- \ }"
99
- - source_sentence: Compute the maximum violation of KKT conditions.
100
- sentences:
101
- - "def with_url(self,raw_url: str) -> SearchesRequestBuilder:\n \n \
102
- \ if raw_url is None:\n raise TypeError(\"raw_url cannot be null.\"\
103
- )\n return SearchesRequestBuilder(self.request_adapter, raw_url)"
104
- - "def get_subscription\n # Create a client object. The client can be reused for\
105
- \ multiple calls.\n client = Google::Apps::Events::Subscriptions::V1::SubscriptionsService::Client.new\n\
106
- \n # Create a request. To set request fields, pass in keyword arguments.\n request\
107
- \ = Google::Apps::Events::Subscriptions::V1::GetSubscriptionRequest.new\n\n #\
108
- \ Call the get_subscription method.\n result = client.get_subscription request\n\
109
- \n # The returned object is of type Google::Apps::Events::Subscriptions::V1::Subscription.\n\
110
- \ p result\nend"
111
- - "def compute_kkt_optimality(g, on_bound):\n \n g_kkt = g * on_bound\n \
112
- \ free_set = on_bound == 0\n g_kkt[free_set] = np.abs(g[free_set])\n return\
113
- \ np.max(g_kkt)"
114
- - source_sentence: 'Creates a unary expression NEGATIVE
115
-
116
-
117
- # Errors
118
-
119
-
120
- This function errors when the argument''s type is not signed numeric'
121
- sentences:
122
- - "public function searchItemAction()\n {\n return $this->searchBase(\"\
123
- ifgroupentry\", ['ifname', 'descr', 'members', 'sequence'], \"ifname\");\n \
124
- \ }"
125
- - "pub fn poller(self) -> impl lro::Poller<(), crate::model::DeleteSitemapMetadata>\
126
- \ {\n type Operation =\n lro::internal::Operation<wkt::Empty,\
127
- \ crate::model::DeleteSitemapMetadata>;\n let polling_error_policy\
128
- \ = self.0.stub.get_polling_error_policy(&self.0.options);\n let polling_backoff_policy\
129
- \ = self.0.stub.get_polling_backoff_policy(&self.0.options);\n\n let\
130
- \ stub = self.0.stub.clone();\n let mut options = self.0.options.clone();\n\
131
- \ options.set_retry_policy(gax::retry_policy::NeverRetry);\n \
132
- \ let query = move |name| {\n let stub = stub.clone();\n \
133
- \ let options = options.clone();\n async {\n \
134
- \ let op = GetOperation::new(stub)\n .set_name(name)\n\
135
- \ .with_options(options)\n .send()\n\
136
- \ .await?;\n Ok(Operation::new(op))\n\
137
- \ }\n };\n\n let start = move || async {\n\
138
- \ let op = self.send().await?;\n Ok(Operation::new(op))\n\
139
- \ };\n\n lro::internal::new_unit_response_poller(\n \
140
- \ polling_error_policy,\n polling_backoff_policy,\n\
141
- \ start,\n query,\n )\n }"
142
- - "pub fn negative(\n arg: Arc<dyn PhysicalExpr>,\n input_schema: &Schema,\n\
143
- ) -> Result<Arc<dyn PhysicalExpr>> {\n let data_type = arg.data_type(input_schema)?;\n\
144
- \ if !coercion::is_signed_numeric(&data_type) {\n Err(DataFusionError::Internal(\n\
145
- \ format!(\n \"(- '{:?}') can't be evaluated because\
146
- \ the expression's type is {:?}, not signed numeric\",\n arg, data_type,\n\
147
- \ ),\n ))\n } else {\n Ok(Arc::new(NegativeExpr::new(arg)))\n\
148
- \ }\n}"
149
  pipeline_tag: sentence-similarity
150
  library_name: sentence-transformers
151
- ---
152
-
153
- # SentenceTransformer based on Shuu12121/CodeModernBERT-Owl-2.0
 
 
 
 
 
 
 
 
 
154
 
155
- This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [Shuu12121/CodeModernBERT-Owl-2.0](https://huggingface.co/Shuu12121/CodeModernBERT-Owl-2.0). It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
156
-
157
- ## Model Details
158
-
159
- ### Model Description
160
- - **Model Type:** Sentence Transformer
161
- - **Base model:** [Shuu12121/CodeModernBERT-Owl-2.0](https://huggingface.co/Shuu12121/CodeModernBERT-Owl-2.0) <!-- at revision a6f43b644188b4e7fe211f38003c7742218607c0 -->
162
- - **Maximum Sequence Length:** 1024 tokens
163
- - **Output Dimensionality:** 768 dimensions
164
- - **Similarity Function:** Cosine Similarity
165
- <!-- - **Training Dataset:** Unknown -->
166
- <!-- - **Language:** Unknown -->
167
- <!-- - **License:** Unknown -->
168
-
169
- ### Model Sources
170
-
171
- - **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
172
- - **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
173
- - **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)
174
-
175
- ### Full Model Architecture
176
-
177
- ```
178
- SentenceTransformer(
179
- (0): Transformer({'max_seq_length': 1024, 'do_lower_case': False}) with Transformer model: ModernBertModel
180
- (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
181
- )
182
- ```
183
-
184
- ## Usage
185
-
186
- ### Direct Usage (Sentence Transformers)
187
-
188
- First install the Sentence Transformers library:
189
-
190
- ```bash
191
- pip install -U sentence-transformers
192
- ```
193
-
194
- Then you can load this model and run inference.
195
- ```python
196
- from sentence_transformers import SentenceTransformer
197
-
198
- # Download from the 🤗 Hub
199
- model = SentenceTransformer("sentence_transformers_model_id")
200
- # Run inference
201
- sentences = [
202
- "Creates a unary expression NEGATIVE\n\n# Errors\n\nThis function errors when the argument's type is not signed numeric",
203
- 'pub fn negative(\n arg: Arc<dyn PhysicalExpr>,\n input_schema: &Schema,\n) -> Result<Arc<dyn PhysicalExpr>> {\n let data_type = arg.data_type(input_schema)?;\n if !coercion::is_signed_numeric(&data_type) {\n Err(DataFusionError::Internal(\n format!(\n "(- \'{:?}\') can\'t be evaluated because the expression\'s type is {:?}, not signed numeric",\n arg, data_type,\n ),\n ))\n } else {\n Ok(Arc::new(NegativeExpr::new(arg)))\n }\n}',
204
- 'public function searchItemAction()\n {\n return $this->searchBase("ifgroupentry", [\'ifname\', \'descr\', \'members\', \'sequence\'], "ifname");\n }',
205
- ]
206
- embeddings = model.encode(sentences)
207
- print(embeddings.shape)
208
- # [3, 768]
209
-
210
- # Get the similarity scores for the embeddings
211
- similarities = model.similarity(embeddings, embeddings)
212
- print(similarities.shape)
213
- # [3, 3]
214
- ```
215
-
216
- <!--
217
- ### Direct Usage (Transformers)
218
-
219
- <details><summary>Click to see the direct usage in Transformers</summary>
220
-
221
- </details>
222
- -->
223
-
224
- <!--
225
- ### Downstream Usage (Sentence Transformers)
226
-
227
- You can finetune this model on your own dataset.
228
-
229
- <details><summary>Click to expand</summary>
230
-
231
- </details>
232
- -->
233
 
234
- <!--
235
- ### Out-of-Scope Use
236
 
237
- *List how the model may foreseeably be misused and address what users ought not to do with the model.*
238
- -->
239
 
240
- <!--
241
- ## Bias, Risks and Limitations
242
 
243
- *What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
244
- -->
245
 
246
- <!--
247
- ### Recommendations
248
 
249
- *What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
250
- -->
251
 
252
- ## Training Details
 
253
 
254
- ### Training Dataset
 
 
 
 
 
255
 
256
- #### Unnamed Dataset
257
 
258
- * Size: 2,732,400 training samples
259
- * Columns: <code>sentence_0</code>, <code>sentence_1</code>, and <code>label</code>
260
- * Approximate statistics based on the first 1000 samples:
261
- | | sentence_0 | sentence_1 | label |
262
- |:--------|:--------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------|:--------------------------------------------------------------|
263
- | type | string | string | float |
264
- | details | <ul><li>min: 13 tokens</li><li>mean: 109.22 tokens</li><li>max: 1024 tokens</li></ul> | <ul><li>min: 6 tokens</li><li>mean: 250.01 tokens</li><li>max: 1024 tokens</li></ul> | <ul><li>min: 1.0</li><li>mean: 1.0</li><li>max: 1.0</li></ul> |
265
- * Samples:
266
- | sentence_0 | sentence_1 | label |
267
- |:----------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------|
268
- | <code>Prints the specified `pkg`.<br><br>If `is_main` is not set, nested package notation is used.</code> | <code>pub fn print_package(<br> &mut self,<br> resolve: &Resolve,<br> pkg: PackageId,<br> is_main: bool,<br> ) -> Result<()> {<br> let pkg = &resolve.packages[pkg];<br> self.print_package_outer(pkg)?;<br><br> if is_main {<br> self.output.semicolon();<br> self.output.newline();<br> } else {<br> self.output.indent_start();<br> }<br><br> for (name, id) in pkg.interfaces.iter() {<br> self.print_interface_outer(resolve, *id, name)?;<br> self.output.indent_start();<br> self.print_interface(resolve, *id)?;<br> self.output.indent_end();<br> if is_main {<br> self.output.newline();<br> }<br> }<br><br> for (name, id) in pkg.worlds.iter() {<br> self.print_docs(&resolve.worlds[*id].docs);<br> self.print_stability(&resolve.worlds[*id].stability);<br> self.output.keyword("world");<br> self.output.str(" ");<br> self.print_name_type(name, TypeKind:...</code> | <code>1.0</code> |
269
- | <code><p>An alternative descriptive name for the user.</p></code> | <code>pub fn nick_name(mut self, input: impl ::std::convert::Into<::std::string::String>) -> Self {<br> self.nick_name = ::std::option::Option::Some(input.into());<br> self<br> }</code> | <code>1.0</code> |
270
- | <code><p>Indicates whether the match is case sensitive.</p></code> | <code>pub fn case_sensitive(mut self, input: bool) -> Self {<br> self.case_sensitive = ::std::option::Option::Some(input);<br> self<br> }</code> | <code>1.0</code> |
271
- * Loss: [<code>MultipleNegativesRankingLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#multiplenegativesrankingloss) with these parameters:
272
- ```json
273
- {
274
- "scale": 20.0,
275
- "similarity_fct": "cos_sim"
276
- }
277
- ```
278
 
279
- ### Training Hyperparameters
280
- #### Non-Default Hyperparameters
281
 
282
- - `per_device_train_batch_size`: 150
283
- - `per_device_eval_batch_size`: 150
284
- - `fp16`: True
285
- - `multi_dataset_batch_sampler`: round_robin
286
 
287
- #### All Hyperparameters
288
- <details><summary>Click to expand</summary>
289
 
290
- - `overwrite_output_dir`: False
291
- - `do_predict`: False
292
- - `eval_strategy`: no
293
- - `prediction_loss_only`: True
294
- - `per_device_train_batch_size`: 150
295
- - `per_device_eval_batch_size`: 150
296
- - `per_gpu_train_batch_size`: None
297
- - `per_gpu_eval_batch_size`: None
298
- - `gradient_accumulation_steps`: 1
299
- - `eval_accumulation_steps`: None
300
- - `torch_empty_cache_steps`: None
301
- - `learning_rate`: 5e-05
302
- - `weight_decay`: 0.0
303
- - `adam_beta1`: 0.9
304
- - `adam_beta2`: 0.999
305
- - `adam_epsilon`: 1e-08
306
- - `max_grad_norm`: 1
307
- - `num_train_epochs`: 3
308
- - `max_steps`: -1
309
- - `lr_scheduler_type`: linear
310
- - `lr_scheduler_kwargs`: {}
311
- - `warmup_ratio`: 0.0
312
- - `warmup_steps`: 0
313
- - `log_level`: passive
314
- - `log_level_replica`: warning
315
- - `log_on_each_node`: True
316
- - `logging_nan_inf_filter`: True
317
- - `save_safetensors`: True
318
- - `save_on_each_node`: False
319
- - `save_only_model`: False
320
- - `restore_callback_states_from_checkpoint`: False
321
- - `no_cuda`: False
322
- - `use_cpu`: False
323
- - `use_mps_device`: False
324
- - `seed`: 42
325
- - `data_seed`: None
326
- - `jit_mode_eval`: False
327
- - `use_ipex`: False
328
- - `bf16`: False
329
- - `fp16`: True
330
- - `fp16_opt_level`: O1
331
- - `half_precision_backend`: auto
332
- - `bf16_full_eval`: False
333
- - `fp16_full_eval`: False
334
- - `tf32`: None
335
- - `local_rank`: 0
336
- - `ddp_backend`: None
337
- - `tpu_num_cores`: None
338
- - `tpu_metrics_debug`: False
339
- - `debug`: []
340
- - `dataloader_drop_last`: False
341
- - `dataloader_num_workers`: 0
342
- - `dataloader_prefetch_factor`: None
343
- - `past_index`: -1
344
- - `disable_tqdm`: False
345
- - `remove_unused_columns`: True
346
- - `label_names`: None
347
- - `load_best_model_at_end`: False
348
- - `ignore_data_skip`: False
349
- - `fsdp`: []
350
- - `fsdp_min_num_params`: 0
351
- - `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
352
- - `fsdp_transformer_layer_cls_to_wrap`: None
353
- - `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
354
- - `deepspeed`: None
355
- - `label_smoothing_factor`: 0.0
356
- - `optim`: adamw_torch
357
- - `optim_args`: None
358
- - `adafactor`: False
359
- - `group_by_length`: False
360
- - `length_column_name`: length
361
- - `ddp_find_unused_parameters`: None
362
- - `ddp_bucket_cap_mb`: None
363
- - `ddp_broadcast_buffers`: False
364
- - `dataloader_pin_memory`: True
365
- - `dataloader_persistent_workers`: False
366
- - `skip_memory_metrics`: True
367
- - `use_legacy_prediction_loop`: False
368
- - `push_to_hub`: False
369
- - `resume_from_checkpoint`: None
370
- - `hub_model_id`: None
371
- - `hub_strategy`: every_save
372
- - `hub_private_repo`: None
373
- - `hub_always_push`: False
374
- - `gradient_checkpointing`: False
375
- - `gradient_checkpointing_kwargs`: None
376
- - `include_inputs_for_metrics`: False
377
- - `include_for_metrics`: []
378
- - `eval_do_concat_batches`: True
379
- - `fp16_backend`: auto
380
- - `push_to_hub_model_id`: None
381
- - `push_to_hub_organization`: None
382
- - `mp_parameters`:
383
- - `auto_find_batch_size`: False
384
- - `full_determinism`: False
385
- - `torchdynamo`: None
386
- - `ray_scope`: last
387
- - `ddp_timeout`: 1800
388
- - `torch_compile`: False
389
- - `torch_compile_backend`: None
390
- - `torch_compile_mode`: None
391
- - `include_tokens_per_second`: False
392
- - `include_num_input_tokens_seen`: False
393
- - `neftune_noise_alpha`: None
394
- - `optim_target_modules`: None
395
- - `batch_eval_metrics`: False
396
- - `eval_on_start`: False
397
- - `use_liger_kernel`: False
398
- - `eval_use_gather_object`: False
399
- - `average_tokens_across_devices`: False
400
- - `prompts`: None
401
- - `batch_sampler`: batch_sampler
402
- - `multi_dataset_batch_sampler`: round_robin
403
 
404
- </details>
405
 
406
- ### Training Logs
407
- <details><summary>Click to expand</summary>
408
 
409
- | Epoch | Step | Training Loss |
410
- |:------:|:-----:|:-------------:|
411
- | 0.0274 | 500 | 0.8232 |
412
- | 0.0549 | 1000 | 0.1248 |
413
- | 0.0823 | 1500 | 0.1102 |
414
- | 0.1098 | 2000 | 0.1008 |
415
- | 0.1372 | 2500 | 0.0962 |
416
- | 0.1647 | 3000 | 0.0928 |
417
- | 0.1921 | 3500 | 0.0878 |
418
- | 0.2196 | 4000 | 0.0827 |
419
- | 0.2470 | 4500 | 0.078 |
420
- | 0.2745 | 5000 | 0.0763 |
421
- | 0.3019 | 5500 | 0.075 |
422
- | 0.3294 | 6000 | 0.0716 |
423
- | 0.3568 | 6500 | 0.0691 |
424
- | 0.3843 | 7000 | 0.0673 |
425
- | 0.4117 | 7500 | 0.065 |
426
- | 0.4392 | 8000 | 0.0668 |
427
- | 0.4666 | 8500 | 0.0609 |
428
- | 0.4941 | 9000 | 0.0613 |
429
- | 0.5215 | 9500 | 0.0596 |
430
- | 0.5490 | 10000 | 0.0596 |
431
- | 0.5764 | 10500 | 0.058 |
432
- | 0.6039 | 11000 | 0.0527 |
433
- | 0.6313 | 11500 | 0.0521 |
434
- | 0.6588 | 12000 | 0.0521 |
435
- | 0.6862 | 12500 | 0.049 |
436
- | 0.7137 | 13000 | 0.0481 |
437
- | 0.7411 | 13500 | 0.0484 |
438
- | 0.7686 | 14000 | 0.049 |
439
- | 0.7960 | 14500 | 0.0482 |
440
- | 0.8235 | 15000 | 0.045 |
441
- | 0.8509 | 15500 | 0.0423 |
442
- | 0.8783 | 16000 | 0.0425 |
443
- | 0.9058 | 16500 | 0.04 |
444
- | 0.9332 | 17000 | 0.0406 |
445
- | 0.9607 | 17500 | 0.0374 |
446
- | 0.9881 | 18000 | 0.038 |
447
- | 1.0156 | 18500 | 0.0257 |
448
- | 1.0430 | 19000 | 0.0154 |
449
- | 1.0705 | 19500 | 0.015 |
450
- | 1.0979 | 20000 | 0.0157 |
451
- | 1.1254 | 20500 | 0.0144 |
452
- | 1.1528 | 21000 | 0.0148 |
453
- | 1.1803 | 21500 | 0.0152 |
454
- | 1.2077 | 22000 | 0.0154 |
455
- | 1.2352 | 22500 | 0.0161 |
456
- | 1.2626 | 23000 | 0.0155 |
457
- | 1.2901 | 23500 | 0.0148 |
458
- | 1.3175 | 24000 | 0.0152 |
459
- | 1.3450 | 24500 | 0.015 |
460
- | 1.3724 | 25000 | 0.0148 |
461
- | 1.3999 | 25500 | 0.0151 |
462
- | 1.4273 | 26000 | 0.0144 |
463
- | 1.4548 | 26500 | 0.0147 |
464
- | 1.4822 | 27000 | 0.0143 |
465
- | 1.5097 | 27500 | 0.0148 |
466
- | 1.5371 | 28000 | 0.0147 |
467
- | 1.5646 | 28500 | 0.0145 |
468
- | 1.5920 | 29000 | 0.0137 |
469
- | 1.6195 | 29500 | 0.0134 |
470
- | 1.6469 | 30000 | 0.0137 |
471
- | 1.6744 | 30500 | 0.0133 |
472
- | 1.7018 | 31000 | 0.0137 |
473
- | 1.7292 | 31500 | 0.0132 |
474
- | 1.7567 | 32000 | 0.0132 |
475
- | 1.7841 | 32500 | 0.0124 |
476
- | 1.8116 | 33000 | 0.0133 |
477
- | 1.8390 | 33500 | 0.0118 |
478
- | 1.8665 | 34000 | 0.0122 |
479
- | 1.8939 | 34500 | 0.0114 |
480
- | 1.9214 | 35000 | 0.0116 |
481
- | 1.9488 | 35500 | 0.0113 |
482
- | 1.9763 | 36000 | 0.0115 |
483
- | 2.0037 | 36500 | 0.0105 |
484
- | 2.0312 | 37000 | 0.0056 |
485
- | 2.0586 | 37500 | 0.0056 |
486
- | 2.0861 | 38000 | 0.0051 |
487
- | 2.1135 | 38500 | 0.0053 |
488
- | 2.1410 | 39000 | 0.0054 |
489
- | 2.1684 | 39500 | 0.0052 |
490
- | 2.1959 | 40000 | 0.0053 |
491
- | 2.2233 | 40500 | 0.0054 |
492
- | 2.2508 | 41000 | 0.0051 |
493
- | 2.2782 | 41500 | 0.0052 |
494
- | 2.3057 | 42000 | 0.0052 |
495
- | 2.3331 | 42500 | 0.0046 |
496
- | 2.3606 | 43000 | 0.0048 |
497
- | 2.3880 | 43500 | 0.0051 |
498
- | 2.4155 | 44000 | 0.0049 |
499
- | 2.4429 | 44500 | 0.0047 |
500
- | 2.4704 | 45000 | 0.0047 |
501
- | 2.4978 | 45500 | 0.0048 |
502
- | 2.5253 | 46000 | 0.005 |
503
- | 2.5527 | 46500 | 0.0049 |
504
- | 2.5801 | 47000 | 0.0047 |
505
- | 2.6076 | 47500 | 0.0046 |
506
- | 2.6350 | 48000 | 0.0048 |
507
- | 2.6625 | 48500 | 0.0045 |
508
- | 2.6899 | 49000 | 0.0043 |
509
- | 2.7174 | 49500 | 0.0047 |
510
- | 2.7448 | 50000 | 0.0045 |
511
- | 2.7723 | 50500 | 0.0046 |
512
- | 2.7997 | 51000 | 0.0046 |
513
- | 2.8272 | 51500 | 0.0044 |
514
- | 2.8546 | 52000 | 0.0042 |
515
- | 2.8821 | 52500 | 0.0045 |
516
- | 2.9095 | 53000 | 0.0045 |
517
- | 2.9370 | 53500 | 0.0043 |
518
- | 2.9644 | 54000 | 0.0044 |
519
- | 2.9919 | 54500 | 0.0043 |
520
 
521
- </details>
522
 
523
- ### Framework Versions
524
- - Python: 3.11.12
525
- - Sentence Transformers: 4.1.0
526
- - Transformers: 4.52.3
527
- - PyTorch: 2.6.0+cu124
528
- - Accelerate: 1.6.0
529
- - Datasets: 3.6.0
530
- - Tokenizers: 0.21.1
531
 
532
- ## Citation
 
533
 
534
- ### BibTeX
 
535
 
536
- #### Sentence Transformers
537
- ```bibtex
538
- @inproceedings{reimers-2019-sentence-bert,
539
- title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
540
- author = "Reimers, Nils and Gurevych, Iryna",
541
- booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
542
- month = "11",
543
- year = "2019",
544
- publisher = "Association for Computational Linguistics",
545
- url = "https://arxiv.org/abs/1908.10084",
546
- }
547
- ```
548
 
549
- #### MultipleNegativesRankingLoss
550
- ```bibtex
551
- @misc{henderson2017efficient,
552
- title={Efficient Natural Language Response Suggestion for Smart Reply},
553
- author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
554
- year={2017},
555
- eprint={1705.00652},
556
- archivePrefix={arXiv},
557
- primaryClass={cs.CL}
558
- }
559
- ```
560
 
561
- <!--
562
- ## Glossary
563
 
564
- *Clearly define terms in order to be accessible across audiences.*
565
- -->
 
 
 
 
 
566
 
567
- <!--
568
- ## Model Card Authors
569
 
570
- *Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
571
- -->
 
572
 
573
- <!--
574
- ## Model Card Contact
575
 
576
- *Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
577
- -->
 
 
1
  ---
2
  tags:
3
+ - code
4
+ - python
5
+ - java
6
+ - javascript
7
+ - go
8
+ - ruby
9
+ - rust
10
+ - typescript
11
+ - php
12
+ - sentence-transformer
13
+ base_model:
14
+ - Shuu12121/CodeModernBERT-Owl-2.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  pipeline_tag: sentence-similarity
16
  library_name: sentence-transformers
17
+ license: apache-2.0
18
+ language:
19
+ - en
20
+ datasets:
21
+ - Shuu12121/python-codesearch-dedupe-filtered
22
+ - Shuu12121/java-codesearch-dedupe-filtered
23
+ - Shuu12121/javascript-codesearch-dedupe-filtered
24
+ - Shuu12121/typescipt-codesearch-dedupe-filtered
25
+ - Shuu12121/go-codesearch-dedupe-filtered
26
+ - Shuu12121/ruby-codesearch-dedupe-filtered
27
+ - Shuu12121/rust-codesearch-dedupe-filtered
28
+ - Shuu12121/php-codesearch-dedupe-filtered
29
 
30
+ ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
+ # 🦉 CodeSearch-ModernBERT-Owl-2.0-Plus
 
33
 
34
+ ## 日本語版(Japanese)
 
35
 
36
+ **Shuu12121/CodeSearch-ModernBERT-Owl-2.0-Plus** は、マルチリンガルなコード理解・検索のために設計された **CodeModernBERT-Owl** 系列の最新事前学習モデルです。
37
+ 本モデルは、VSCode拡張機能 [**OwlSpotlight**](https://marketplace.visualstudio.com/items?itemName=Shun0212.owlspotlight) にて使用されており、関数レベルの意味的コード検索を実現します。
38
 
39
+ ### 🔧 特徴
 
40
 
41
+ - **独自コーパスで事前学習**
42
+ CodeBERT (Feng et al., 2020) の約4倍の規模となる、完全独自収集の高品質なコード・docstringコーパスを用いて事前学習。
43
 
44
+ - **8言語対応**
45
+ Python, Java, JavaScript, PHP, Ruby, Go, Rust に加えて、**TypeScript** を新たにサポート。
46
 
47
+ - **長文対応(最大8192トークン)**
48
+ 訓練時最大2048トークン、推論時には8192トークンまでの入力を処理可能(Position Embedding拡張済み)。
49
 
50
+ - **徹底したノイズ除去・データクリーニング**
51
+ - Tree-sitter による関数・docstring抽出
52
+ - 無意味な定型コメント・多言語ノイズの除去
53
+ - シークレット・APIキーの自動マスキング
54
+ - ライセンス記述の除外
55
+ - 重複関数の除去によるリーク対策
56
 
57
+ ### 📦 基本情報
58
 
59
+ | 項目 | 内容 |
60
+ |------|------|
61
+ | モデル名 | Shuu12121/CodeSearch-ModernBERT-Owl-2.0-Plus |
62
+ | モデルサイズ | 約150Mパラメータ(ModernBERTベース) |
63
+ | 対応言語 | Python, Java, JavaScript, PHP, Ruby, Go, Rust, TypeScript |
64
+ | 最大トークン長 | 学習時: 2048 / 推論時: 8192 |
65
+ | トークナイザ | 独自BPE(52,000語彙) |
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
+ ### 🚀 主な用途
 
68
 
69
+ - 関数レベルの意味的コード検索(自然言語 → 関数コード)
70
+ - コード補完・要約・分類・クローン検出などの下流タスク
71
+ - Retrieval-Augmented Generation(RAG)における高精度なコード検索
 
72
 
73
+ ### 🧪 利用例:VSCode拡張「[OwlSpotlight](https://github.com/Shun0212/OwlSpotLight)」
 
74
 
75
+ 本モデルは、[OwlSpotlight](https://marketplace.visualstudio.com/items?itemName=Shun0212.owlspotlight) に組み込まれており、自然言語による直感的な関数検索が可能です。
76
+ 🖥 Mac(Mシリーズ含む)上で軽量に動作し、開発中のコードベースに即座にインデックス作成・検索できます。
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
+ ---
79
 
80
+ ## English Version
 
81
 
82
+ **Shuu12121/CodeSearch-ModernBERT-Owl-2.0-Plus** is the latest pretrained model in the multilingual **CodeModernBERT-Owl** series, designed for high-quality code understanding and semantic retrieval.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
+ It powers the [**OwlSpotlight**](https://marketplace.visualstudio.com/items?itemName=Shun0212.owlspotlight) extension for VSCode, enabling fast and intuitive natural language to code search at the function level.
85
 
86
+ ### 🔧 Highlights
 
 
 
 
 
 
 
87
 
88
+ - **Pretrained on a custom large-scale corpus**
89
+ The training corpus is ~4x larger than CodeBERT’s bimodal dataset and built entirely from scratch, ensuring high-quality code and documentation pairs.
90
 
91
+ - **Supports 8 programming languages**
92
+ Python, Java, JavaScript, PHP, Ruby, Go, Rust, and newly added **TypeScript**.
93
 
94
+ - **Long-sequence input support**
95
+ Trained on sequences up to 2048 tokens, and extended to handle 8192 tokens at inference.
 
 
 
 
 
 
 
 
 
 
96
 
97
+ - **Robust data cleaning & filtering**
98
+ - Tree-sitter-based function/docstring extraction
99
+ - Removal of templated or non-English comments
100
+ - API key and secret masking
101
+ - License-related content exclusion
102
+ - Deduplication for data leakage prevention
 
 
 
 
 
103
 
104
+ ### 📦 Model Specs
 
105
 
106
+ | Item | Detail |
107
+ |------|--------|
108
+ | Name | Shuu12121/CodeSearch-ModernBERT-Owl-2.0-Plus |
109
+ | Size | ~150M parameters (ModernBERT backbone) |
110
+ | Supported Languages | Python, Java, JavaScript, PHP, Ruby, Go, Rust, TypeScript |
111
+ | Max Token Length | 2048 (train), 8192 (inference) |
112
+ | Tokenizer | Custom BPE tokenizer (52k vocab) |
113
 
114
+ ### 🚀 Use Cases
 
115
 
116
+ - Function-level semantic code search (natural language → code)
117
+ - Code completion, summarization, classification, and clone detection
118
+ - Retrieval for RAG systems
119
 
120
+ ### 🧪 Real-World Use: [OwlSpotlight](https://github.com/Shun0212/OwlSpotLight)
 
121
 
122
+ This model is used in the [OwlSpotlight](https://marketplace.visualstudio.com/items?itemName=Shun0212.owlspotlight) VSCode extension.
123
+ Search through your Python codebase using plain English and jump instantly to relevant functions with semantic understanding.
124
+ Tested and optimized for macOS (including Apple Silicon).