Fix model ID and update dataset IDs to minishlab convention
Browse files
README.md
CHANGED
|
@@ -30,7 +30,7 @@ pip install model2vec
|
|
| 30 |
```python
|
| 31 |
from model2vec import StaticModel
|
| 32 |
|
| 33 |
-
model = StaticModel.from_pretrained("
|
| 34 |
|
| 35 |
# Embed natural language queries
|
| 36 |
query_embeddings = model.encode(["How to read a file in Python?"])
|
|
@@ -84,7 +84,7 @@ CoIR covers a broad range of code retrieval scenarios. For the use case of findi
|
|
| 84 |
|
| 85 |
## Reproducibility
|
| 86 |
|
| 87 |
-
The following script reproduces this model end-to-end. It requires the tokenlearn training data from `
|
| 88 |
|
| 89 |
```python
|
| 90 |
"""Reproduction script for potion-code-16M.
|
|
@@ -144,8 +144,8 @@ PCA_DIMS = 256
|
|
| 144 |
SIF_COEFFICIENT = 1e-4
|
| 145 |
|
| 146 |
# Tokenlearn
|
| 147 |
-
TOKENLEARN_DOCS_DATASET = "
|
| 148 |
-
TOKENLEARN_QUERIES_DATASET = "
|
| 149 |
TOKENLEARN_LANGUAGES = ["go", "java", "javascript", "php", "python", "ruby"]
|
| 150 |
TOKENLEARN_MAX_PER_LANGUAGE = 20_000 # (20k docs + 20k queries) × 6 langs = 240k total
|
| 151 |
TOKENLEARN_LR = 1e-3
|
|
|
|
| 30 |
```python
|
| 31 |
from model2vec import StaticModel
|
| 32 |
|
| 33 |
+
model = StaticModel.from_pretrained("minishlab/potion-code-16M")
|
| 34 |
|
| 35 |
# Embed natural language queries
|
| 36 |
query_embeddings = model.encode(["How to read a file in Python?"])
|
|
|
|
| 84 |
|
| 85 |
## Reproducibility
|
| 86 |
|
| 87 |
+
The following script reproduces this model end-to-end. It requires the tokenlearn training data from `minishlab/tokenlearn-cornstack-docs-coderankembed` and `minishlab/tokenlearn-cornstack-queries-coderankembed`; 20k samples per language are used from each dataset.
|
| 88 |
|
| 89 |
```python
|
| 90 |
"""Reproduction script for potion-code-16M.
|
|
|
|
| 144 |
SIF_COEFFICIENT = 1e-4
|
| 145 |
|
| 146 |
# Tokenlearn
|
| 147 |
+
TOKENLEARN_DOCS_DATASET = "minishlab/tokenlearn-cornstack-docs-coderankembed"
|
| 148 |
+
TOKENLEARN_QUERIES_DATASET = "minishlab/tokenlearn-cornstack-queries-coderankembed"
|
| 149 |
TOKENLEARN_LANGUAGES = ["go", "java", "javascript", "php", "python", "ruby"]
|
| 150 |
TOKENLEARN_MAX_PER_LANGUAGE = 20_000 # (20k docs + 20k queries) × 6 langs = 240k total
|
| 151 |
TOKENLEARN_LR = 1e-3
|