Spaces:
Running
Running
Update database.py
Browse files — database.py: +5 −4
database.py
CHANGED
@@ -11,8 +11,9 @@ import torch
|
|
11 |
# User-configurable variables
|
12 |
DB_NAME = "python_programs" # ChromaDB collection name
|
13 |
HF_DATASET_NAME = "python_program_vectors" # Hugging Face Dataset name
|
14 |
-
|
15 |
PERSIST_DIR = "./chroma_data" # Directory for persistent storage (optional)
|
|
|
16 |
|
17 |
def init_chromadb(persist_dir=PERSIST_DIR):
|
18 |
"""Initialize ChromaDB client, optionally with persistent storage."""
|
@@ -161,7 +162,7 @@ def generate_description_tokens(sequence, vectors):
|
|
161 |
tokens.append(f"span:{vec[3]:.2f}")
|
162 |
return tokens
|
163 |
|
164 |
-
def generate_semantic_vector(description, use_gpu=False):
|
165 |
"""Generate a semantic vector for a textual description using CodeBERT, with CPU/GPU option."""
|
166 |
# Load CodeBERT model and tokenizer
|
167 |
model_name = "microsoft/codebert-base"
|
@@ -186,7 +187,7 @@ def generate_semantic_vector(description, use_gpu=False):
|
|
186 |
vector = vector[:6]
|
187 |
return vector
|
188 |
|
189 |
-
def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=HF_TOKEN):
|
190 |
"""Save ChromaDB data to Hugging Face Dataset."""
|
191 |
client = init_chromadb()
|
192 |
collection = create_collection(client)
|
@@ -207,7 +208,7 @@ def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=HF_TOKEN):
|
|
207 |
dataset.push_to_hub(dataset_name, token=token)
|
208 |
print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
|
209 |
|
210 |
-
def load_chromadb_from_hf(dataset_name=HF_DATASET_NAME, token=HF_TOKEN):
|
211 |
"""Load ChromaDB data from Hugging Face Dataset, handle empty dataset."""
|
212 |
try:
|
213 |
dataset = load_dataset(dataset_name, split="train", token=token)
|
|
|
11 |
# User-configurable variables
|
12 |
DB_NAME = "python_programs" # ChromaDB collection name
|
13 |
HF_DATASET_NAME = "python_program_vectors" # Hugging Face Dataset name
|
14 |
+
HF_KEY = "YOUR_HUGGINGFACE_TOKEN" # Replace with your Hugging Face API token
|
15 |
PERSIST_DIR = "./chroma_data" # Directory for persistent storage (optional)
|
16 |
+
USE_GPU = False # Default to CPU, set to True for GPU if available
|
17 |
|
18 |
def init_chromadb(persist_dir=PERSIST_DIR):
|
19 |
"""Initialize ChromaDB client, optionally with persistent storage."""
|
|
|
162 |
tokens.append(f"span:{vec[3]:.2f}")
|
163 |
return tokens
|
164 |
|
165 |
+
def generate_semantic_vector(description, use_gpu=USE_GPU):
|
166 |
"""Generate a semantic vector for a textual description using CodeBERT, with CPU/GPU option."""
|
167 |
# Load CodeBERT model and tokenizer
|
168 |
model_name = "microsoft/codebert-base"
|
|
|
187 |
vector = vector[:6]
|
188 |
return vector
|
189 |
|
190 |
+
def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=HF_KEY):
|
191 |
"""Save ChromaDB data to Hugging Face Dataset."""
|
192 |
client = init_chromadb()
|
193 |
collection = create_collection(client)
|
|
|
208 |
dataset.push_to_hub(dataset_name, token=token)
|
209 |
print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
|
210 |
|
211 |
+
def load_chromadb_from_hf(dataset_name=HF_DATASET_NAME, token=HF_KEY):
|
212 |
"""Load ChromaDB data from Hugging Face Dataset, handle empty dataset."""
|
213 |
try:
|
214 |
dataset = load_dataset(dataset_name, split="train", token=token)
|