Spaces: Running on Zero
Update audio_tokenizer.py
Browse files: audio_tokenizer.py (+48 -25)
audio_tokenizer.py
CHANGED
|
@@ -124,31 +124,54 @@ def process_dataset(
|
|
| 124 |
text_field: Name of text field in dataset (default: "text_scribe")
|
| 125 |
target_sample_rate: Target audio sample rate (default: 24000)
|
| 126 |
"""
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
|
| 153 |
# Download dataset
|
| 154 |
print(f"Downloading dataset: {original_dataset}")
|
|
|
|
| 124 |
text_field: Name of text field in dataset (default: "text_scribe")
|
| 125 |
target_sample_rate: Target audio sample rate (default: 24000)
|
| 126 |
"""
|
| 127 |
+
try:
|
| 128 |
+
# Set tokenizer and config based on model type
|
| 129 |
+
print(f"Setting up configuration for model_type: {model_type}")
|
| 130 |
+
if model_type == "qwen3":
|
| 131 |
+
tokenizer_model = "Qwen/Qwen3-0.6B"
|
| 132 |
+
config_path = "qwen3.yaml"
|
| 133 |
+
elif model_type == "lfm2":
|
| 134 |
+
tokenizer_model = "LiquidAI/LFM2-350M"
|
| 135 |
+
config_path = "lfm2.yaml"
|
| 136 |
+
else:
|
| 137 |
+
raise ValueError(f"Invalid model_type: {model_type}. Must be 'qwen3' or 'lfm2'")
|
| 138 |
+
|
| 139 |
+
print(f"Tokenizer Model: {tokenizer_model}")
|
| 140 |
+
print(f"Config Path: {config_path}")
|
| 141 |
+
|
| 142 |
+
# Load configuration
|
| 143 |
+
print(f"Loading config from: {config_path}")
|
| 144 |
+
config = load_config(config_path)
|
| 145 |
+
print(f"Config loaded successfully. Type: {type(config)}")
|
| 146 |
+
|
| 147 |
+
if not isinstance(config, dict):
|
| 148 |
+
raise TypeError(f"Config must be a dictionary, got {type(config)}")
|
| 149 |
+
|
| 150 |
+
except Exception as e:
|
| 151 |
+
print(f"Error in initial setup: {str(e)}")
|
| 152 |
+
raise
|
| 153 |
+
|
| 154 |
+
try:
|
| 155 |
+
print("Extracting config values...")
|
| 156 |
+
TOKENIZER_LENGTH = config['TOKENIZER_LENGTH']
|
| 157 |
+
START_OF_TEXT = config['START_OF_TEXT']
|
| 158 |
+
END_OF_TEXT = config['END_OF_TEXT']
|
| 159 |
+
START_OF_SPEECH = config['START_OF_SPEECH']
|
| 160 |
+
END_OF_SPEECH = config['END_OF_SPEECH']
|
| 161 |
+
START_OF_HUMAN = config['START_OF_HUMAN']
|
| 162 |
+
END_OF_HUMAN = config['END_OF_HUMAN']
|
| 163 |
+
START_OF_AI = config['START_OF_AI']
|
| 164 |
+
END_OF_AI = config['END_OF_AI']
|
| 165 |
+
PAD_TOKEN = config['PAD_TOKEN']
|
| 166 |
+
AUDIO_TOKENS_START = config['AUDIO_TOKENS_START']
|
| 167 |
+
print("✓ All config values extracted successfully")
|
| 168 |
+
except KeyError as e:
|
| 169 |
+
print(f"Missing key in config: {e}")
|
| 170 |
+
print(f"Available keys: {list(config.keys())}")
|
| 171 |
+
raise
|
| 172 |
+
except Exception as e:
|
| 173 |
+
print(f"Error extracting config values: {str(e)}")
|
| 174 |
+
raise
|
| 175 |
|
| 176 |
# Download dataset
|
| 177 |
print(f"Downloading dataset: {original_dataset}")
|