kadirnar committed on
Commit
9b21e07
·
verified ·
1 Parent(s): 7d04939

Update audio_tokenizer.py

Browse files
Files changed (1) hide show
  1. audio_tokenizer.py +48 -25
audio_tokenizer.py CHANGED
@@ -124,31 +124,54 @@ def process_dataset(
124
  text_field: Name of text field in dataset (default: "text_scribe")
125
  target_sample_rate: Target audio sample rate (default: 24000)
126
  """
127
- # Set tokenizer and config based on model type
128
- if model_type == "qwen3":
129
- tokenizer_model = "Qwen/Qwen3-0.6B"
130
- config_path = "qwen3.yaml"
131
- elif model_type == "lfm2":
132
- tokenizer_model = "LiquidAI/LFM2-350M"
133
- config_path = "lfm2.yaml"
134
- else:
135
- raise ValueError(f"Invalid model_type: {model_type}. Must be 'qwen3' or 'lfm2'")
136
-
137
- # Load configuration
138
- print(f"Loading config from: {config_path}")
139
- config = load_config(config_path)
140
-
141
- TOKENIZER_LENGTH = config['TOKENIZER_LENGTH']
142
- START_OF_TEXT = config['START_OF_TEXT']
143
- END_OF_TEXT = config['END_OF_TEXT']
144
- START_OF_SPEECH = config['START_OF_SPEECH']
145
- END_OF_SPEECH = config['END_OF_SPEECH']
146
- START_OF_HUMAN = config['START_OF_HUMAN']
147
- END_OF_HUMAN = config['END_OF_HUMAN']
148
- START_OF_AI = config['START_OF_AI']
149
- END_OF_AI = config['END_OF_AI']
150
- PAD_TOKEN = config['PAD_TOKEN']
151
- AUDIO_TOKENS_START = config['AUDIO_TOKENS_START']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
 
153
  # Download dataset
154
  print(f"Downloading dataset: {original_dataset}")
 
124
  text_field: Name of text field in dataset (default: "text_scribe")
125
  target_sample_rate: Target audio sample rate (default: 24000)
126
  """
127
+ try:
128
+ # Set tokenizer and config based on model type
129
+ print(f"Setting up configuration for model_type: {model_type}")
130
+ if model_type == "qwen3":
131
+ tokenizer_model = "Qwen/Qwen3-0.6B"
132
+ config_path = "qwen3.yaml"
133
+ elif model_type == "lfm2":
134
+ tokenizer_model = "LiquidAI/LFM2-350M"
135
+ config_path = "lfm2.yaml"
136
+ else:
137
+ raise ValueError(f"Invalid model_type: {model_type}. Must be 'qwen3' or 'lfm2'")
138
+
139
+ print(f"Tokenizer Model: {tokenizer_model}")
140
+ print(f"Config Path: {config_path}")
141
+
142
+ # Load configuration
143
+ print(f"Loading config from: {config_path}")
144
+ config = load_config(config_path)
145
+ print(f"Config loaded successfully. Type: {type(config)}")
146
+
147
+ if not isinstance(config, dict):
148
+ raise TypeError(f"Config must be a dictionary, got {type(config)}")
149
+
150
+ except Exception as e:
151
+ print(f"Error in initial setup: {str(e)}")
152
+ raise
153
+
154
+ try:
155
+ print("Extracting config values...")
156
+ TOKENIZER_LENGTH = config['TOKENIZER_LENGTH']
157
+ START_OF_TEXT = config['START_OF_TEXT']
158
+ END_OF_TEXT = config['END_OF_TEXT']
159
+ START_OF_SPEECH = config['START_OF_SPEECH']
160
+ END_OF_SPEECH = config['END_OF_SPEECH']
161
+ START_OF_HUMAN = config['START_OF_HUMAN']
162
+ END_OF_HUMAN = config['END_OF_HUMAN']
163
+ START_OF_AI = config['START_OF_AI']
164
+ END_OF_AI = config['END_OF_AI']
165
+ PAD_TOKEN = config['PAD_TOKEN']
166
+ AUDIO_TOKENS_START = config['AUDIO_TOKENS_START']
167
+ print("✓ All config values extracted successfully")
168
+ except KeyError as e:
169
+ print(f"Missing key in config: {e}")
170
+ print(f"Available keys: {list(config.keys())}")
171
+ raise
172
+ except Exception as e:
173
+ print(f"Error extracting config values: {str(e)}")
174
+ raise
175
 
176
  # Download dataset
177
  print(f"Downloading dataset: {original_dataset}")