Spaces:
				
			
			
	
			
			
		Build error
		
	
	
	
			
			
	
	
	
	
		
		
		Build error
		
	| import numpy as np | |
| import librosa | |
| import soundfile as sf | |
| from datasets import load_dataset | |
| from transformers import pipeline | |
# Shared model pipelines, built once at import time and reused by the functions below.
# `asr` performs English speech recognition; `narrator` synthesizes speech from text.
asr = pipeline(
    task="automatic-speech-recognition",
    model="distil-whisper/distil-small.en",
)
narrator = pipeline(
    task="text-to-speech",
    model="kakao-enterprise/vits-ljs",
)
# Speech-to-Text Function
def transcribe_speech(filepath):
    """Transcribe a short audio recording with the module-level ``asr`` pipeline.

    Args:
        filepath: Path to the audio file to transcribe, or ``None`` when no
            recording was supplied.

    Returns:
        The ``"text"`` entry of the pipeline output, or the message
        ``"No audio found. Please retry."`` when ``filepath`` is ``None``
        or a blank string.
    """
    # A blank path can never name a readable file, so treat it like a
    # missing recording instead of handing it to the pipeline.
    if filepath is None or (isinstance(filepath, str) and not filepath.strip()):
        return "No audio found. Please retry."
    return asr(filepath)["text"]
# Long-Form Audio Transcription
def transcribe_long_form(filepath, *, chunk_length_s=30, batch_size=4):
    """Transcribe a long recording by chunking it through the ``asr`` pipeline.

    The file is read with soundfile, mixed down to one channel, resampled to
    16 kHz and passed to the pipeline with timestamps enabled so the result
    is returned as a list of chunks.

    Args:
        filepath: Path to the audio file, or ``None`` when no recording was
            supplied.
        chunk_length_s: Length in seconds of each chunk given to the
            pipeline (defaults to the previously hard-coded 30).
        batch_size: Number of chunks processed per batch (defaults to the
            previously hard-coded 4).

    Returns:
        The text of every chunk joined with newlines, or the message
        ``"No audio found. Please retry."`` when ``filepath`` is ``None``
        or a blank string.
    """
    # A blank path can never name a readable file, so treat it like a
    # missing recording instead of letting the file reader fail.
    if filepath is None or (isinstance(filepath, str) and not filepath.strip()):
        return "No audio found. Please retry."

    # Load and preprocess audio
    audio, sampling_rate = sf.read(filepath)
    # The array is transposed before the mono mix-down; presumably soundfile
    # returns (frames, channels) while librosa wants channels first -- see
    # the soundfile/librosa docs. A 1-D (already mono) array is unaffected.
    audio_mono = librosa.to_mono(np.transpose(audio))
    audio_16khz = librosa.resample(audio_mono, orig_sr=sampling_rate, target_sr=16000)

    # Transcribe using ASR pipeline
    result = asr(
        audio_16khz,
        chunk_length_s=chunk_length_s,
        batch_size=batch_size,
        return_timestamps=True,
    )

    # Combine all transcriptions
    return "\n".join(chunk["text"] for chunk in result["chunks"])
# Text-to-Speech Function
def text_to_speech(text):
    """Synthesize speech for ``text`` with the module-level ``narrator`` pipeline.

    Args:
        text: The text to narrate. ``None`` and whitespace-only strings are
            treated as "no text".

    Returns:
        A ``(sampling_rate, audio_array)`` tuple where ``audio_array`` is a
        1-D waveform, or the message
        ``"No text provided. Please enter text to synthesize."`` when there
        is nothing to narrate.
        NOTE(review): the empty-input case returns a string rather than an
        audio tuple -- confirm the caller handles both shapes.
    """
    # Guard against None as well: calling .strip() on it would raise.
    if text is None or not text.strip():
        return "No text provided. Please enter text to synthesize."

    narrated = narrator(text)
    audio = narrated["audio"]
    # Only index away a leading dimension when there is one; indexing [0]
    # into an already 1-D waveform would keep a single sample.
    if audio.ndim > 1:
        audio = audio[0]
    audio_array = audio.flatten()
    return narrated["sampling_rate"], audio_array
# Sample Dataset Access Function
def get_dataset_sample(idx):
    """Return one of the first five LibriSpeech ``train.clean.100`` samples.

    The dataset is opened in streaming mode on every call and only its first
    five records are materialized.

    Args:
        idx: Position of the sample within those five records. Values that
            ``int()`` accepts (for example ``2.0``) are coerced to ``int``;
            fractional values are truncated.

    Returns:
        ``((audio_array, sampling_rate), transcription)`` taken from the
        sample's ``"audio"`` and ``"text"`` fields.
        NOTE(review): the inner tuple is ordered (array, sampling_rate) --
        confirm this is the order the consumer expects.

    Raises:
        IndexError: If ``idx`` falls outside the five loaded samples.
    """
    dataset = load_dataset(
        "librispeech_asr",
        split="train.clean.100",
        streaming=True,
        trust_remote_code=True,
    )
    dataset_head = list(dataset.take(5))
    # List indexing rejects floats, so normalise the index first.
    sample = dataset_head[int(idx)]
    audio = sample["audio"]
    return (audio["array"], audio["sampling_rate"]), sample["text"]