import gradio as gr
import numpy as np
import torch
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor

# Load Whisper large-v3, using fp16 on GPU when available, and build an ASR pipeline.
model_id = 'openai/whisper-large-v3'
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
processor = AutoProcessor.from_pretrained(model_id)

pipe_asr = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=15,
    batch_size=16,
    torch_dtype=torch_dtype,
    device=device,
    return_timestamps=False,
)


def ensure_mono(y):
    """Downmix multi-channel audio to mono by averaging the channels."""
    if y.ndim > 1 and y.shape[1] > 1:
        y = np.mean(y, axis=1)
    return y


def transcribe_function(new_chunk, state):
    """Accumulate streamed microphone chunks in `state` and re-transcribe the full buffer."""
    try:
        sr, y = new_chunk
    except TypeError:
        print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
        # Two outputs are wired up (state, output_text), so return exactly two values.
        return state, ""
    y = ensure_mono(y)
    # Normalize to [-1, 1]; skip scaling for silent chunks to avoid division by zero.
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y = y / peak
    if state is not None:
        state = np.concatenate([state, y])
    else:
        state = y
    result = pipe_asr({"array": state, "sampling_rate": sr}, return_timestamps=False)
    full_text = result.get("text", "")
    return state, full_text


def upload_transcribe(file):
    """Transcribe a complete uploaded audio file in one pass."""
    if file is None:
        return ""
    sr, y = file
    y = ensure_mono(y)
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y = y / peak
    result = pipe_asr({"array": y, "sampling_rate": sr}, return_timestamps=False)
    return result.get("text", "")


with gr.Blocks() as demo:
    gr.Markdown("# Voice to Text Transcription")
    state = gr.State(None)
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(sources=["microphone"], streaming=True, type='numpy', label="Microphone Input")
            audio_upload = gr.Audio(sources=["upload"], type='numpy', label="Upload Audio File")
        with gr.Column():
            output_text = gr.Textbox(label="Transcription")
            upload_text = gr.Textbox(label="Uploaded Audio Transcription")

    # Streaming microphone input re-runs transcription on the growing buffer;
    # file uploads are transcribed once when the component value changes.
    audio_input.stream(transcribe_function, inputs=[audio_input, state], outputs=[state, output_text], api_name="SAMLOne_real_time")
    audio_upload.change(upload_transcribe, inputs=audio_upload, outputs=upload_text)

demo.launch(show_error=True)