import os

import gradio as gr
import torch
import torchaudio
from transformers import WhisperTokenizer, pipeline

# Tokenizer from the base Whisper checkpoint; a language can also be pinned here
# (e.g. language="marathi").
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", task="transcribe")

# Fine-tuned Konkani model (alternative checkpoints: "thak123/whisper-small-LDC-V1",
# "thak123/whisper-small-gom").
pipe = pipeline(
    model="thak123/gom-stt-v3",
    task="automatic-speech-recognition",
    tokenizer=tokenizer,
)

# pipe.model.config.forced_decoder_ids = (
#     pipe.tokenizer.get_decoder_prompt_ids(language="marathi", task="transcribe")
# )
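# Note: instead of hard-coding forced_decoder_ids as above, the task (and, if needed,
# the language) can be passed per call via generate_kwargs — the same approach the
# commented-out pipeline call further below uses. A minimal sketch, assuming the
# module-level `pipe` defined above and one of the bundled example files:
#
#     result = pipe("audio/ekdonteen.flac", generate_kwargs={"task": "transcribe"})
#     print(result["text"])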
# Earlier pipeline-based version, kept for reference:
# def transcribe_speech(filepath):
#     # waveform, sample_rate = torchaudio.load(filepath)
#     # Resample the audio signal to a 16 kHz sampling rate
#     # resampler = torchaudio.transforms.Resample(sample_rate, 16000)
#     # waveform_16k = resampler(waveform)
#     # Save the resampled audio signal to a new file
#     # torchaudio.save(filepath, waveform_16k, 16000)
#     output = pipe(
#         filepath,
#         max_new_tokens=3,
#         generate_kwargs={
#             "task": "transcribe",
#             # "language": "konkani",
#         },  # update with the language you've fine-tuned on
#         chunk_length_s=30,
#         batch_size=8,
#         # sampling_rate=16000,
#         # padding=True
#     )
#     print(output)
#     return output["text"]
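# The resampling step sketched in the comments above can also be done with torchaudio
# (already imported at the top). A small sketch, assuming a hypothetical helper name
# resample_to_16k; transcribe_speech below instead relies on librosa.load(sr=16000):
def resample_to_16k(filepath):
    waveform, sample_rate = torchaudio.load(filepath)
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
    return waveform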
def transcribe_speech(filepath):
    from transformers import WhisperProcessor, WhisperForConditionalGeneration
    import librosa

    # Load the fine-tuned model and the base processor (feature extractor + tokenizer).
    model = WhisperForConditionalGeneration.from_pretrained("thak123/gom-stt-v3")
    processor = WhisperProcessor.from_pretrained("openai/whisper-small")

    output = ""
    # Load the audio and resample it to 16 kHz, as expected by Whisper.
    audio_path = filepath
    audio, sr = librosa.load(audio_path, sr=16000)
    input_features = processor(
        audio, sampling_rate=16000, return_tensors="pt",
        truncation=False, padding="max_length",
    ).input_features

    # Whisper's feature extractor produces 3000 frames for 30 s of audio;
    # anything longer has to be split into chunks.
    if input_features.shape[-1] > 3000:
        print("Splitting audio required")
        from pydub import AudioSegment

        def split_audio(file_path, chunk_length_ms=30000):  # 30-second chunks
            segment = AudioSegment.from_file(file_path)
            return [segment[i:i + chunk_length_ms] for i in range(0, len(segment), chunk_length_ms)]

        # Split, transcribe each chunk with the module-level pipeline, and concatenate.
        for i, chunk in enumerate(split_audio(audio_path)):
            chunk.export(f"chunk_{i}.wav", format="wav")
            result = pipe(f"chunk_{i}.wav")
            output += result["text"] + " "
            print(f"Chunk {i}: {result['text']}")
    else:
        predicted_ids = model.generate(input_features)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
        output = transcription[0]
        print(transcription)
    return output
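# Alternative sketch for long files: the ASR pipeline can do chunked long-form decoding
# itself (chunk_length_s / batch_size, as in the commented-out version above), which
# avoids the manual pydub splitting. Assumes the module-level `pipe`; the function name
# is hypothetical and not wired into the Gradio UI below.
def transcribe_speech_chunked(filepath):
    result = pipe(
        filepath,
        chunk_length_s=30,
        batch_size=8,
        generate_kwargs={"task": "transcribe"},
    )
    return result["text"]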
demo = gr.Blocks()

mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=gr.components.Textbox(),
)

file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources=["upload"], type="filepath"),
    outputs=gr.components.Textbox(),
    examples=[
        [os.path.join(os.path.dirname("."), "audio/chalyaami.mp3")],
        [os.path.join(os.path.dirname("."), "audio/ekdonteen.flac")],
        [os.path.join(os.path.dirname("."), "audio/heyatachadjaale.mp3")],
        [os.path.join(os.path.dirname("."), "audio/panaji1920-9.mp3")],
    ],
)

with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

demo.launch(debug=True)
# Earlier single-Interface versions, kept for reference:
#
# def transcribe(audio):
#     # text = pipe(audio)["text"]
#     # pipe(audio)
#     text = pipe(audio)
#     print("op", text)
#     return text
#
# iface = gr.Interface(
#     fn=transcribe,
#     inputs=[gr.Audio(sources=["microphone", "upload"])],
#     outputs="text",
#     examples=[
#         [os.path.join(os.path.dirname("."), "audio/chalyaami.mp3")],
#         [os.path.join(os.path.dirname("."), "audio/ekdonteen.flac")],
#         [os.path.join(os.path.dirname("."), "audio/heyatachadjaale.mp3")],
#     ],
#     title="Whisper Konkani",
#     description="Realtime demo for Konkani speech recognition using a fine-tuned Whisper small model.",
# )
# iface.launch()
#
# from transformers import WhisperTokenizer, pipeline
# import gradio as gr
# import os
#
# tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="marathi", task="transcribe")
# pipe = pipeline(model="thak123/gom-stt-v3", task="automatic-speech-recognition", tokenizer=tokenizer)
#
# def transcribe(audio):
#     result = pipe(audio)
#     text = result["text"]  # the ASR pipeline returns a dict, not a list
#     print("op", text)
#     return text
#
# iface = gr.Interface(
#     fn=transcribe,
#     inputs=[gr.Audio(sources=["microphone", "upload"])],
#     outputs="text",
#     examples=[
#         [os.path.join(os.path.dirname("."), "audio/chalyaami.mp3")],
#         [os.path.join(os.path.dirname("."), "audio/ekdonteen.flac")],
#         [os.path.join(os.path.dirname("."), "audio/heyatachadjaale.mp3")],
#     ],
#     title="Whisper Konkani",
#     description="Realtime demo for Konkani speech recognition using a fine-tuned Whisper small model.",
# )
# iface.launch()