# Scraped page header (not part of the program; line-number gutter omitted):
# Spaces: Sleeping
# File size: 1,840 Bytes
# 4f198be
import streamlit as st
import librosa
from transformers import WhisperProcessor, WhisperForConditionalGeneration, WhisperConfig, AutoConfig
import requests
import io
# Initialize the processor and model at module level, outside the functions
# that use them.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

# Specify the path to your config.json file.
config_url = "https://raw.githubusercontent.com/yash-412/Yash-Projects/main/marathi-enhanced/config.json"

# Fetching the config is best-effort: a non-200 status already fell back to
# config=None, so connection errors, timeouts and non-JSON bodies are given the
# same fallback instead of hanging or crashing the app at startup.
try:
    # The timeout (seconds) bounds how long startup can block on the download.
    config_response = requests.get(config_url, timeout=10)
except requests.RequestException:
    config_response = None

config_dict = None
if config_response is not None and config_response.status_code == 200:
    try:
        config_dict = config_response.json()
    except ValueError:
        # The body was not valid JSON; proceed without the remote config.
        config_dict = None

# An empty or missing dict yields config=None, which is passed straight through
# to from_pretrained below.
config = WhisperConfig.from_dict(config_dict) if config_dict else None
model = WhisperForConditionalGeneration.from_pretrained("yash-412/whisper-marathi", config=config)

# Set the decoder prompt ids for language "mr" and task "transcribe".
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="mr", task="transcribe")
def get_transcription(speech):
    """Transcribe an audio signal with the module-level Whisper processor and model.

    Args:
        speech: Audio samples handed to the processor, which is told they are
            sampled at 16000 Hz.

    Returns:
        The result of ``processor.batch_decode`` with special tokens skipped;
        the caller in this file reads element 0 of it.
    """
    # Turn the raw audio into the model's input features (PyTorch tensors).
    encoded = processor(speech, sampling_rate=16000, return_tensors="pt")

    # Generate token ids, then decode them back into text.
    token_ids = model.generate(encoded.input_features)
    return processor.batch_decode(token_ids, skip_special_tokens=True)
def main():
    """Render the Streamlit page: upload an audio file, preview it, transcribe on click.

    Nothing is returned; all output goes to the Streamlit page.
    """
    st.title("Marathi Enhanced-Whisper Transcription")
    st.write("Upload an audio file")
    uploaded_file = st.file_uploader("Choose an audio file", type=["mp3", "wav"])
    if not uploaded_file:
        return

    # Preview with the MIME type reported for the upload rather than always
    # claiming WAV (mp3 uploads are accepted too); keep 'audio/wav' as fallback.
    st.audio(uploaded_file, format=uploaded_file.type or 'audio/wav', start_time=0)

    if st.button("Transcribe"):
        # Decode only when a transcription is requested, so ordinary reruns of
        # the script do not pay for resampling the whole file.
        # getvalue() returns the full buffer regardless of the current stream
        # position, unlike read().
        audio_bytes = uploaded_file.getvalue()
        speech, _ = librosa.load(io.BytesIO(audio_bytes), sr=16000)

        transcription = get_transcription(speech)
        st.subheader("Transcription:")
        st.write(transcription[0])
# Run the app only when this file is executed as the main script.
if __name__ == "__main__":
    main()