phuntshowangdi committed on
Commit 717afb8
1 Parent(s): 60255c0

Update app.py

Files changed (1)
  1. app.py +78 -63
app.py CHANGED
@@ -1,66 +1,81 @@
- import streamlit as st
  from transformers import pipeline
- import numpy as np
- from io import BytesIO
- import torchaudio
-
- # Load the ASR pipeline for speech-to-text
- whisper_asr = pipeline('automatic-speech-recognition', model='facebook/wav2vec2-base-960h')
-
- # Load the text generation pipeline for text-to-text conversion
- text_generator = pipeline("text2text-generation", model="t5-base", tokenizer="t5-base", framework="pt")
-
- # Streamlit UI
- st.title("Automatic Speech Recognition and Text Generation")
-
- # Dropdown menu to select audio file types
- selected_file_type = st.selectbox("Select audio file type", options=["WAV", "MP3"])
-
- # Map selected file type to corresponding file extensions
- file_extensions = {
-     "WAV": ["wav"],
-     "MP3": ["mp3"]
- }
-
- # File uploader with improved UI design
- uploaded_file = st.file_uploader(f"Upload an audio file ({selected_file_type})", type=file_extensions[selected_file_type])
-
- # ASR function to transcribe audio to text
- def transcribe_audio(audio_file):
-     if audio_file is not None:
-         with st.spinner("Transcribing..."):
-             # Read the content of the uploaded file
-             audio_content = audio_file.read()
-
-             # Load the audio using torchaudio.load
-             waveform, sample_rate = torchaudio.load(BytesIO(audio_content))
-
-             # Convert to mono if stereo
-             if waveform.shape[0] > 1:
-                 waveform = torch.mean(waveform, dim=0, keepdim=True)
-
-             # Convert the waveform to numpy array
-             waveform_np = waveform.numpy()
-
-             # Transcribe the audio
-             audio_text = whisper_asr(waveform_np)
-             st.success("Transcription Complete!")
-             return audio_text[0]['transcription']
-
- # Main app logic
- if uploaded_file is not None:
-     st.audio(uploaded_file, format='audio/mp3', start_time=0)
-     if st.button("Transcribe"):
-         transcribed_text = transcribe_audio(uploaded_file)
-         if transcribed_text:
-             st.subheader("Transcription:")
-             st.write(transcribed_text)
-
-             # Convert the transcribed audio to text
-             with st.spinner("Converting audio to text..."):
-                 generated_text = text_generator(transcribed_text)[0]['generated_text']
-                 st.success("Text Conversion Complete!")
-                 st.subheader("Generated Text:")
-                 st.write(generated_text)
+ import torch
  from transformers import pipeline
+ import gradio as gr
+
+ MODEL_NAME = "JackismyShephard/whisper-tiny-finetuned-minds14"
+ BATCH_SIZE = 8
+
+ device = 0 if torch.cuda.is_available() else "cpu"
+
+ pipe = pipeline(
+     task="automatic-speech-recognition",
+     model=MODEL_NAME,
+     chunk_length_s=30,
+     device=device,
+ )
+
+
+ # Copied from https://github.com/openai/whisper/blob/c09a7ae299c4c34c5839a76380ae407e7d785914/whisper/utils.py#L50
+ def format_timestamp(
+     seconds: float, always_include_hours: bool = False, decimal_marker: str = "."
+ ):
+     if seconds is not None:
+         milliseconds = round(seconds * 1000.0)
+
+         hours = milliseconds // 3_600_000
+         milliseconds -= hours * 3_600_000
+
+         minutes = milliseconds // 60_000
+         milliseconds -= minutes * 60_000
+
+         seconds = milliseconds // 1_000
+         milliseconds -= seconds * 1_000
+
+         hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
+         return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
+     else:
+         # we have a malformed timestamp so just return it as is
+         return seconds
+
+
+ def transcribe(file, return_timestamps):
+     outputs = pipe(
+         file,
+         batch_size=BATCH_SIZE,
+         return_timestamps=return_timestamps,
+     )
+     text = outputs["text"]
+     if return_timestamps:
+         timestamps = outputs["chunks"]
+         timestamps = [
+             f"[{format_timestamp(chunk['timestamp'][0])} -> {format_timestamp(chunk['timestamp'][1])}] {chunk['text']}"
+             for chunk in timestamps
+         ]
+         text = "\n".join(str(feature) for feature in timestamps)
+     return text
+
+
+ demo = gr.Interface(
+     fn=transcribe,
+     inputs=[
+         gr.Audio(label="Audio", type="filepath"),
+         gr.Checkbox(label="Return timestamps"),
+     ],
+     outputs=gr.Textbox(show_copy_button=True, label="Text"),
+     title="Automatic Speech Recognition",
+     description=(
+         "Transcribe or translate long-form audio file or microphone inputs with the click of a button! Demo uses the"
+         f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe or translate audio"
+         " of arbitrary length."
+     ),
+     examples=[
+         ["examples/example.flac", False],
+         ["examples/example.flac", True],
+     ],
+     cache_examples=True,
+     allow_flagging="never",
+ )
+
+ demo.launch()
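
A minimal sketch for sanity-checking the new transcribe() helper outside the Gradio UI, assuming the examples/example.flac clip listed under examples above is available locally (illustrative only, not part of the committed app.py):

    if __name__ == "__main__":
        # Hypothetical smoke test: run the pipeline once without timestamps
        # and once with them, printing both results.
        print(transcribe("examples/example.flac", return_timestamps=False))
        print(transcribe("examples/example.flac", return_timestamps=True))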