Spaces:

dmedhi
/

audio-transcriber

Sleeping

App Files Community

dmedhi committed on Jun 14, 2024

Commit

0e6eb34

1 Parent(s): f7b8f19

add app and dependencies

Browse files

Files changed (4) hide show

.gitignore +2 -0
app.py +41 -0
requirements.txt +107 -0
whisper.py +73 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # audio files
2	+ *.wav

app.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import streamlit as st
+from whisper import transcribe_audio
+def transcribe(audio_file):
+    """Transcribe ``audio_file`` by delegating to ``transcribe_audio``.
+    The argument is forwarded unchanged and the callee's result is returned
+    as-is.
+    """
+    transcription = transcribe_audio(audio_file)
+    return transcription
+def main():
+    """Render the Transcriber page.
+    The left column holds the audio uploader, an inline player for the
+    uploaded file and the "Run" button. The right column shows the
+    transcribed text once "Run" is pressed with a file uploaded, and an
+    informational hint otherwise.
+    """
+    st.set_page_config(page_title="Transcriber", page_icon="💬", layout="wide")
+    # The original markup had a stray ";" after the align attribute.
+    st.markdown(
+        """<h1 align="center">Transcriber</h1>""",
+        unsafe_allow_html=True,
+    )
+    upload_col, result_col = st.columns(2)
+    with upload_col:
+        with st.container(border=True, height=300):
+            audio_file = st.file_uploader(
+                label="Upload your audio",
+                type=["wav", "mp3"],
+                key="audio_file_uploader",
+            )
+            if audio_file:
+                st.audio(audio_file)
+        sub_btn = st.button("Run", key="sub_btn")
+    with result_col:
+        with st.container(border=True, height=400):
+            if sub_btn and audio_file:
+                # getvalue() returns the complete upload regardless of the
+                # stream position; read() only returns what is left after any
+                # earlier consumer (such as the player above) has moved it.
+                audio_bytes = audio_file.getvalue()
+                st.text_area(
+                    label="Transcribed text",
+                    value=transcribe(audio_bytes)["text"],
+                    height=350,
+                )
+            else:
+                st.info("Upload audio file", icon="💡")
+# Build the page only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,107 @@

+accelerate==0.31.0
+aiofiles==23.2.1
+altair==5.3.0
+annotated-types==0.7.0
+anyio==4.4.0
+attrs==23.2.0
+blinker==1.8.2
+cachetools==5.3.3
+certifi==2024.6.2
+charset-normalizer==3.3.2
+click==8.1.7
+contourpy==1.2.1
+cycler==0.12.1
+dnspython==2.6.1
+email_validator==2.1.1
+exceptiongroup==1.2.1
+fastapi==0.111.0
+fastapi-cli==0.0.4
+ffmpy==0.3.2
+filelock==3.15.1
+fonttools==4.53.0
+fsspec==2024.6.0
+gitdb==4.0.11
+GitPython==3.1.43
+h11==0.14.0
+httpcore==1.0.5
+httptools==0.6.1
+httpx==0.27.0
+huggingface-hub==0.23.3
+idna==3.7
+importlib_resources==6.4.0
+Jinja2==3.1.4
+jsonschema==4.22.0
+jsonschema-specifications==2023.12.1
+kiwisolver==1.4.5
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matplotlib==3.9.0
+mdurl==0.1.2
+mpmath==1.3.0
+networkx==3.3
+numpy==1.26.4
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==8.9.2.26
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.5.40
+nvidia-nvtx-cu12==12.1.105
+orjson==3.10.5
+packaging==24.1
+pandas==2.2.2
+pillow==10.3.0
+protobuf==4.25.3
+psutil==5.9.8
+pyarrow==16.1.0
+pydantic==2.7.4
+pydantic_core==2.18.4
+pydeck==0.9.1
+pydub==0.25.1
+Pygments==2.18.0
+pyparsing==3.1.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-multipart==0.0.9
+pytz==2024.1
+PyYAML==6.0.1
+referencing==0.35.1
+regex==2024.5.15
+requests==2.32.3
+rich==13.7.1
+rpds-py==0.18.1
+ruff==0.4.8
+safetensors==0.4.3
+semantic-version==2.10.0
+shellingham==1.5.4
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.1
+starlette==0.37.2
+streamlit==1.35.0
+sympy==1.12.1
+tenacity==8.3.0
+tokenizers==0.19.1
+toml==0.10.2
+tomlkit==0.12.0
+toolz==0.12.1
+torch==2.3.1
+tornado==6.4.1
+tqdm==4.66.4
+transformers==4.41.2
+triton==2.3.1
+typer==0.12.3
+typing_extensions==4.12.2
+tzdata==2024.1
+ujson==5.10.0
+urllib3==2.2.1
+uvicorn==0.30.1
+uvloop==0.19.0
+watchdog==4.0.1
+watchfiles==0.22.0
+websockets==11.0.3

whisper.py ADDED Viewed

	@@ -0,0 +1,73 @@

+import torch
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+class Whisper:
+    """Whisper - audio transcriber class.
+    Wraps a Hugging Face speech seq2seq model and its processor, and builds an
+    automatic-speech-recognition pipeline from them.
+    """
+    # Use the first CUDA device when one is available, otherwise the CPU.
+    device = "cuda:0" if torch.cuda.is_available() else "cpu"
+    # Half precision on GPU, full precision on CPU.
+    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+    def __init__(self, model_id: str = "openai/whisper-base") -> None:
+        """
+        Load the model and processor for ``model_id`` and move the model to
+        ``self.device``.
+        Args:
+            model_id (str): Identifier passed to ``from_pretrained``.
+        """
+        self.model_id = model_id
+        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
+            model_id,
+            torch_dtype=self.torch_dtype,
+            low_cpu_mem_usage=True,
+            use_safetensors=True,
+        )
+        self.model.to(self.device)
+        self.processor = AutoProcessor.from_pretrained(model_id)
+    @property
+    def model_name(self):
+        """
+        Getter method for retrieving the model name.
+        Returns the ``model_id`` given at construction time.
+        """
+        return self.model_id
+    def save(self, save_dir: str):
+        """
+        Saves the model and processor to the specified directory.
+        The model goes to ``<save_dir>/model`` and the processor to
+        ``<save_dir>/processor``.
+        Args:
+            save_dir (str): The directory where the model and processor will be saved.
+        """
+        self.model.save_pretrained(f"{save_dir}/model")
+        self.processor.save_pretrained(f"{save_dir}/processor")
+    def load(self, load_dir: str):
+        """
+        Load the model and processor from the specified directory.
+        Expects the ``<load_dir>/model`` and ``<load_dir>/processor`` layout
+        written by ``save``. ``model_id`` is left unchanged.
+        Args:
+            load_dir (str): The directory from which to load the model and processor.
+        """
+        # Pass the same dtype as __init__ so the reloaded model matches the
+        # torch_dtype that pipeline() hands to the ASR pipeline.
+        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
+            f"{load_dir}/model",
+            torch_dtype=self.torch_dtype,
+        )
+        self.processor = AutoProcessor.from_pretrained(f"{load_dir}/processor")
+        self.model.to(self.device)
+    def pipeline(self):
+        """
+        Build an automatic-speech-recognition pipeline around the current
+        model, using the processor's tokenizer and feature extractor.
+        Returns:
+            The pipeline object returned by ``transformers.pipeline``.
+        """
+        asr_pipeline = pipeline(
+            "automatic-speech-recognition",
+            model=self.model,
+            tokenizer=self.processor.tokenizer,
+            feature_extractor=self.processor.feature_extractor,
+            max_new_tokens=128,
+            chunk_length_s=15,
+            batch_size=16,
+            return_timestamps=True,
+            torch_dtype=self.torch_dtype,
+            device=self.device,
+        )
+        return asr_pipeline
+# ASR pipeline shared by all calls; created on first use so the model weights
+# are loaded once instead of on every transcription request.
+_ASR_PIPELINE = None
+def _get_asr_pipeline():
+    """Return the shared ASR pipeline, building it on the first call."""
+    global _ASR_PIPELINE
+    if _ASR_PIPELINE is None:
+        _ASR_PIPELINE = Whisper().pipeline()
+    return _ASR_PIPELINE
+def transcribe_audio(file):
+    """
+    Transcribe an audio input with the default Whisper pipeline.
+    Args:
+        file: Audio input handed straight to the ASR pipeline (e.g. the raw
+            bytes of an uploaded audio file).
+    Returns:
+        The pipeline's result object, unchanged.
+    """
+    asr_pipeline = _get_asr_pipeline()
+    return asr_pipeline(file)