import streamlit as st
import sparknlp
import os
import pandas as pd
import librosa

from sparknlp.base import *
from sparknlp.common import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.types import *
import pyspark.sql.functions as F

# Page configuration
st.set_page_config(
    layout="wide", 
    initial_sidebar_state="auto"
)

# Custom CSS for styling
st.markdown("""
    <style>
        .main-title {
            font-size: 36px;
            color: #4A90E2;
            font-weight: bold;
            text-align: center;
        }
        .section {
            background-color: #f9f9f9;
            padding: 10px;
            border-radius: 10px;
            margin-top: 10px;
        }
        .section p, .section ul {
            color: #666666;
        }
    </style>
""", unsafe_allow_html=True)

@st.cache_resource
def init_spark():
    """Initialize Spark NLP."""
    return sparknlp.start()
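# Note (illustrative, not used by this demo): sparknlp.start() accepts optional
# parameters, e.g. sparknlp.start(gpu=True) for GPU builds or
# sparknlp.start(memory="8g") to size the driver memory; the defaults suffice here.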

@st.cache_resource
def create_pipeline(model):
    """Create a Spark NLP pipeline for audio processing."""
    audio_assembler = AudioAssembler() \
        .setInputCol("audio_content") \
        .setOutputCol("audio_assembler")

    speech_to_text = Wav2Vec2ForCTC \
        .pretrained(model) \
        .setInputCols("audio_assembler") \
        .setOutputCol("text")

    pipeline = Pipeline(stages=[
        audio_assembler,
        speech_to_text
    ])
    return pipeline
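# Usage sketch (illustrative only): besides the LightPipeline path used below,
# the same pipeline can transform a Spark DataFrame that has an "audio_content"
# array-of-floats column directly:
#
#   fitted = create_pipeline(model).fit(spark_df)
#   fitted.transform(spark_df).select("text.result").show(truncate=False)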

def fit_data(pipeline, fed_data):
    """Run the pipeline on an audio file and return the annotated result."""
    # Resample to 16 kHz, the rate Wav2Vec2 models are trained on.
    data, sampling_rate = librosa.load(fed_data, sr=16000)
    data = data.tolist()
    # Relies on the module-level `spark` session created before this is called.
    spark_df = spark.createDataFrame([[data]], ["audio_content"])

    model = pipeline.fit(spark_df)
    lp = LightPipeline(model)
    lp_result = lp.fullAnnotate(data)[0]
    return lp_result
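# fullAnnotate returns one dict per input; output['text'] is a list of
# Annotation objects whose .result holds the transcription string (read by the
# rendering code at the bottom of this script). Illustrative shape, assuming a
# short clip: {'audio_assembler': [...], 'text': [Annotation(document, 0, 10, 'HELLO WORLD', ...)]}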

def save_uploadedfile(uploadedfile, path):
    """Save the uploaded file to the specified path."""
    filepath = os.path.join(path, uploadedfile.name)
    with open(filepath, "wb") as f:
        if hasattr(uploadedfile, 'getbuffer'):
            f.write(uploadedfile.getbuffer())
        else:
            f.write(uploadedfile.read())

# Sidebar content
model_list = [
    "asr_wav2vec2_large_xlsr_53_english_by_jonatasgrosman", 
    "asr_wav2vec2_base_100h_13K_steps",
    "asr_wav2vec2_base_100h_ngram",
    "asr_wav2vec2_base_100h_by_facebook",
    "asr_wav2vec2_base_100h_test",
    "asr_wav2vec2_base_960h"
]

model = st.sidebar.selectbox(
    "Choose the pretrained model",
    model_list,
    help="For more info about the models visit: https://sparknlp.org/models"
)

# Main content
st.markdown('<div class="main-title">Speech Recognition With Wav2Vec2ForCTC</div>', unsafe_allow_html=True)
st.markdown('<div class="section"><p>This demo transcribes audio files into texts using the <code>Wav2Vec2ForCTC</code> Annotator and advanced speech recognition models.</p></div>', unsafe_allow_html=True)

# Reference notebook link in sidebar
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown("""
    <a href="https://githubtocolab.com/JohnSnowLabs/spark-nlp-workshop/blob/master/open-source-nlp/17.0.Automatic_Speech_Recognition_Wav2Vec2.ipynb">
        <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
    </a>
""", unsafe_allow_html=True)

# Load examples
AUDIO_FILE_PATH = "inputs"
audio_files = sorted(os.listdir(AUDIO_FILE_PATH))

selected_audio = st.selectbox("Select an audio file", audio_files)

# Audio file types accepted by the uploader (MIDI excluded: .mid/.midi files
# store note events rather than waveforms, so librosa cannot decode them)
audio_file_types = ["mp3", "flac", "wav", "aac", "ogg", "aiff", "wma", "m4a", "ape", "dsf", "dff", "opus", "amr"]
uploadedfile = st.file_uploader("Try it for yourself!", type=audio_file_types)

if uploadedfile:
    selected_audio = f"{AUDIO_FILE_PATH}/{uploadedfile.name}"
    save_uploadedfile(uploadedfile, AUDIO_FILE_PATH)
elif selected_audio:
    selected_audio = f"{AUDIO_FILE_PATH}/{selected_audio}"

# Audio playback and transcription
st.subheader("Play Audio")

with open(selected_audio, 'rb') as audio_file:
    audio_bytes = audio_file.read()
st.audio(audio_bytes)

spark = init_spark()
pipeline = create_pipeline(model)
output = fit_data(pipeline, selected_audio)

st.subheader(f"Transcription:")
st.markdown(f"{(output['text'][0].result).title()}")