abduaziz committed (verified)
Commit f122ddf · 1 Parent(s): 071e2a6

Upload folder using huggingface_hub

Files changed (4)
  1. README.md +2 -8
  2. app.py +29 -0
  3. pipe.py +135 -0
  4. requirements.txt +7 -0
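
The commit message above points to huggingface_hub's folder-upload API. A minimal sketch of that call, assuming a hypothetical Space id abduaziz/stt_ner and the current directory as the upload source (neither is stated in the commit itself):

from huggingface_hub import HfApi

api = HfApi()  # picks up the token from `huggingface-cli login` or the HF_TOKEN variable
api.upload_folder(
    folder_path=".",              # local folder holding app.py, pipe.py, requirements.txt, README.md
    repo_id="abduaziz/stt_ner",   # assumed Space id, not stated in the commit
    repo_type="space",
)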
README.md CHANGED
@@ -1,12 +1,6 @@
  ---
- title: Stt Ner
- emoji: 🐨
- colorFrom: indigo
- colorTo: blue
+ title: stt_ner
+ app_file: app.py
  sdk: gradio
  sdk_version: 5.8.0
- app_file: app.py
- pinned: false
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,33 @@
+ import gradio as gr
+ import os
+ from pipe import process_audio_pipeline, AudioSpeechNERPipeline
+ from huggingface_hub import login
+
+ def create_gradio_interface():
+     # Build the Gradio interface around the pipeline entry point from pipe.py
+     iface = gr.Interface(
+         fn=process_audio_pipeline,
+         inputs=gr.Audio(type="filepath", label="Upload Audio"),
+         outputs=[
+             gr.Textbox(label="Transcription"),
+             gr.HTML(label="Named Entities")  # entities arrive as an HTML table from create_ner_html
+         ],
+         title="Uzbek Speech Recognition and Named Entity Recognition",
+         description="Upload an Uzbek audio file (MP3 or WAV) to transcribe and extract named entities."
+     )
+     return iface
+
+ def main():
+     # Create and launch the Gradio interface
+     demo = create_gradio_interface()
+     demo.launch(share=True)
+
+ if __name__ == "__main__":
+     # Log in to the Hub only when a token is provided (os.getenv returns None otherwise,
+     # and writing None into os.environ would raise a TypeError)
+     hf_token = os.getenv("HUGGINGFACE_TOKEN")
+     if hf_token:
+         login(token=hf_token)
+     # Instantiate once up front so both models are downloaded before the UI starts
+     AudioSpeechNERPipeline()
+     main()
pipe.py ADDED
@@ -0,0 +1,140 @@
+ import os
+ import librosa
+ from transformers import pipeline
+
+ # Mapping from the NER model's numeric label ids to human-readable tags
+ labels = {0: 'O',
+           1: 'B-DATE',
+           2: 'B-EVENT',
+           3: 'B-LOC',
+           4: 'B-ORG',
+           5: 'B-PER',
+           6: 'I-DATE',
+           7: 'I-EVENT',
+           8: 'I-LOC',
+           9: 'I-ORG',
+           10: 'I-PER'}
+
+ class AudioSpeechNERPipeline:
+     def __init__(self,
+                  stt_model_name='abduaziz/whisper-small-uz',
+                  ner_model_name='abduaziz/bert-ner-uz',
+                  stt_language='uz'):
+         # Initialize Speech-to-Text pipeline with timestamp support
+         self.stt_pipeline = pipeline(
+             task="automatic-speech-recognition",
+             model=stt_model_name,
+             return_timestamps=True  # Enable timestamp support
+         )
+         # Initialize NER pipeline
+         self.ner_pipeline = pipeline(
+             task="ner",
+             model=ner_model_name
+         )
+
+     def chunk_audio(self, audio_path, chunk_duration=30):
+         """
+         Chunk long audio files into fixed-length segments (30 s by default)
+         """
+         # Load audio file at 16 kHz, the sampling rate Whisper expects
+         audio, sample_rate = librosa.load(audio_path, sr=16000)
+
+         # Calculate chunk size in samples
+         chunk_samples = int(chunk_duration * sample_rate)
+
+         # Create chunks
+         chunks = []
+         for start in range(0, len(audio), chunk_samples):
+             chunk = audio[start:start + chunk_samples]
+             chunks.append({
+                 'array': chunk,
+                 'sampling_rate': 16000
+             })
+
+         return chunks
+
+     def transcribe_audio(self, audio_path):
+         """
+         Handle audio transcription, chunking files longer than 30 seconds
+         """
+         # Check audio length
+         audio, sample_rate = librosa.load(audio_path, sr=16000)
+
+         # If audio is longer than 30 seconds, chunk it
+         if len(audio) / sample_rate > 30:
+             audio_chunks = self.chunk_audio(audio_path)
+             transcriptions = []
+
+             for chunk in audio_chunks:
+                 # Transcribe each chunk
+                 chunk_transcription = self.stt_pipeline(chunk)
+                 transcriptions.append(chunk_transcription['text'])
+
+             # Combine transcriptions
+             full_transcription = " ".join(transcriptions)
+         else:
+             # Process short files in one pass
+             full_transcription = self.stt_pipeline({
+                 'array': audio,
+                 'sampling_rate': 16000
+             })['text']
+
+         return full_transcription
+
+     def process_audio(self, audio_path):
+         # Transcribe audio
+         transcription = self.transcribe_audio(audio_path)
+
+         # Extract named entities from the transcription
+         entities = self.ner_pipeline(transcription)
+
+         return {
+             'filename': os.path.basename(audio_path),
+             'transcription': transcription,
+             'entities': entities
+         }
+
+ def create_ner_html(entities):
+     """
+     Create an HTML table representation of named entities
+     """
+     if not entities:
+         return "No named entities found."
+
+     html = "<div style='background-color:#f0f0f0; padding:10px; border-radius:5px;'>"
+     html += "<h3>Named Entities:</h3>"
+     html += "<table style='width:100%; border-collapse:collapse;'>"
+     html += "<tr><th style='border:1px solid #ddd; padding:8px;'>Word</th><th style='border:1px solid #ddd; padding:8px;'>Entity Type</th></tr>"
+
+     for entity in entities:
+         # Model outputs look like "LABEL_5"; map the numeric id to its tag name
+         new_entity = labels[int(entity['entity'].split("_")[-1])]
+         html += f"<tr>" \
+                 f"<td style='border:1px solid #ddd; padding:8px;'>{entity['word']}</td>" \
+                 f"<td style='border:1px solid #ddd; padding:8px;'>{new_entity}</td>" \
+                 f"</tr>"
+
+     html += "</table></div>"
+     return html
+
+ def process_audio_pipeline(audio):
+     """
+     Gradio interface function to process audio
+     """
+     # Initialize the combined STT + NER pipeline
+     # (named audio_pipeline to avoid shadowing transformers.pipeline)
+     audio_pipeline = AudioSpeechNERPipeline()
+
+     try:
+         # Process the audio; process_audio returns a dict, so pull out the fields we need
+         result = audio_pipeline.process_audio(audio)
+         transcription = result['transcription']
+         entities = result['entities']
+
+         # Create HTML for entities
+         entities_html = create_ner_html(entities)
+
+         return transcription, entities_html
+
+     except Exception as e:
+         return f"Error processing audio: {str(e)}", ""
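
pipe.py also works outside the Gradio UI. A minimal sketch of calling it directly, where "sample.wav" is a placeholder path to a local Uzbek audio file:

from pipe import AudioSpeechNERPipeline, create_ner_html

# Instantiating the class downloads both Hub models on first use
asr_ner = AudioSpeechNERPipeline()

# process_audio returns a dict with 'filename', 'transcription' and 'entities'
result = asr_ner.process_audio("sample.wav")
print(result["transcription"])
print(create_ner_html(result["entities"]))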
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ transformers
+ seqeval
+ accelerate
+ soundfile
+ librosa
+ gradio
+ huggingface_hub