abduaziz committed (verified)
Commit f122ddf · 1 Parent(s): 071e2a6

Upload folder using huggingface_hub

Files changed (4)
  1. README.md +2 -8
  2. app.py +29 -0
  3. pipe.py +135 -0
  4. requirements.txt +7 -0
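
The commit message above points to huggingface_hub's folder-upload API. A minimal sketch of that call, assuming a hypothetical Space id abduaziz/stt_ner and the current directory as the upload source (neither is stated in the commit itself):

from huggingface_hub import HfApi

api = HfApi()  # picks up the token from `huggingface-cli login` or the HF_TOKEN variable
api.upload_folder(
    folder_path=".",              # local folder holding app.py, pipe.py, requirements.txt, README.md
    repo_id="abduaziz/stt_ner",   # assumed Space id, not stated in the commit
    repo_type="space",
)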
README.md CHANGED
@@ -1,12 +1,6 @@
  ---
- title: Stt Ner
- emoji: 🐨
- colorFrom: indigo
- colorTo: blue
+ title: stt_ner
+ app_file: app.py
  sdk: gradio
  sdk_version: 5.8.0
- app_file: app.py
- pinned: false
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,33 @@
+ import gradio as gr
+ import os
+ from pipe import process_audio_pipeline, AudioSpeechNERPipeline
+ from huggingface_hub import login
+
+ def create_gradio_interface():
+     # Build the Gradio interface around the pipeline entry point from pipe.py
+     iface = gr.Interface(
+         fn=process_audio_pipeline,
+         inputs=gr.Audio(type="filepath", label="Upload Audio"),
+         outputs=[
+             gr.Textbox(label="Transcription"),
+             gr.HTML(label="Named Entities")  # entities arrive as an HTML table from create_ner_html
+         ],
+         title="Uzbek Speech Recognition and Named Entity Recognition",
+         description="Upload an Uzbek audio file (MP3 or WAV) to transcribe and extract named entities."
+     )
+     return iface
+
+ def main():
+     # Create and launch the Gradio interface
+     demo = create_gradio_interface()
+     demo.launch(share=True)
+
+ if __name__ == "__main__":
+     # Log in to the Hub only when a token is provided (os.getenv returns None otherwise,
+     # and writing None into os.environ would raise a TypeError)
+     hf_token = os.getenv("HUGGINGFACE_TOKEN")
+     if hf_token:
+         login(token=hf_token)
+     # Instantiate once up front so both models are downloaded before the UI starts
+     AudioSpeechNERPipeline()
+     main()
pipe.py ADDED
@@ -0,0 +1,140 @@
+ import os
+ import librosa
+ from transformers import pipeline
+
+ # Mapping from the NER model's numeric label ids to human-readable tags
+ labels = {0: 'O',
+           1: 'B-DATE',
+           2: 'B-EVENT',
+           3: 'B-LOC',
+           4: 'B-ORG',
+           5: 'B-PER',
+           6: 'I-DATE',
+           7: 'I-EVENT',
+           8: 'I-LOC',
+           9: 'I-ORG',
+           10: 'I-PER'}
+
+ class AudioSpeechNERPipeline:
+     def __init__(self,
+                  stt_model_name='abduaziz/whisper-small-uz',
+                  ner_model_name='abduaziz/bert-ner-uz',
+                  stt_language='uz'):
+         # Initialize Speech-to-Text pipeline with timestamp support
+         self.stt_pipeline = pipeline(
+             task="automatic-speech-recognition",
+             model=stt_model_name,
+             return_timestamps=True  # Enable timestamp support
+         )
+         # Initialize NER pipeline
+         self.ner_pipeline = pipeline(
+             task="ner",
+             model=ner_model_name
+         )
+
+     def chunk_audio(self, audio_path, chunk_duration=30):
+         """
+         Chunk long audio files into fixed-length segments (30 s by default)
+         """
+         # Load audio file at 16 kHz, the sampling rate Whisper expects
+         audio, sample_rate = librosa.load(audio_path, sr=16000)
+
+         # Calculate chunk size in samples
+         chunk_samples = int(chunk_duration * sample_rate)
+
+         # Create chunks
+         chunks = []
+         for start in range(0, len(audio), chunk_samples):
+             chunk = audio[start:start + chunk_samples]
+             chunks.append({
+                 'array': chunk,
+                 'sampling_rate': 16000
+             })
+
+         return chunks
+
+     def transcribe_audio(self, audio_path):
+         """
+         Handle audio transcription, chunking files longer than 30 seconds
+         """
+         # Check audio length
+         audio, sample_rate = librosa.load(audio_path, sr=16000)
+
+         # If audio is longer than 30 seconds, chunk it
+         if len(audio) / sample_rate > 30:
+             audio_chunks = self.chunk_audio(audio_path)
+             transcriptions = []
+
+             for chunk in audio_chunks:
+                 # Transcribe each chunk
+                 chunk_transcription = self.stt_pipeline(chunk)
+                 transcriptions.append(chunk_transcription['text'])
+
+             # Combine transcriptions
+             full_transcription = " ".join(transcriptions)
+         else:
+             # Process short files in one pass
+             full_transcription = self.stt_pipeline({
+                 'array': audio,
+                 'sampling_rate': 16000
+             })['text']
+
+         return full_transcription
+
+     def process_audio(self, audio_path):
+         # Transcribe audio
+         transcription = self.transcribe_audio(audio_path)
+
+         # Extract named entities from the transcription
+         entities = self.ner_pipeline(transcription)
+
+         return {
+             'filename': os.path.basename(audio_path),
+             'transcription': transcription,
+             'entities': entities
+         }
+
+ def create_ner_html(entities):
+     """
+     Create an HTML table representation of named entities
+     """
+     if not entities:
+         return "No named entities found."
+
+     html = "<div style='background-color:#f0f0f0; padding:10px; border-radius:5px;'>"
+     html += "<h3>Named Entities:</h3>"
+     html += "<table style='width:100%; border-collapse:collapse;'>"
+     html += "<tr><th style='border:1px solid #ddd; padding:8px;'>Word</th><th style='border:1px solid #ddd; padding:8px;'>Entity Type</th></tr>"
+
+     for entity in entities:
+         # Model outputs look like "LABEL_5"; map the numeric id to its tag name
+         new_entity = labels[int(entity['entity'].split("_")[-1])]
+         html += f"<tr>" \
+                 f"<td style='border:1px solid #ddd; padding:8px;'>{entity['word']}</td>" \
+                 f"<td style='border:1px solid #ddd; padding:8px;'>{new_entity}</td>" \
+                 f"</tr>"
+
+     html += "</table></div>"
+     return html
+
+ def process_audio_pipeline(audio):
+     """
+     Gradio interface function to process audio
+     """
+     # Initialize the combined STT + NER pipeline
+     # (named audio_pipeline to avoid shadowing transformers.pipeline)
+     audio_pipeline = AudioSpeechNERPipeline()
+
+     try:
+         # Process the audio; process_audio returns a dict, so pull out the fields we need
+         result = audio_pipeline.process_audio(audio)
+         transcription = result['transcription']
+         entities = result['entities']
+
+         # Create HTML for entities
+         entities_html = create_ner_html(entities)
+
+         return transcription, entities_html
+
+     except Exception as e:
+         return f"Error processing audio: {str(e)}", ""
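
pipe.py also works outside the Gradio UI. A minimal sketch of calling it directly, where "sample.wav" is a placeholder path to a local Uzbek audio file:

from pipe import AudioSpeechNERPipeline, create_ner_html

# Instantiating the class downloads both Hub models on first use
asr_ner = AudioSpeechNERPipeline()

# process_audio returns a dict with 'filename', 'transcription' and 'entities'
result = asr_ner.process_audio("sample.wav")
print(result["transcription"])
print(create_ner_html(result["entities"]))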
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ transformers
+ seqeval
+ accelerate
+ soundfile
+ librosa
+ gradio
+ huggingface_hub