micahg committed
Commit 4dc8f4b
1 Parent(s): ba99dff

Initial commit

Files changed (5)
  1. .env +10 -0
  2. .gitignore +1 -0
  3. app.py +38 -0
  4. environment.py +18 -0
  5. functions.py +111 -0
.env ADDED
@@ -0,0 +1,10 @@
+ # multilingual, english-only
+ DEFAULT_MODEL_LANGUAGE=multilingual
+ # checkpoint
+ DEFAULT_MODEL=model/checkpoint-3520
+ # auto, english, german, ...
+ DEFAULT_LANGUAGE=bengali
+ # relative or full path
+ OUTPUT_PATH=./temp_dir
+ # -1 (CPU), 0 (first GPU)
+ DEVICE=0
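
Since environment.py below loads this file with python-dotenv, and `load_dotenv()` defaults to `override=False`, any variable already set in the process environment takes precedence over the `.env` value. A small sketch of overriding one value for a single run (illustrative only):

```python
import os

# Pre-set variables win over .env because load_dotenv() does not override them.
os.environ["DEVICE"] = "-1"   # force CPU for this run

import environment            # picks up DEVICE=-1 instead of the .env value
```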
.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__/
app.py ADDED
@@ -0,0 +1,38 @@
+ import gradio as gr
+ from functions import transcribe_eaf
+
+ with gr.Blocks(title="Rohingya EAF Transcriber") as page:
+     gr.Markdown("## Rohingya EAF Transcriber")
+     with gr.Row():
+         input_eaf_file = gr.File(
+             label="Upload EAF File",
+             file_count="single",
+             file_types=['.eaf'],
+             type="filepath"
+         )
+         input_audio_file = gr.File(
+             label="Upload Audio File",
+             file_count="single",
+             file_types=['audio'],
+             type="filepath"
+         )
+     with gr.Row():
+         tier = gr.Textbox(
+             label="Enter the Tier Type"
+         )
+     with gr.Row():
+         submit_button = gr.Button(
+             "Transcribe"
+         )
+     with gr.Row():
+         output = gr.File(
+             label="Transcribed EAF File"
+         )
+
+     submit_button.click(
+         fn=transcribe_eaf,
+         inputs=[input_eaf_file, input_audio_file, tier],
+         outputs=output
+     )
+
+ page.launch()
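
`page.launch()` serves the UI locally with Gradio's defaults. As a usage note, `launch()` also accepts keyword arguments such as `server_port` and `share` for a fixed port or a temporary public link (a sketch, not part of this commit):

```python
# Alternative launch: fixed port plus a temporary public share URL.
page.launch(server_port=7860, share=True)
```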
environment.py ADDED
@@ -0,0 +1,18 @@
+ from os import getenv, chdir
+ from os.path import dirname
+ from tempfile import gettempdir
+ from pathlib import Path
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ # switch to the project directory
+ PROJECT_PATH = dirname(__file__)
+ chdir(PROJECT_PATH)
+
+ # load environment variables
+ DEFAULT_MODEL_LANGUAGE = getenv("DEFAULT_MODEL_LANGUAGE")
+ DEFAULT_MODEL = getenv("DEFAULT_MODEL")
+ DEFAULT_LANGUAGE = getenv("DEFAULT_LANGUAGE")
+ TEMP_PATH = Path(gettempdir())
+ OUTPUT_PATH = Path(getenv("OUTPUT_PATH"))
+ DEVICE = int(getenv("DEVICE"))
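
One caveat: `getenv()` returns `None` for unset variables, so `int(getenv("DEVICE"))` and `Path(getenv("OUTPUT_PATH"))` raise a `TypeError` when the `.env` file is absent. A minimal sketch with fallback defaults (the defaults are assumptions mirroring the committed `.env`):

```python
from os import getenv
from pathlib import Path

# Fall back to CPU and a local temp directory when variables are unset.
OUTPUT_PATH = Path(getenv("OUTPUT_PATH", "./temp_dir"))
DEVICE = int(getenv("DEVICE", "-1"))
```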
functions.py ADDED
@@ -0,0 +1,111 @@
+ # coding=utf8
+ import os
+ import shutil
+ import subprocess
+ import xml.etree.ElementTree as ET
+ from transformers import pipeline
+ from environment import DEFAULT_MODEL, DEFAULT_LANGUAGE, DEVICE
+
+ # class for annotation segments
+ class Segment:
+     def __init__(self, segment_id: str, start: int, end: int):
+         self.segment_id: str = segment_id
+         self.start: int = start
+         self.end: int = end
+         self.transcription: str = ''
+
+ # maps TIME_SLOT_ID -> TIME_VALUE (milliseconds, as strings)
+ def getTimeSlots(eaf):
+     time_slot_dic = {}
+
+     order = eaf.find('TIME_ORDER')
+     for slot in order:
+         time_slot_dic[slot.get('TIME_SLOT_ID')] = slot.get('TIME_VALUE')
+
+     return time_slot_dic
+
+ # collects the time-aligned annotations of every tier with the given linguistic type
+ def getAnnotationSegments(eaf, tier_type):
+     segment_list = []
+
+     time_slot_dic = getTimeSlots(eaf)
+
+     for tier in eaf.findall('TIER'):
+         if tier.get('LINGUISTIC_TYPE_REF') == tier_type:
+             for annotation in tier:
+                 alignable_annotation = annotation.find('ALIGNABLE_ANNOTATION')
+                 segment_id = alignable_annotation.get('ANNOTATION_ID')
+                 start = time_slot_dic[alignable_annotation.get('TIME_SLOT_REF1')]
+                 end = time_slot_dic[alignable_annotation.get('TIME_SLOT_REF2')]
+                 segment_list.append(Segment(segment_id, start, end))
+
+     return segment_list
+
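
For readers unfamiliar with ELAN's EAF format: `getTimeSlots` reads millisecond time values from `TIME_ORDER`, and `getAnnotationSegments` pairs them with the `ALIGNABLE_ANNOTATION` elements of matching tiers. A minimal sketch of the structure these parsers expect, exercised directly (tier and ID names are made up; importing `functions` assumes the project's dependencies are installed):

```python
import xml.etree.ElementTree as ET
from functions import getTimeSlots, getAnnotationSegments

# A stripped-down EAF document with a single 0-1500 ms annotation.
sample = """<ANNOTATION_DOCUMENT>
  <TIME_ORDER>
    <TIME_SLOT TIME_SLOT_ID="ts1" TIME_VALUE="0"/>
    <TIME_SLOT TIME_SLOT_ID="ts2" TIME_VALUE="1500"/>
  </TIME_ORDER>
  <TIER LINGUISTIC_TYPE_REF="default-lt" TIER_ID="speaker">
    <ANNOTATION>
      <ALIGNABLE_ANNOTATION ANNOTATION_ID="a1" TIME_SLOT_REF1="ts1" TIME_SLOT_REF2="ts2">
        <ANNOTATION_VALUE/>
      </ALIGNABLE_ANNOTATION>
    </ANNOTATION>
  </TIER>
</ANNOTATION_DOCUMENT>"""

root = ET.fromstring(sample)
print(getTimeSlots(root))                       # {'ts1': '0', 'ts2': '1500'}
for seg in getAnnotationSegments(root, "default-lt"):
    print(seg.segment_id, seg.start, seg.end)   # a1 0 1500 (times are strings)
```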
+ def splice_audio(audio_path, segment_id, start, end, temp_dir):
+     file_path = f"{temp_dir}/{segment_id}.wav"
+
+     if os.path.exists(file_path):
+         os.remove(file_path)
+
+     # EAF time values are milliseconds; ffmpeg's -ss/-to expect seconds
+     subprocess.call([
+         "ffmpeg",
+         "-loglevel", "fatal",
+         "-hide_banner",
+         "-nostdin",
+         "-i", audio_path,
+         "-ss", f"{int(start)/1000}",
+         "-to", f"{int(end)/1000}",
+         file_path
+     ])
+     return file_path
+
+
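
Note that `start` and `end` arrive as millisecond strings straight from the EAF's `TIME_VALUE` attributes, hence the `int(start)/1000` conversion to the seconds that ffmpeg's `-ss`/`-to` flags expect. A hypothetical call (file names are placeholders; requires ffmpeg on the PATH and an existing `temp_dir`):

```python
from functions import splice_audio

# Cuts 0.0 s - 1.5 s of session.wav into temp_dir/a1.wav and returns that path.
clip_path = splice_audio("session.wav", "a1", "0", "1500", "temp_dir")
```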
+ # transcribes a single audio file and returns the transcription
+ def transcribe_audio(model_id, audio_path):
+     # note: the pipeline (and model checkpoint) is rebuilt on every call
+     transcribe = pipeline(
+         task="automatic-speech-recognition",
+         model=model_id,
+         chunk_length_s=30,
+         device=DEVICE,
+     )
+
+     # force decoding in the configured language instead of auto-detection
+     transcribe.model.config.forced_decoder_ids = transcribe.tokenizer.get_decoder_prompt_ids(language=DEFAULT_LANGUAGE, task="transcribe")
+
+     result = transcribe(audio_path, max_new_tokens=448)
+
+     transcription = result["text"].strip()
+
+     return transcription
+
+
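
Because `transcribe_audio` constructs a fresh `pipeline` on every call, the checkpoint is reloaded once per segment. A minimal sketch of one way to amortize that cost, assuming a single `model_id` per run (`get_asr_pipeline` is a hypothetical helper, not part of this commit):

```python
from functools import lru_cache
from transformers import pipeline
from environment import DEVICE

@lru_cache(maxsize=1)
def get_asr_pipeline(model_id):
    # Load the model once; later calls with the same id reuse the instance.
    return pipeline(
        task="automatic-speech-recognition",
        model=model_id,
        chunk_length_s=30,
        device=DEVICE,
    )
```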
+ def transcribe_eaf(eaf_path, audio_path, tier_type):
+
+     eaf_tree = ET.parse(eaf_path)
+     eaf_root = eaf_tree.getroot()
+
+     segment_list = getAnnotationSegments(eaf_root, tier_type)
+
+     if not os.path.exists('temp_dir'):
+         os.makedirs('temp_dir')
+
+     for segment in segment_list:
+         # splice out this segment's audio, transcribe it, then drop the clip
+         segment_audio_file = splice_audio(audio_path, segment.segment_id, segment.start, segment.end, 'temp_dir')
+         segment.transcription = transcribe_audio(DEFAULT_MODEL, segment_audio_file)
+         os.remove(segment_audio_file)
+
+         print(f'{segment.segment_id}\t{segment.transcription}')
+     shutil.rmtree('temp_dir')
+
+     for segment in segment_list:
+         for e in eaf_root.iter():
+             if e.tag == 'ALIGNABLE_ANNOTATION' and e.get('ANNOTATION_ID') == segment.segment_id:
+                 e.find('ANNOTATION_VALUE').text = segment.transcription
+
+     new_eaf_path = f'{eaf_path[:-4]}_autotranscribed.eaf'
+     eaf_tree.write(new_eaf_path, encoding='utf-8', xml_declaration=True)
+
+     return new_eaf_path
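
For reference, the same entry point the Gradio app wires up can be driven directly; a hypothetical invocation (paths and tier type are placeholders):

```python
from functions import transcribe_eaf

result = transcribe_eaf("session.eaf", "session.wav", "default-lt")
print(result)   # session_autotranscribed.eaf, written next to the input
```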