micahg committed
Commit 4dc8f4b
1 Parent(s): ba99dff

Initial commit

Files changed (5)
  1. .env +10 -0
  2. .gitignore +1 -0
  3. app.py +38 -0
  4. environment.py +18 -0
  5. functions.py +111 -0
.env ADDED
@@ -0,0 +1,10 @@
+ # multilingual, english-only
+ DEFAULT_MODEL_LANGUAGE=multilingual
+ # checkpoint
+ DEFAULT_MODEL=model/checkpoint-3520
+ # auto, english, german, ...
+ DEFAULT_LANGUAGE=bengali
+ # relative or full path
+ OUTPUT_PATH=./temp_dir
+ # -1 (CPU), 0 (first GPU)
+ DEVICE=0
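
Since environment.py below loads this file with python-dotenv, and `load_dotenv()` defaults to `override=False`, any variable already set in the process environment takes precedence over the `.env` value. A small sketch of overriding one value for a single run (illustrative only):

```python
import os

# Pre-set variables win over .env because load_dotenv() does not override them.
os.environ["DEVICE"] = "-1"   # force CPU for this run

import environment            # picks up DEVICE=-1 instead of the .env value
```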
.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__/
app.py ADDED
@@ -0,0 +1,38 @@
+ import gradio as gr
+ from functions import transcribe_eaf
+
+ with gr.Blocks(title="Rohingya EAF Transcriber") as page:
+     gr.Markdown("## Rohingya EAF Transcriber")
+     with gr.Row():
+         input_eaf_file = gr.File(
+             label="Upload EAF File",
+             file_count="single",
+             file_types=['.eaf'],
+             type="filepath"
+         )
+         input_audio_file = gr.File(
+             label="Upload Audio File",
+             file_count="single",
+             file_types=['audio'],
+             type="filepath"
+         )
+     with gr.Row():
+         tier = gr.Textbox(
+             label="Enter the Tier Type"
+         )
+     with gr.Row():
+         submit_button = gr.Button(
+             "Transcribe"
+         )
+     with gr.Row():
+         output = gr.File(
+             label="Transcribed EAF File"
+         )
+
+     submit_button.click(
+         fn=transcribe_eaf,
+         inputs=[input_eaf_file, input_audio_file, tier],
+         outputs=output
+     )
+
+ page.launch()
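
`page.launch()` serves the UI locally with Gradio's defaults. As a usage note, `launch()` also accepts keyword arguments such as `server_port` and `share` for a fixed port or a temporary public link (a sketch, not part of this commit):

```python
# Alternative launch: fixed port plus a temporary public share URL.
page.launch(server_port=7860, share=True)
```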
environment.py ADDED
@@ -0,0 +1,18 @@
+ from os import getenv, chdir
+ from os.path import dirname
+ from tempfile import gettempdir
+ from pathlib import Path
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ # switch to the project directory
+ PROJECT_PATH = dirname(__file__)
+ chdir(PROJECT_PATH)
+
+ # load environment variables
+ DEFAULT_MODEL_LANGUAGE = getenv("DEFAULT_MODEL_LANGUAGE")
+ DEFAULT_MODEL = getenv("DEFAULT_MODEL")
+ DEFAULT_LANGUAGE = getenv("DEFAULT_LANGUAGE")
+ TEMP_PATH = Path(gettempdir())
+ OUTPUT_PATH = Path(getenv("OUTPUT_PATH"))
+ DEVICE = int(getenv("DEVICE"))
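
One caveat: `getenv()` returns `None` for unset variables, so `int(getenv("DEVICE"))` and `Path(getenv("OUTPUT_PATH"))` raise a `TypeError` when the `.env` file is absent. A minimal sketch with fallback defaults (the defaults are assumptions mirroring the committed `.env`):

```python
from os import getenv
from pathlib import Path

# Fall back to CPU and a local temp directory when variables are unset.
OUTPUT_PATH = Path(getenv("OUTPUT_PATH", "./temp_dir"))
DEVICE = int(getenv("DEVICE", "-1"))
```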
functions.py ADDED
@@ -0,0 +1,111 @@
+ # coding=utf8
+ import os
+ import shutil
+ import subprocess
+ import xml.etree.ElementTree as ET
+ from transformers import pipeline
+ from environment import DEFAULT_MODEL, DEFAULT_LANGUAGE, DEVICE
+
+ # class for annotation segments
+ class Segment:
+     def __init__(self, segment_id: str, start: int, end: int):
+         self.segment_id: str = segment_id
+         self.start: int = start
+         self.end: int = end
+         self.transcription: str = ''
+
+ # maps TIME_SLOT_ID -> TIME_VALUE (milliseconds, as strings)
+ def getTimeSlots(eaf):
+     time_slot_dic = {}
+
+     order = eaf.find('TIME_ORDER')
+     for slot in order:
+         time_slot_dic[slot.get('TIME_SLOT_ID')] = slot.get('TIME_VALUE')
+
+     return time_slot_dic
+
+ # collects the time-aligned annotations of every tier with the given linguistic type
+ def getAnnotationSegments(eaf, tier_type):
+     segment_list = []
+
+     time_slot_dic = getTimeSlots(eaf)
+
+     for tier in eaf.findall('TIER'):
+         if tier.get('LINGUISTIC_TYPE_REF') == tier_type:
+             for annotation in tier:
+                 alignable_annotation = annotation.find('ALIGNABLE_ANNOTATION')
+                 segment_id = alignable_annotation.get('ANNOTATION_ID')
+                 start = time_slot_dic[alignable_annotation.get('TIME_SLOT_REF1')]
+                 end = time_slot_dic[alignable_annotation.get('TIME_SLOT_REF2')]
+                 segment_list.append(Segment(segment_id, start, end))
+
+     return segment_list
+
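
For readers unfamiliar with ELAN's EAF format: `getTimeSlots` reads millisecond time values from `TIME_ORDER`, and `getAnnotationSegments` pairs them with the `ALIGNABLE_ANNOTATION` elements of matching tiers. A minimal sketch of the structure these parsers expect, exercised directly (tier and ID names are made up; importing `functions` assumes the project's dependencies are installed):

```python
import xml.etree.ElementTree as ET
from functions import getTimeSlots, getAnnotationSegments

# A stripped-down EAF document with a single 0-1500 ms annotation.
sample = """<ANNOTATION_DOCUMENT>
  <TIME_ORDER>
    <TIME_SLOT TIME_SLOT_ID="ts1" TIME_VALUE="0"/>
    <TIME_SLOT TIME_SLOT_ID="ts2" TIME_VALUE="1500"/>
  </TIME_ORDER>
  <TIER LINGUISTIC_TYPE_REF="default-lt" TIER_ID="speaker">
    <ANNOTATION>
      <ALIGNABLE_ANNOTATION ANNOTATION_ID="a1" TIME_SLOT_REF1="ts1" TIME_SLOT_REF2="ts2">
        <ANNOTATION_VALUE/>
      </ALIGNABLE_ANNOTATION>
    </ANNOTATION>
  </TIER>
</ANNOTATION_DOCUMENT>"""

root = ET.fromstring(sample)
print(getTimeSlots(root))                       # {'ts1': '0', 'ts2': '1500'}
for seg in getAnnotationSegments(root, "default-lt"):
    print(seg.segment_id, seg.start, seg.end)   # a1 0 1500 (times are strings)
```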
+ def splice_audio(audio_path, segment_id, start, end, temp_dir):
+     file_path = f"{temp_dir}/{segment_id}.wav"
+
+     if os.path.exists(file_path):
+         os.remove(file_path)
+
+     # EAF time values are milliseconds; ffmpeg's -ss/-to expect seconds
+     subprocess.call([
+         "ffmpeg",
+         "-loglevel", "fatal",
+         "-hide_banner",
+         "-nostdin",
+         "-i", audio_path,
+         "-ss", f"{int(start)/1000}",
+         "-to", f"{int(end)/1000}",
+         file_path
+     ])
+     return file_path
+
+
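
Note that `start` and `end` arrive as millisecond strings straight from the EAF's `TIME_VALUE` attributes, hence the `int(start)/1000` conversion to the seconds that ffmpeg's `-ss`/`-to` flags expect. A hypothetical call (file names are placeholders; requires ffmpeg on the PATH and an existing `temp_dir`):

```python
from functions import splice_audio

# Cuts 0.0 s - 1.5 s of session.wav into temp_dir/a1.wav and returns that path.
clip_path = splice_audio("session.wav", "a1", "0", "1500", "temp_dir")
```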
+ # transcribes a single audio file and returns the transcription
+ def transcribe_audio(model_id, audio_path):
+     # note: the pipeline (and model checkpoint) is rebuilt on every call
+     transcribe = pipeline(
+         task="automatic-speech-recognition",
+         model=model_id,
+         chunk_length_s=30,
+         device=DEVICE,
+     )
+
+     # force decoding in the configured language instead of auto-detection
+     transcribe.model.config.forced_decoder_ids = transcribe.tokenizer.get_decoder_prompt_ids(language=DEFAULT_LANGUAGE, task="transcribe")
+
+     result = transcribe(audio_path, max_new_tokens=448)
+
+     transcription = result["text"].strip()
+
+     return transcription
+
+
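
Because `transcribe_audio` constructs a fresh `pipeline` on every call, the checkpoint is reloaded once per segment. A minimal sketch of one way to amortize that cost, assuming a single `model_id` per run (`get_asr_pipeline` is a hypothetical helper, not part of this commit):

```python
from functools import lru_cache
from transformers import pipeline
from environment import DEVICE

@lru_cache(maxsize=1)
def get_asr_pipeline(model_id):
    # Load the model once; later calls with the same id reuse the instance.
    return pipeline(
        task="automatic-speech-recognition",
        model=model_id,
        chunk_length_s=30,
        device=DEVICE,
    )
```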
+ def transcribe_eaf(eaf_path, audio_path, tier_type):
+
+     eaf_tree = ET.parse(eaf_path)
+     eaf_root = eaf_tree.getroot()
+
+     segment_list = getAnnotationSegments(eaf_root, tier_type)
+
+     if not os.path.exists('temp_dir'):
+         os.makedirs('temp_dir')
+
+     for segment in segment_list:
+         # splice out this segment's audio, transcribe it, then drop the clip
+         segment_audio_file = splice_audio(audio_path, segment.segment_id, segment.start, segment.end, 'temp_dir')
+         segment.transcription = transcribe_audio(DEFAULT_MODEL, segment_audio_file)
+         os.remove(segment_audio_file)
+
+         print(f'{segment.segment_id}\t{segment.transcription}')
+     shutil.rmtree('temp_dir')
+
+     for segment in segment_list:
+         for e in eaf_root.iter():
+             if e.tag == 'ALIGNABLE_ANNOTATION' and e.get('ANNOTATION_ID') == segment.segment_id:
+                 e.find('ANNOTATION_VALUE').text = segment.transcription
+
+     new_eaf_path = f'{eaf_path[:-4]}_autotranscribed.eaf'
+     eaf_tree.write(new_eaf_path, encoding='utf-8', xml_declaration=True)
+
+     return new_eaf_path
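
For reference, the same entry point the Gradio app wires up can be driven directly; a hypothetical invocation (paths and tier type are placeholders):

```python
from functions import transcribe_eaf

result = transcribe_eaf("session.eaf", "session.wav", "default-lt")
print(result)   # session_autotranscribed.eaf, written next to the input
```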