"""BentoML runnable that zips Word documents showing transcription changes."""

import io
import os.path
import tempfile
import zipfile
from pathlib import Path

import bentoml
from docx.enum.text import WD_COLOR_INDEX

# Audio extensions removed from the source file name to build the document name.
_AUDIO_SUFFIXES = (".mp3", ".wav")


def _strip_audio_suffix(filename):
    """Return *filename* without a trailing ``.mp3`` or ``.wav`` extension."""
    for suffix in _AUDIO_SUFFIXES:
        if filename.endswith(suffix):
            return filename.removesuffix(suffix)
    return filename


class TranscriptionZipper(bentoml.Runnable):
    """Runnable that bundles one Word diff document per transcription into a zip."""

    SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
    SUPPORTS_CPU_MULTI_THREADING = True

    @bentoml.Runnable.method(batchable=False)
    def zip_transcription(self, transcription_list):
        """Build a zip archive holding one ``.docx`` per transcription entry.

        Args:
            transcription_list: Iterable of indexable entries where item 0 is
                the original file name, item 1 the old text and item 2 the
                new text (they are passed to ``create_word_content`` in that
                order).

        Returns:
            The zip archive as ``bytes``. An empty input yields a valid, empty
            archive.
        """
        zip_buffer = io.BytesIO()
        # Open the archive once; everything stays in memory so no .docx files
        # are left behind in the working directory.
        with zipfile.ZipFile(zip_buffer, "w") as zip_file:
            for entry in transcription_list:
                base_name = _strip_audio_suffix(entry[0])
                document = create_word_content(base_name, entry[1], entry[2])

                docx_buffer = io.BytesIO()
                document.save(docx_buffer)

                # Archive member names must be relative paths.
                member_name = (base_name + ".docx").lstrip("/")
                zip_file.writestr(member_name, docx_buffer.getvalue())

        # Return the zip file as bytes
        return zip_buffer.getvalue()


def _add_words(paragraph, words, *, strike=False, highlight=False):
    """Append *words* to *paragraph* as one space-prefixed run.

    Nothing is added when the joined text is empty, so no stray formatted
    whitespace ends up in the document.
    """
    text = " ".join(words)
    if not text:
        return
    run = paragraph.add_run(" " + text)
    if strike:
        run.font.strike = True
    if highlight:
        run.font.highlight_color = WD_COLOR_INDEX.YELLOW


def create_word_content(filename, old_content, new_content):
    """Create a Word document that marks the differences between two texts.

    Both texts are split on single spaces and walked in parallel. Words that
    match are written plainly, words only in the old text are struck through
    and words only in the new text are highlighted in yellow.

    Args:
        filename: Text used for the level-1 heading of the document.
        old_content: The original text.
        new_content: The changed text.

    Returns:
        The populated ``docx`` document object (not yet saved).
    """
    from docx import Document

    document = Document()
    document.add_heading(filename, 1)
    paragraph = document.add_paragraph()
    paragraph.add_run().add_break()

    old_words = old_content.split(" ")
    new_words = new_content.split(" ")
    old_vocab = set(old_words)
    # Words of the new text that also occur in the old text, in new-text
    # order; the first one is used as the next re-synchronisation anchor.
    common = [word for word in new_words if word in old_vocab]

    while new_words:
        try:
            if new_words[0] == old_words[0]:
                paragraph.add_run(" " + new_words[0])
                common.pop(0)
                old_words.pop(0)
                new_words.pop(0)
            else:
                # Skip ahead in both texts to the next shared word; what is
                # skipped in the old text was removed, in the new text added.
                old_pos = old_words.index(common[0])
                new_pos = new_words.index(common[0])
                _add_words(paragraph, old_words[:old_pos], strike=True)
                _add_words(paragraph, new_words[:new_pos], highlight=True)
                del old_words[:old_pos]
                del new_words[:new_pos]
        except (IndexError, ValueError):
            # IndexError: the old text or the anchor list is exhausted.
            # ValueError: the anchor is no longer present in a remainder.
            # Either way there is nothing left to align on, so emit the rest.
            _add_words(paragraph, old_words, strike=True)
            _add_words(paragraph, new_words, highlight=True)
            return document

    # The new text is consumed; any old words left were deleted at the end.
    _add_words(paragraph, old_words, strike=True)
    return document


def create_content(old_content, new_content):
    """Return a plain-text merge of two texts using the same word alignment.

    Matching words appear once; where the texts differ, the old words are
    followed by the new words. The result carries no markup, so removed and
    added words are not distinguishable in the returned string.

    Args:
        old_content: The original text.
        new_content: The changed text.

    Returns:
        The merged text as a single string.
    """
    old_words = old_content.split(" ")
    new_words = new_content.split(" ")
    old_vocab = set(old_words)
    common = [word for word in new_words if word in old_vocab]

    pieces = []
    while new_words:
        try:
            if new_words[0] == old_words[0]:
                pieces.append(" " + new_words[0])
                common.pop(0)
                old_words.pop(0)
                new_words.pop(0)
            else:
                old_pos = old_words.index(common[0])
                new_pos = new_words.index(common[0])
                pieces.append(" " + " ".join(old_words[:old_pos]) + " ")
                pieces.append(" ".join(new_words[:new_pos]))
                del old_words[:old_pos]
                del new_words[:new_pos]
        except (IndexError, ValueError):
            # No further alignment possible: append both remainders.
            pieces.append(" " + " ".join(old_words) + " ")
            pieces.append(" " + " ".join(new_words))
            return "".join(pieces)

    # The new text is consumed; keep old words that were deleted at the end.
    if old_words:
        pieces.append(" " + " ".join(old_words))
    return "".join(pieces)