"""Export a transcript to CSV, Markdown and DOCX files and bundle them in a zip."""

import datetime
import os
import subprocess
import tempfile

import pandas as pd
from jinja2 import Environment

# Language code -> English display name.
source_languages = {
    "en": "English",
    "zh": "Chinese",
    "de": "German",
    "es": "Spanish",
    "ru": "Russian",
    "ko": "Korean",
    "fr": "French",
    "ja": "Japanese",
    "pt": "Portuguese",
    "tr": "Turkish",
    "pl": "Polish",
    "ca": "Catalan",
    "nl": "Dutch",
    "ar": "Arabic",
    "sv": "Swedish",
    "it": "Italian",
    "id": "Indonesian",
    "hi": "Hindi",
    "fi": "Finnish",
    "vi": "Vietnamese",
    "he": "Hebrew",
    "uk": "Ukrainian",
    "el": "Greek",
    "ms": "Malay",
    "cs": "Czech",
    "ro": "Romanian",
    "da": "Danish",
    "hu": "Hungarian",
    "ta": "Tamil",
    "no": "Norwegian",
    "th": "Thai",
    "ur": "Urdu",
    "hr": "Croatian",
    "bg": "Bulgarian",
    "lt": "Lithuanian",
    "la": "Latin",
    "mi": "Maori",
    "ml": "Malayalam",
    "cy": "Welsh",
    "sk": "Slovak",
    "te": "Telugu",
    "fa": "Persian",
    "lv": "Latvian",
    "bn": "Bengali",
    "sr": "Serbian",
    "az": "Azerbaijani",
    "sl": "Slovenian",
    "kn": "Kannada",
    "et": "Estonian",
    "mk": "Macedonian",
    "br": "Breton",
    "eu": "Basque",
    "is": "Icelandic",
    "hy": "Armenian",
    "ne": "Nepali",
    "mn": "Mongolian",
    "bs": "Bosnian",
    "kk": "Kazakh",
    "sq": "Albanian",
    "sw": "Swahili",
    "gl": "Galician",
    "mr": "Marathi",
    "pa": "Punjabi",
    "si": "Sinhala",
    "km": "Khmer",
    "sn": "Shona",
    "yo": "Yoruba",
    "so": "Somali",
    "af": "Afrikaans",
    "oc": "Occitan",
    "ka": "Georgian",
    "be": "Belarusian",
    "tg": "Tajik",
    "sd": "Sindhi",
    "gu": "Gujarati",
    "am": "Amharic",
    "yi": "Yiddish",
    "lo": "Lao",
    "uz": "Uzbek",
    "fo": "Faroese",
    "ht": "Haitian creole",
    "ps": "Pashto",
    "tk": "Turkmen",
    "nn": "Nynorsk",
    "mt": "Maltese",
    "sa": "Sanskrit",
    "lb": "Luxembourgish",
    "my": "Myanmar",
    "bo": "Tibetan",
    "tl": "Tagalog",
    "mg": "Malagasy",
    "as": "Assamese",
    "tt": "Tatar",
    "haw": "Hawaiian",
    "ln": "Lingala",
    "ha": "Hausa",
    "ba": "Bashkir",
    "jw": "Javanese",
    "su": "Sundanese",
}

whisper_models = ["base", "small", "medium", "large"]

# Markdown layout used by output_markdown when the config carries no template.
_DEFAULT_TEMPLATE = """
{% for part in transcript -%}
**{{ part.Speaker }}**: *{{ part.Start }} - {{ part.End }}*
{{ part.Text }}

{% endfor %}
"""


def _dated_stem(upload_name):
    """Return ``YYYY-MM-DD-<stem>`` for *upload_name*, using today's local date.

    The stem is the base file name cut at its first dot. Any directory part
    is dropped first so that a full path cannot leak separators into the
    names of the files generated from it.
    """
    stem = os.path.basename(upload_name).split(".")[0]
    return f"{datetime.date.today().isoformat()}-{stem}"


def zip_files(config):
    """
    Zip together a list of files, returning the name of the output file.

    config is a dictionary like:

        config = {
            "files": ["file1.txt", "file2.txt", "file3.txt"],
            "input_name": "input.mp3",
        }

    The archive is named ``<YYYY-MM-DD>-<input stem>.zip`` and is written
    relative to the current working directory. Each file is stored under its
    base name only (no directory components).
    """
    from zipfile import ZipFile

    zip_name = f"{_dated_stem(config['input_name'])}.zip"
    with ZipFile(zip_name, "w") as archive:
        for fname in config["files"]:
            archive.write(fname, os.path.basename(fname))
    return zip_name


def output_csv(config):
    """Write ``config['transcript']`` to a CSV file and return its path.

    Reads the keys ``transcript``, ``outputname`` and ``output_dir``. The
    transcript is passed to ``pandas.DataFrame`` and saved with the default
    ``to_csv`` settings (the index column is included).
    """
    csv_file = os.path.join(config["output_dir"], f"{config['outputname']}.csv")
    pd.DataFrame(config["transcript"]).to_csv(csv_file)
    print(f"Saved CSV to {csv_file}")
    return csv_file


def output_markdown(config):
    """Render the transcript through a Jinja template and return the .md path.

    Reads the keys ``transcript``, ``outputname`` and ``output_dir``, plus an
    optional ``template`` (a Jinja template string). When the template is
    missing or ``None`` the built-in default is used. The template receives
    the transcript as ``transcript``, a list of one dict per row.
    """
    template = config.get("template")
    if template is None:
        template = _DEFAULT_TEMPLATE

    # 'records' turns the frame into a list of row dictionaries.
    rows = pd.DataFrame(config["transcript"]).to_dict("records")
    rendered = Environment().from_string(template).render(transcript=rows)

    markdown_file = os.path.join(config["output_dir"], f"{config['outputname']}.md")
    with open(markdown_file, "w", encoding="utf-8") as handle:
        handle.write(rendered)
    print(f"...wrote {markdown_file}")
    return markdown_file


def output_docx(config):
    """Convert the transcript's markdown file to DOCX with pandoc.

    Reads the keys ``outputname``, ``output_dir`` and ``markdown``. When
    ``markdown`` is truthy the markdown file is expected to already exist in
    ``output_dir``; otherwise a temporary one is rendered with
    ``output_markdown`` and removed again afterwards.

    Returns the DOCX path. A missing pandoc executable or a failed conversion
    is reported on stdout rather than raised, so the returned path may not
    exist in that case.
    """
    outputname = config["outputname"]
    output_dir = config["output_dir"]
    keep_markdown = bool(config["markdown"])

    if keep_markdown:
        markdown_file = os.path.join(output_dir, f"{outputname}.md")
    else:
        markdown_file = output_markdown(config)
    doc_file = os.path.join(output_dir, f"{outputname}.docx")

    try:
        # Argument list, no shell: file names derived from the upload name
        # cannot be interpreted as shell syntax. The input file is positional
        # (pandoc's -i means --incremental, not "input").
        result = subprocess.run(["pandoc", markdown_file, "-o", doc_file], check=False)
    except OSError as err:
        print(f"...could not run pandoc ({err}); {doc_file} was not written")
    else:
        if result.returncode == 0:
            print(f"...wrote {doc_file}")
        else:
            print(
                f"...pandoc exited with status {result.returncode}; "
                f"{doc_file} was not written"
            )
    finally:
        # The markdown file was only an intermediate step; never leave it behind.
        if not keep_markdown:
            os.remove(markdown_file)
    return doc_file


def otheroutputs(transcript, csv=True, markdown=True, docx=True, upload_name="input.mp3"):
    """Write the requested transcript exports into a fresh temporary directory.

    Args:
        transcript: Data accepted by ``pandas.DataFrame``. The default
            markdown template reads the columns ``Speaker``, ``Start``,
            ``End`` and ``Text``.
        csv: Write a CSV export.
        markdown: Write a Markdown export.
        docx: Write a DOCX export (requires the ``pandoc`` executable).
        upload_name: Name of the uploaded input; its stem, prefixed with
            today's date, names the outputs.

    Returns:
        The config dictionary used for the exports, with the keys
        ``input_name``, ``outputname``, ``output_dir``, ``transcript``,
        ``markdown``, ``template`` and ``files`` (paths of the written files,
        in the order csv, markdown, docx).
    """
    config = {
        "input_name": upload_name,
        "outputname": _dated_stem(upload_name),
        "output_dir": tempfile.mkdtemp(),
        "transcript": transcript,
        "markdown": markdown,
        # Placeholder to pass through custom jinja templates at a later date.
        "template": None,
    }

    files = []
    if csv:
        files.append(output_csv(config))
    if markdown:
        files.append(output_markdown(config))
    if docx:
        files.append(output_docx(config))
    config["files"] = files
    return config