jduckles committed on
Commit
97598b4
•
1 Parent(s): 82897f5
Files changed (10)
  1. Dockerfile +17 -0
  2. README.md +10 -6
  3. app.py +263 -0
  4. docker-compose.yml +15 -0
  5. download-models.py +8 -0
  6. packages.txt +2 -0
  7. peanut.mp3 +0 -0
  8. requirements.txt +23 -0
  9. transcribe +18 -0
  10. util.py +198 -0
Dockerfile ADDED
@@ -0,0 +1,17 @@
+ FROM nvidia/cuda:12.1.1-base-ubuntu22.04
+
+ ENV TZ=Pacific/Auckland
+ ENV DEBIAN_FRONTEND=noninteractive
+
+ RUN apt-get update
+ RUN apt-get update && apt-get install -y tzdata git pandoc ffmpeg python3 python3-pip
+
+ ADD ./requirements.txt /srv/requirements.txt
+ RUN pip install -r /srv/requirements.txt
+
+ RUN pip install --no-cache-dir datasets "huggingface-hub>=0.12.1" "protobuf<4" "click<8.1"
+
+ ADD . /srv
+
+ WORKDIR /srv
+ CMD ["gradio", "app.py"]
README.md CHANGED
@@ -1,11 +1,15 @@
  ---
- title: Test Diarize
- emoji: 👀
- colorFrom: purple
- colorTo: purple
- sdk: docker
+ title: Whisper Speaker Diarization
+ emoji: 🎎
+ colorFrom: blue
+ colorTo: red
+ sdk: gradio
+ sdk_version: 3.18.0
+ app_file: app.py
  pinned: false
- license: mit
+ tags:
+ - whisper-event
+ duplicated_from: vumichien/whisper-speaker-diarization
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,263 @@
+ import whisper
+ import datetime
+ import gradio as gr
+ import pandas as pd
+
+ import time
+ import os
+ import numpy as np
+ from sklearn.cluster import AgglomerativeClustering
+
+ import torch
+ from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
+ from pyannote.audio import Audio, Pipeline
+ from pyannote.core import Segment
+
+ from gpuinfo import GPUInfo
+
+ from util import *
+ import wave
+ import contextlib
+ from transformers import pipeline
+ import psutil
+
+ source_language_list = [key[0] for key in source_languages.items()]
+
+ MODEL_NAME = "openai/whisper-base.en"
+ lang = "en"
+
+ device = 0 if torch.cuda.is_available() else "cpu"
+ pipe = pipeline(
+     task="automatic-speech-recognition",
+     model=MODEL_NAME,
+     chunk_length_s=30,
+     device=device,
+ )
+
+ pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
+
+ embedding_model = PretrainedSpeakerEmbedding(
+     "speechbrain/spkrec-ecapa-voxceleb",
+     device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+
+
+ pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1",
+                                     use_auth_token="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")
+
+
+
+ def transcribe(microphone, file_upload):
+     print("Beginning transcribe...")
+     warn_output = ""
+     if (microphone is not None) and (file_upload is not None):
+         warn_output = (
+             "WARNING: You've uploaded an audio file and used the microphone. "
+             "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
+         )
+
+     elif (microphone is None) and (file_upload is None):
+         return "ERROR: You have to either use the microphone or upload an audio file"
+
+     file = microphone if microphone is not None else file_upload
+
+     text = pipe(file)["text"]
+
+     return warn_output + text
+
+
+ def convert_time(secs):
+     return datetime.timedelta(seconds=round(secs))
+
+
+ def speech_to_text(audio_file_path, selected_source_lang, whisper_model, num_speakers, output_types=['csv', 'docx', 'md']):
+     """
+     # Transcribe an uploaded audio file using OpenAI Whisper
+     1. Use OpenAI's Whisper model to separate the audio into segments and generate transcripts.
+     2. Generate speaker embeddings for each segment.
+     3. Apply agglomerative clustering on the embeddings to identify the speaker of each segment.
+
+     Speech recognition is based on models from OpenAI Whisper https://github.com/openai/whisper
+     Speaker diarization model and pipeline from https://github.com/pyannote/pyannote-audio
+     """
+     print("Loading model...")
+     torch.cuda.empty_cache()
+     model = whisper.load_model(whisper_model)
+     time_start = time.time()
+     try:
+         upload_name = audio_file_path.orig_name
+         file_name = audio_file_path.name
+     except Exception:
+         upload_name = "output.mp3"
+         file_name = audio_file_path
+     if audio_file_path is None:
+         raise ValueError("Error: no audio input")
+
+     try:
+         _, file_ending = os.path.splitext(f'{file_name}')
+         print(f'file ending is {file_ending}')
+         audio_file = file_name.replace(file_ending, ".wav")
+         print("starting conversion to wav")
+         os.system(f'ffmpeg -y -i "{file_name}" -ar 16000 -ac 1 -c:a pcm_s16le "{audio_file}"')
+
+         # Get duration
+         with contextlib.closing(wave.open(audio_file, 'r')) as f:
+             frames = f.getnframes()
+             rate = f.getframerate()
+             duration = frames / float(rate)
+         print(f"conversion to wav ready, duration of audio file: {duration}")
+
+         # Transcribe audio
+         options = dict(language=selected_source_lang, beam_size=5, best_of=5)
+         transcribe_options = dict(task="transcribe", **options)
+         result = model.transcribe(audio_file, **transcribe_options)
+         segments = result["segments"]
+         print("whisper transcription done")
+     except Exception as e:
+         raise RuntimeError("Error converting or transcribing audio") from e
+
+     try:
+         # Create embedding
+         def segment_embedding(segment):
+             audio = Audio()
+             start = segment["start"]
+             # Whisper overshoots the end timestamp in the last segment
+             end = min(duration, segment["end"])
+             clip = Segment(start, end)
+             waveform, sample_rate = audio.crop(audio_file, clip)
+             return embedding_model(waveform[None])
+
+         embeddings = np.zeros(shape=(len(segments), 192))
+         for i, segment in enumerate(segments):
+             embeddings[i] = segment_embedding(segment)
+         embeddings = np.nan_to_num(embeddings)
+         print(f'Embedding shape: {embeddings.shape}')
+
+         # Assign speaker label
+         clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+         labels = clustering.labels_
+         for i in range(len(segments)):
+             segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
+
+         # Make output
+         objects = {
+             'Start': [],
+             'End': [],
+             'Speaker': [],
+             'Text': []
+         }
+         text = ''
+         for (i, segment) in enumerate(segments):
+             if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+                 objects['Start'].append(str(convert_time(segment["start"])))
+                 objects['Speaker'].append(segment["speaker"])
+                 if i != 0:
+                     objects['End'].append(str(convert_time(segments[i - 1]["end"])))
+                     objects['Text'].append(text)
+                     text = ''
+             text += segment["text"] + ' '
+         objects['End'].append(str(convert_time(segments[i - 1]["end"])))
+         objects['Text'].append(text)
+
+         time_end = time.time()
+         time_diff = time_end - time_start
+         memory = psutil.virtual_memory()
+         gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
+         gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
+         gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
+         system_info = f"""
+         *Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
+         *Processing time: {time_diff:.5} seconds.*
+         *GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}MiB.*
+         """
+         os.remove(file_name)
+         print(output_types)
+         docx = not set(['docx']).isdisjoint(output_types)
+         markdown = not set(['md']).isdisjoint(output_types)
+         csv = not set(['csv']).isdisjoint(output_types)
+         other_outs = zip_files(otheroutputs(objects, csv=csv, markdown=markdown, docx=docx, upload_name=upload_name))
+
+         return pd.DataFrame(objects), system_info, other_outs
+
+     except Exception as e:
+         raise RuntimeError("Error running inference with local model") from e
+
+
+ def main():
+     df_init = pd.DataFrame(columns=['Start', 'End', 'Speaker', 'Text'])
+     memory = psutil.virtual_memory()
+
+     try:
+         cuda_device_model = torch.cuda.get_device_name(torch.cuda.current_device())
+     except Exception:
+         cuda_device_model = "CUDA not found"
+     system_info = gr.Markdown(f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB* Have CUDA?: {torch.cuda.is_available()} CUDA Device: {cuda_device_model}")
+     transcription_df = gr.DataFrame(value=df_init, label="Transcription dataframe", row_count=(0, "dynamic"), max_rows=10, wrap=True, overflow_row_behaviour='paginate')
+     zip_download = gr.File(label="Output")
+     title = "Whisper speaker diarization"
+     demo = gr.Blocks(title=title)
+     demo.queue(concurrency_count=3)
+     demo.encrypt = False
+
+
+     selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="en", label="Spoken language in recording", interactive=True)
+     selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="base", label="Selected Whisper model", interactive=True)
+     number_speakers = gr.Number(precision=0, value=2, label="Selected number of speakers", interactive=True)
+     out_formats = ["docx", "md", "csv"]
+     output_types = gr.CheckboxGroup(choices=out_formats, value=out_formats, label="Select output types", interactive=True)
+
+     with demo:
+
+         with gr.Tab("Transcribe Audio Files"):
+
+             with gr.Row():
+                 gr.HTML('<script defer data-domain="transcribe.orgmycology.com" src="https://a.duckles.nz/js/plausible.js"></script>')
+                 gr.Markdown("""## Transcribe your audio files
+
+ This tool will help you transcribe audio files and tag the speakers (i.e. Speaker 1, Speaker 2).
+
+ Steps:
+
+ 1. Upload a file (drag/drop to the upload area, or click and select)
+ 2. Select the spoken language
+ 3. Select the model version (larger == slower, but more accurate)
+ 4. Hint at the number of speakers in the audio file (doesn't have to be exact)
+ 5. Choose the output formats you'd like
+ 6. Click Transcribe!
+ 7. Wait for it to finish, then download the output file
+ """)
+
+             with gr.Row():
+                 with gr.Column():
+
+                     upload_diarize = gr.File(type="file", label="Upload Audio", interactive=True)
+
+
+             with gr.Row():
+                 with gr.Column():
+                     selected_source_lang.render()
+                     selected_whisper_model.render()
+                     number_speakers.render()
+                     output_types.render()
+
+                     transcribe_btn = gr.Button(" 🟢 Transcribe! ")
+                     transcribe_btn.click(speech_to_text, [upload_diarize, selected_source_lang, selected_whisper_model, number_speakers, output_types], [transcription_df, system_info, zip_download], api_name="diarized_transcribe")
+
+
+
+             with gr.Row():
+                 with gr.Column():
+                     zip_download.render()
+                     transcription_df.render()
+                     system_info.render()
+
+     demo.launch(show_error=True, debug=True)
+
+ if __name__ == "__main__":
+     import sys
+     input_file = sys.argv[1]
+     selected_source_lang = "en"
+     selected_whisper_model = "base"
+     number_speakers = 2
+     speech_to_text(input_file, selected_source_lang, selected_whisper_model, number_speakers)
+ else:
+     main()
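The `speech_to_text` docstring above describes the approach: Whisper splits the audio into timed segments, a speaker-embedding model turns each segment into a 192-dimensional vector, and agglomerative clustering groups those vectors into the hinted number of speakers. A minimal, self-contained sketch of just that clustering step (with random vectors standing in for the real `speechbrain/spkrec-ecapa-voxceleb` embeddings, and made-up segment times) looks roughly like this:

import numpy as np
from sklearn.cluster import AgglomerativeClustering

# Stand-in data: one 192-dim embedding per Whisper segment.
# In app.py these come from PretrainedSpeakerEmbedding over each segment's waveform.
segments = [{"start": 0.0, "end": 4.2}, {"start": 4.2, "end": 9.1}, {"start": 9.1, "end": 15.0}]
rng = np.random.default_rng(0)
embeddings = np.nan_to_num(rng.normal(size=(len(segments), 192)))

# Group the embeddings into the hinted number of speakers and tag each segment.
num_speakers = 2
labels = AgglomerativeClustering(num_speakers).fit(embeddings).labels_
for segment, label in zip(segments, labels):
    segment["speaker"] = f"SPEAKER {label + 1}"

print(segments)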
docker-compose.yml ADDED
@@ -0,0 +1,15 @@
+ services:
+   transcribe:
+     build: ./
+     restart: unless-stopped
+     environment:
+       - GRADIO_SERVER_NAME=0.0.0.0
+     deploy:
+       resources:
+         reservations:
+           devices:
+             - driver: nvidia
+               count: 1
+               capabilities: [gpu]
+     ports:
+       - 7860:7860
download-models.py ADDED
@@ -0,0 +1,8 @@
+ import app
+ import sys
+ input_file = sys.argv[1]
+ selected_source_lang = "en"
+ selected_whisper_model = "base"
+ number_speakers = 2
+ app.speech_to_text(input_file, selected_source_lang, selected_whisper_model, number_speakers)
+
packages.txt ADDED
@@ -0,0 +1,2 @@
+ ffmpeg
+ pandoc
peanut.mp3 ADDED
Binary file (379 kB).
requirements.txt ADDED
@@ -0,0 +1,23 @@
+ git+https://github.com/huggingface/transformers
+ git+https://github.com/pyannote/pyannote-audio
+ git+https://github.com/openai/whisper.git
+ gradio==3.18.0
+ ffmpeg-python
+ pandas==1.5.0
+ pytube==12.1.0
+ sacremoses
+ sentencepiece
+ tokenizers
+ --extra-index-url https://download.pytorch.org/whl/cu113
+ torch
+ pydub
+ torchaudio
+ tqdm==4.64.1
+ EasyNMT==2.0.2
+ nltk
+ transformers
+ pysrt
+ psutil==5.9.2
+ requests
+ gpuinfo
+ jinja2
transcribe ADDED
@@ -0,0 +1,18 @@
+ #!/usr/bin/env python3
+
+ import base64
+ import requests
+ import sys
+
+ base64string = base64.b64encode(open(sys.argv[1], "rb").read()).decode("ascii")
+
+ response = requests.post("https://orgmycology-whisper-speaker-diarization.hf.space/run/diarized_transcribe", json={
+     "data": [
+         {"name": "audio.mp3", "data": f"data:@file/octet-stream;base64,{base64string}"},
+         "en",
+         "base",
+         2,
+     ]
+ }).json()
+
+ data = response["data"]
util.py ADDED
@@ -0,0 +1,198 @@
+ import datetime
+ from jinja2 import Environment
+ import tempfile
+ import pandas as pd
+ import os
+
+ source_languages = {
+     "en": "English",
+     "zh": "Chinese",
+     "de": "German",
+     "es": "Spanish",
+     "ru": "Russian",
+     "ko": "Korean",
+     "fr": "French",
+     "ja": "Japanese",
+     "pt": "Portuguese",
+     "tr": "Turkish",
+     "pl": "Polish",
+     "ca": "Catalan",
+     "nl": "Dutch",
+     "ar": "Arabic",
+     "sv": "Swedish",
+     "it": "Italian",
+     "id": "Indonesian",
+     "hi": "Hindi",
+     "fi": "Finnish",
+     "vi": "Vietnamese",
+     "he": "Hebrew",
+     "uk": "Ukrainian",
+     "el": "Greek",
+     "ms": "Malay",
+     "cs": "Czech",
+     "ro": "Romanian",
+     "da": "Danish",
+     "hu": "Hungarian",
+     "ta": "Tamil",
+     "no": "Norwegian",
+     "th": "Thai",
+     "ur": "Urdu",
+     "hr": "Croatian",
+     "bg": "Bulgarian",
+     "lt": "Lithuanian",
+     "la": "Latin",
+     "mi": "Maori",
+     "ml": "Malayalam",
+     "cy": "Welsh",
+     "sk": "Slovak",
+     "te": "Telugu",
+     "fa": "Persian",
+     "lv": "Latvian",
+     "bn": "Bengali",
+     "sr": "Serbian",
+     "az": "Azerbaijani",
+     "sl": "Slovenian",
+     "kn": "Kannada",
+     "et": "Estonian",
+     "mk": "Macedonian",
+     "br": "Breton",
+     "eu": "Basque",
+     "is": "Icelandic",
+     "hy": "Armenian",
+     "ne": "Nepali",
+     "mn": "Mongolian",
+     "bs": "Bosnian",
+     "kk": "Kazakh",
+     "sq": "Albanian",
+     "sw": "Swahili",
+     "gl": "Galician",
+     "mr": "Marathi",
+     "pa": "Punjabi",
+     "si": "Sinhala",
+     "km": "Khmer",
+     "sn": "Shona",
+     "yo": "Yoruba",
+     "so": "Somali",
+     "af": "Afrikaans",
+     "oc": "Occitan",
+     "ka": "Georgian",
+     "be": "Belarusian",
+     "tg": "Tajik",
+     "sd": "Sindhi",
+     "gu": "Gujarati",
+     "am": "Amharic",
+     "yi": "Yiddish",
+     "lo": "Lao",
+     "uz": "Uzbek",
+     "fo": "Faroese",
+     "ht": "Haitian creole",
+     "ps": "Pashto",
+     "tk": "Turkmen",
+     "nn": "Nynorsk",
+     "mt": "Maltese",
+     "sa": "Sanskrit",
+     "lb": "Luxembourgish",
+     "my": "Myanmar",
+     "bo": "Tibetan",
+     "tl": "Tagalog",
+     "mg": "Malagasy",
+     "as": "Assamese",
+     "tt": "Tatar",
+     "haw": "Hawaiian",
+     "ln": "Lingala",
+     "ha": "Hausa",
+     "ba": "Bashkir",
+     "jw": "Javanese",
+     "su": "Sundanese",
+ }
+ whisper_models = ["base", "small", "medium", "large"]
+
+ def zip_files(config):
+     """
+     Zip together a list of files, returning the name of the output file.
+     config is a dictionary like:
+     config = {
+         "files": ['file1.txt', 'file2.txt', 'file3.txt'],
+         "input_name": "inputfilename"
+     }
+     """
+     from zipfile import ZipFile
+     files = config['files']
+     now = datetime.datetime.now().replace(microsecond=0).isoformat().split("T")[0]
+     outputname = now + "-" + config['input_name'].split('.')[0]
+     with ZipFile(f"{outputname}.zip", "w") as zipObj:
+         for idx, fname in enumerate(files):
+             zipObj.write(fname, os.path.basename(fname))
+     return f"{outputname}.zip"
+
+ def output_csv(config):
+     transcript = config['transcript']
+     outputname = config['outputname']
+     output_dir = config['output_dir']
+     csv_file = output_dir + f"/{outputname}.csv"
+     pd.DataFrame(transcript).to_csv(csv_file)
+     print("Saved CSV to " + csv_file)
+     return csv_file
+
+ def output_markdown(config):
+     template = config['template']
+     outputname = config['outputname']
+     transcript = config['transcript']
+     output_dir = config['output_dir']
+     if template is None:
+         template = """
+ {% for part in transcript -%}
+ **{{ part.Speaker }}**: *{{ part.Start }} - {{ part.End }}*
+ {{ part.Text }}
+ <br>
+ {% endfor %}
+ """
+     environment = Environment()
+     templ = environment.from_string(template)
+     # Output a list of dictionaries using 'records'
+     trans_dict = pd.DataFrame(transcript).to_dict('records')
+     markdown_out = templ.render(transcript=trans_dict)
+     markdown_file = output_dir + f"/{outputname}.md"
+     with open(markdown_file, "w", encoding="utf-8") as message:
+         message.write(markdown_out)
+     print(f"...wrote {markdown_file}")
+     return markdown_file
+
+ def output_docx(config):
+
+     outputname = config['outputname']
+     output_dir = config['output_dir']
+
+     if not config['markdown']:
+         markdown_file = output_markdown(config)
+     else:
+         markdown_file = output_dir + f"/{outputname}.md"
+     doc_file = config['output_dir'] + f"/{outputname}.docx"
+     os.system(f'pandoc -i "{markdown_file}" -o "{doc_file}"')
+     print(f"...wrote {doc_file}")
+
+     if not config['markdown']:
+         os.remove(f'{output_dir}/{outputname}.md')
+
+     return doc_file
+
+ def otheroutputs(transcript, csv=True, markdown=True, docx=True, upload_name="input.mp3"):
+     config = {}
+     now = datetime.datetime.now().replace(microsecond=0).isoformat().split("T")[0]
+     outputname = now + "-" + upload_name.split('.')[0]
+     output_dir = tempfile.mkdtemp()
+     files = []
+     config['input_name'] = upload_name
+     config['outputname'] = outputname
+     config['output_dir'] = output_dir
+     config['transcript'] = transcript
+     config['markdown'] = markdown
+     config['template'] = None  # Placeholder to pass through custom jinja templates at a later date
+     if csv:
+         files.append(output_csv(config))
+     if markdown:
+         files.append(output_markdown(config))
+     if docx:
+         files.append(output_docx(config))
+     config['files'] = files
+     return config
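`otheroutputs` builds the config dictionary that the writer functions and `zip_files` share, which is how `speech_to_text` produces its downloadable bundle. A small usage sketch (the two-row transcript and the file name `demo.mp3` are made up for illustration; the docx step shells out to pandoc, so pandoc must be on PATH):

from util import otheroutputs, zip_files

# A transcript in the same column layout speech_to_text assembles.
transcript = {
    "Start": ["0:00:00", "0:00:05"],
    "End": ["0:00:05", "0:00:12"],
    "Speaker": ["SPEAKER 1", "SPEAKER 2"],
    "Text": ["Hello and welcome.", "Thanks for having me."],
}

# Write CSV, Markdown and docx into a temp dir, then zip them for download.
config = otheroutputs(transcript, csv=True, markdown=True, docx=True, upload_name="demo.mp3")
print(zip_files(config))  # e.g. 2023-03-15-demo.zip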