File size: 10,509 Bytes
79b94f8
 
 
 
 
 
 
2b58524
79b94f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b58524
79b94f8
 
 
 
 
 
 
 
 
 
 
2b58524
 
 
 
 
 
 
 
 
79b94f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
# For downloading from youtube and transcribing audio
from pytube import YouTube
from moviepy.editor import * 
from pydub import AudioSegment
from pydub.utils import make_chunks
import pydub
from pathlib import Path
import subprocess

# For getting text from PDF
from zipfile import ZipFile
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

# For transcription
import openai, whisper, torch
from faster_whisper import WhisperModel
import tiktoken
from nltk import tokenize

# For other stuff
import os, re
import time, math

# USEFUL CONSTANTS

# Duration is set to 6 minutes = 360 seconds = 360000 milliseconds
# NOTE(review): not referenced by the code visible in this file - confirm it is still needed.
DURATION = 360000

# Size threshold for a downloaded audio file: 18 MB (18,000,000 bytes).
# Files at or above this size are additionally split into chunk files.
MAX_FILE_SIZE_BYTES = 18000000

# Name of the Whisper model that the transcription classes load
WHISPER_MODEL = "tiny"
# NOTE(review): MODEL_SIZE is not referenced by the code visible in this file - confirm before relying on it.
MODEL_SIZE = "base"

class DownloadAudio:
    """Download the audio track of a YouTube video as a .wav file.

    The audio is fetched with the bundled ``exec/yt-dlp_linux`` executable.
    When the resulting file is at least ``MAX_FILE_SIZE_BYTES`` large it is
    additionally split into numbered chunk files stored next to it.
    """

    def __init__(self, link) -> None:
        """Store the link and derive the video id and .wav file name from it.

        Args:
            link: URL of the YouTube video.
        """
        self.link = link
        self.yt = YouTube(self.link)
        # NOTE(review): assumes a ``...watch?v=<id>`` style link. A link
        # without "=" raises IndexError here, and additional query
        # parameters end up inside the id.
        self.YOUTUBE_VIDEO_ID = link.split("=")[1]
        self.WAV_FILE_NAME = f"{self.YOUTUBE_VIDEO_ID}.wav"

    def get_yt_title(self) -> str:
        """Return the title of the YouTube video.

        The lookup is retried once per second, with a fresh ``YouTube``
        object each time, until it succeeds.
        """
        while True:
            try:
                return self.yt.title
            except Exception:
                # Only ordinary errors are retried; KeyboardInterrupt and
                # SystemExit propagate so the loop can still be interrupted.
                print("Failed to get name. Retrying...")
                time.sleep(1)
                self.yt = YouTube(self.link)

    def download(self, pathname: str) -> str:
        """Download the video's audio into ``pathname`` as a .wav file.

        If the .wav file already exists it is reused instead of downloaded
        again. When the file is at least ``MAX_FILE_SIZE_BYTES`` large, it is
        also exported as ``<video id>_<n>.wav`` chunk files in the same
        folder.

        Args:
            pathname: Folder in which the audio file(s) are stored. It is
                created (including missing parents) if it does not exist.

        Returns:
            The path of the full .wav file, or the string ``"FAILED"`` when
            the download command exits with a non-zero status.
        """
        # Create the target folder; makedirs also creates missing parents,
        # which a nested path such as "./tests/<id>" needs.
        if not os.path.exists(pathname):
            os.makedirs(pathname, exist_ok=True)
        final_wav_path = f"{pathname}/{self.WAV_FILE_NAME}"

        if not os.path.exists(final_wav_path):
            print("\n\n\n DOWNLOADING AUDIO \n\n\n")
            current_dir = os.getcwd()
            print(current_dir)
            executable_path = os.path.join(current_dir, "exec/yt-dlp_linux")
            # Extract the audio track as wav with the bundled yt-dlp binary
            result = subprocess.run(
                [executable_path, "-x", "--audio-format", "wav",
                 "-o", final_wav_path, self.link]
            )
            if result.returncode != 0:
                print("Failed to download audio.")
                return "FAILED"

        # Small files need no chunking, so return before decoding the audio
        # into memory.
        if os.path.getsize(final_wav_path) < MAX_FILE_SIZE_BYTES:
            return final_wav_path

        audio = AudioSegment.from_wav(final_wav_path)

        # Bytes of PCM data per second of audio
        bytes_per_second = audio.frame_rate * audio.sample_width * audio.channels
        if bytes_per_second <= 0:
            # Nothing sensible to split; avoid a division by zero below.
            return final_wav_path

        # Longest chunk (in whole seconds, rounded up) that holds about
        # MAX_FILE_SIZE_BYTES of audio data
        chunk_length_in_sec = math.ceil(MAX_FILE_SIZE_BYTES / bytes_per_second)
        chunk_length_ms = chunk_length_in_sec * 1000

        # Export all of the individual chunks as wav files
        for i, chunk in enumerate(make_chunks(audio, chunk_length_ms)):
            output_chunk_path = f"{pathname}/{self.YOUTUBE_VIDEO_ID}_{i}.wav"
            chunk.export(output_chunk_path, format="wav")

        return final_wav_path


class VideoTranscription:
    """Transcribe the audio of a YouTube video given by its link."""

    def __init__(self, datalink) -> None:
        """Store the link and load the tokenizer and Whisper model.

        Args:
            datalink: Link to the YouTube video (expected to start with
                "http").
        """
        self.datalink = datalink
        self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
        self.model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
        openai.api_key = os.environ.get("OPENAI_API_KEY")

    def transcribe(self) -> dict:
        """Return the transcription of ``datalink`` and print the time taken.

        Returns:
            The dictionary produced by ``get_text_from_link`` when
            ``datalink`` starts with "http".
        """
        start_time = time.time()
        if self.datalink.startswith("http"):
            transcript = self.get_text_from_link()
        else:
            # NOTE(review): get_text_from_pdf is not defined on this class in
            # the code shown here; unless it is provided elsewhere this
            # branch raises AttributeError.
            transcript = self.get_text_from_pdf()
        end_time = time.time()
        print(f"transcription took {end_time - start_time} seconds")
        return transcript

    def get_text_from_link(self) -> dict:
        """Download the video's audio and transcribe it.

        Returns:
            A dictionary with the keys ``"title"`` (video title), ``"text"``
            (full transcript, with "$" escaped as "\\$") and ``"segments"``
            (one entry per sentence holding its ``"id"``, ``"text"`` and
            ``"tokens"``).

        Raises:
            RuntimeError: If the audio could not be downloaded.
        """
        # Folder in which the audio of this video is stored
        youtube_video_id = self.datalink.split("=")[1]
        folder_name = f"./tests/{youtube_video_id}"

        # Get the audio file
        audio_file = DownloadAudio(self.datalink)
        original_file_name = audio_file.download(folder_name)
        print(original_file_name)

        # download() signals failure with a sentinel string; stop here
        # instead of handing that string to the model as a file path.
        if original_file_name == "FAILED":
            raise RuntimeError(f"Could not download audio for {self.datalink}")

        # Transcribe the audio and join the text of all returned segments
        chunk_segments, _ = self.model.transcribe(original_file_name, beam_size=5)
        text_transcriptions = "".join(
            chunk_segment.text.replace("$", "\\$")
            for chunk_segment in chunk_segments
        )

        # Tokenize each sentence of the transcription.
        sentences = tokenize.sent_tokenize(text_transcriptions)
        segments = [
            {
                "id": i,
                "text": sentence,
                "tokens": self.encoding.encode(sentence),
            }
            for i, sentence in enumerate(sentences)
        ]

        return {
            "title": audio_file.get_yt_title(),
            "text": text_transcriptions,
            "segments": segments,
        }


class AudioTranscription:
    """Transcribe an uploaded .wav or .mp3 audio file."""

    def __init__(self, audio_file) -> None:
        """Store the file and load the tokenizer and Whisper model.

        Args:
            audio_file: File object of the audio; its ``name`` attribute is
                used as the title.
        """
        self.file = audio_file
        self.title = self.file.name
        self.folder_name = self._folder_for(self.title)
        self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
        self.model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
        openai.api_key = os.environ.get("OPENAI_API_KEY")

    @staticmethod
    def _folder_for(title: str) -> str:
        """Return ``./tests/<title>`` without spaces and without extension."""
        prefix = "./tests/"
        name = f"{prefix}{title}".replace(' ', '')
        # Only look for the extension dot inside the title itself; the "."
        # of the "./" prefix must not count, otherwise a title without an
        # extension would yield an empty folder name.
        dot = name.rfind('.', len(prefix))
        return name if dot == -1 else name[:dot]

    def get_redacted_name(self):
        """Return the folder name derived from the file title."""
        return self.folder_name

    def transcribe(self) -> dict:
        """Transcribe the audio file and print the time taken.

        A copy of the audio is saved into ``folder_name``; mp3 input is also
        exported as .wav, and the .wav file is what gets transcribed.

        Returns:
            A dictionary with the keys ``"title"`` (file name), ``"text"``
            (full transcript, with "$" escaped as "\\$") and ``"segments"``
            (one entry per sentence holding its ``"id"``, ``"text"`` and
            ``"tokens"``).

        Raises:
            ValueError: If the file name ends in neither "wav" nor "mp3".
        """
        start_time = time.time()

        # Decide the format up front (case-insensitively) so an unsupported
        # file fails with a clear message before anything is written.
        lowered_title = self.title.lower()
        if lowered_title.endswith('wav'):
            file_type = 'wav'
        elif lowered_title.endswith('mp3'):
            file_type = 'mp3'
        else:
            raise ValueError(
                f"Unsupported audio file {self.title!r}: expected .wav or .mp3"
            )

        if not os.path.exists(self.folder_name):
            os.makedirs(self.folder_name, exist_ok=True)

        if file_type == 'wav':
            audio = pydub.AudioSegment.from_wav(self.file)
        else:
            audio = pydub.AudioSegment.from_mp3(self.file)

        # Keep a copy of the audio in its original format
        save_path = Path(self.folder_name) / self.file.name
        audio.export(save_path, format=file_type)
        final_wav_path = str(save_path)

        if file_type == 'mp3':
            # Export the decoded audio straight to wav; re-reading the mp3
            # copy written above would add a second lossy decode.
            final_wav_path = self.folder_name + "/" + self.title[:-4] + '.wav'
            audio.export(final_wav_path, format="wav")

        # Transcribe the audio and join the text of all returned segments
        chunk_segments, _ = self.model.transcribe(final_wav_path, beam_size=5)
        text_transcriptions = "".join(
            chunk_segment.text.replace("$", "\\$")
            for chunk_segment in chunk_segments
        )

        # Tokenize each sentence of the transcription.
        sentences = tokenize.sent_tokenize(text_transcriptions)
        segments = [
            {
                "id": i,
                "text": sentence,
                "tokens": self.encoding.encode(sentence),
            }
            for i, sentence in enumerate(sentences)
        ]

        final_transcription = {
            "title": self.title,
            "text": text_transcriptions,
            "segments": segments,
        }
        end_time = time.time()
        print(f"transcription took {end_time - start_time} seconds")

        return final_transcription

def convert_pdf_to_txt_pages(path):
    """Extract the text of a PDF page by page.

    Args:
        path: The PDF to read; it is handed unchanged to
            ``PDFPage.get_pages``.
            NOTE(review): despite the name this is presumably an opened
            binary file object rather than a path string - confirm against
            the callers.

    Returns:
        A tuple ``(texts, nbPages)`` where ``texts`` is a list with the text
        of each page in order and ``nbPages`` is the number of pages.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    texts = []
    consumed = 0
    try:
        for page in PDFPage.get_pages(path):
            interpreter.process_page(page)
            # The converter appends to one shared buffer, so the text of
            # this page is whatever was added since the previous page.
            buffered = retstr.getvalue()
            texts.append(buffered[consumed:])
            consumed = len(buffered)
    finally:
        # Release the converter and buffer even if a page fails to parse
        device.close()
        retstr.close()

    # One entry is collected per page, so a separate counting pass over the
    # document is unnecessary.
    return texts, len(texts)
    
class PDFTranscription:
    """Extract the text of an uploaded PDF and split it into sentences."""

    def __init__(self, pdf_file):
        """Store the file and load the tokenizer.

        Args:
            pdf_file: File object of the PDF; its ``name`` attribute is used
                as the title.
        """
        self.file = pdf_file
        self.title = pdf_file.name
        self.folder_name = self._folder_for(self.title)
        self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

    @staticmethod
    def _folder_for(title: str) -> str:
        """Return ``./tests/<title>`` without spaces and without extension."""
        prefix = "./tests/"
        name = f"{prefix}{title}".replace(' ', '')
        # Only look for the extension dot inside the title itself; the "."
        # of the "./" prefix must not count, otherwise a title without an
        # extension would yield an empty folder name.
        dot = name.rfind('.', len(prefix))
        return name if dot == -1 else name[:dot]

    def get_redacted_name(self):
        """Return the folder name derived from the file title."""
        return self.folder_name

    def transcribe(self):
        """Return the text of the PDF together with its sentence segments.

        Returns:
            A dictionary with the keys ``"title"`` (file name), ``"text"``
            (text of all pages concatenated) and ``"segments"`` (one entry
            per sentence holding its ``"id"``, ``"text"`` and ``"tokens"``).
        """
        pages, _ = convert_pdf_to_txt_pages(self.file)
        pdf_transcription = ''.join(pages)

        # Tokenize each sentence of the extracted text.
        sentences = tokenize.sent_tokenize(pdf_transcription)
        segments = [
            {
                "id": i,
                "text": sentence,
                "tokens": self.encoding.encode(sentence),
            }
            for i, sentence in enumerate(sentences)
        ]

        return {
            "title": self.title,
            "text": pdf_transcription,
            "segments": segments,
        }