tomiwa1a
/

video-search

Automatic Speech Recognition

endpoints-template

Inference Endpoints

Model card Files Files and versions Community

tomiwa1a committed on Dec 24, 2022

Commit

97023c4

·

1 Parent(s): c8f6664

add video info to transcript

Files changed (1) hide show

handler.py +14 -4

handler.py CHANGED Viewed

@@ -42,18 +42,28 @@ class EndpointHandler():
             "verbose": True
         }
         yt = pytube.YouTube(video_url)
         stream = yt.streams.filter(only_audio=True)[0]
         path_to_audio = f"{yt.video_id}.mp3"
         stream.download(filename=path_to_audio)
         t0 = time.time()
         transcript = self.model.transcribe(path_to_audio, **decode_options)
         t1 = time.time()
-        total = t1-t0
-        print(f'Finished transcription in {total} seconds')
         for segment in transcript['segments']:
-          # Remove the tokens array, it was making response too verbose
           segment.pop('tokens', None)
         # postprocess the prediction
-        return {"transcript": transcript}

             "verbose": True
         }
         yt = pytube.YouTube(video_url)
+        video_info = {
+            'id': yt.video_id,
+            'thumbnail': yt.thumbnail_url,
+            'title': yt.title,
+            'views': yt.views,
+            'length': yt.length,
+            # Although this might seem redundant since we already have the id,
+            # it allows the link to the video to be accessed in one click from the API response
+            'url': f"https://www.youtube.com/watch?v={yt.video_id}"
+        }
         stream = yt.streams.filter(only_audio=True)[0]
         path_to_audio = f"{yt.video_id}.mp3"
         stream.download(filename=path_to_audio)
         t0 = time.time()
         transcript = self.model.transcribe(path_to_audio, **decode_options)
         t1 = time.time()
         for segment in transcript['segments']:
+          # Remove the tokens array, it makes the response too verbose
           segment.pop('tokens', None)
+        total = t1-t0
+        print(f'Finished transcription in {total} seconds')
         # postprocess the prediction
+        return {"transcript": transcript, 'video': video_info}