Spaces:

yellowcandle
/

whisper-v3-gradio

Sleeping

App Files Files Community

yellowcandle committed on Jun 18

Commit

4b18df1

•

1 Parent(s): 344a72e

Tried to add YouTube video upload

Browse files

Files changed (1) hide show

app.py +27 -6

app.py CHANGED Viewed

@@ -1,9 +1,33 @@
 import spaces
 import gradio as gr
-# Use a pipeline as a high-level helper
 import torch
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, AutoModelForCausalLM, AutoTokenizer
 @spaces.GPU(duration=60)
 def transcribe_audio(audio, model_id):
     if audio is None:
@@ -36,7 +60,6 @@ def transcribe_audio(audio, model_id):
     result = pipe(audio)
     return result["text"]
-# @spaces.GPU(duration=180)
 def proofread(text):
     if text is None:
         return "Please provide the transcribed text for proofreading."
@@ -50,15 +73,13 @@ def proofread(text):
     tokenizer = AutoTokenizer.from_pretrained("hfl/llama-3-chinese-8b-instruct-v3")
     model.to(device)
-    # Perform proofreading using the model
     input_text = prompt + text
     input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
-    output = model.generate(input_ids, max_length=len(input_ids[0])+50, num_return_sequences=1, temperature=0.7)
     proofread_text = tokenizer.decode(output[0], skip_special_tokens=True)
     return proofread_text
 with gr.Blocks() as demo:
     gr.Markdown("""
                 # Audio Transcription and Proofreading
@@ -70,7 +91,7 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
             audio = gr.Audio(sources="upload", type="filepath")
-            video = gr.Video(sources="upload")
         model_dropdown = gr.Dropdown(choices=["openai/whisper-large-v3", "alvanlii/whisper-small-cantonese"], value="openai/whisper-large-v3")
     transcribe_button = gr.Button("Transcribe")

 import spaces
 import gradio as gr
+import os
+import logging
+from pytube import YouTube
 import torch
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, AutoModelForCausalLM, AutoTokenizer
+def get_text(url):
+    if url != '':
+        output_text_transcribe = ''
+    yt = YouTube(url)
+    video = yt.streams.filter(only_audio=True).first()
+    out_file = video.download(output_path=".")
+    file_stats = os.stat(out_file)
+    logging.info(f'Size of audio file in Bytes: {file_stats.st_size}')
+    if file_stats.st_size <= 30000000:
+        base, ext = os.path.splitext(out_file)
+        new_file = base + '.mp3'
+        os.rename(out_file, new_file)
+        a = new_file
+        result = model.transcribe(a)
+        return result['text'].strip()
+    else:
+        logging.error('Videos for transcription on this space are limited to about 1.5 hours. Sorry about this limit but some joker thought they could stop this tool from working by transcribing many extremely long videos. Please visit https://steve.digital to contact me about this space.')
 @spaces.GPU(duration=60)
 def transcribe_audio(audio, model_id):
     if audio is None:
     result = pipe(audio)
     return result["text"]
 def proofread(text):
     if text is None:
         return "Please provide the transcribed text for proofreading."
     tokenizer = AutoTokenizer.from_pretrained("hfl/llama-3-chinese-8b-instruct-v3")
     model.to(device)
     input_text = prompt + text
     input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
+    output = model.generate(input_ids, max_length=len(input_ids[0]) + 50, num_return_sequences=1, temperature=0.7)
     proofread_text = tokenizer.decode(output[0], skip_special_tokens=True)
     return proofread_text
 with gr.Blocks() as demo:
     gr.Markdown("""
                 # Audio Transcription and Proofreading
     with gr.Row():
         with gr.Column():
             audio = gr.Audio(sources="upload", type="filepath")
+            input_text_url = gr.Textbox(label="Video URL")
         model_dropdown = gr.Dropdown(choices=["openai/whisper-large-v3", "alvanlii/whisper-small-cantonese"], value="openai/whisper-large-v3")
     transcribe_button = gr.Button("Transcribe")