Yasaman committed on
Commit
b19e249
1 Parent(s): 97a29e1

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +123 -0
app.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """app.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1BLa-ng23vT9TfwY5G535Y20WSO76Z3FD
8
+ """
9
+
10
+ import torch
11
+
12
+ import gradio as gr
13
+ import pytube as pt
14
+ from transformers import pipeline
15
+
16
# Speech-recognition pipeline built on the Yasaman/whisper_fa checkpoint.
# Audio is processed in 30-second chunks and inference is pinned to the CPU.
asr = pipeline(
    "automatic-speech-recognition",
    model="Yasaman/whisper_fa",
    device="cpu",
    chunk_length_s=30,
)

# Summarization pipeline; yt_transcribe() feeds it the transcribed text.
summarizer = pipeline(
    task="summarization",
    model="alireza7/PEGASUS-persian-base-PN-summary",
)

# Translation pipeline; used on the transcription (microphone/upload tab)
# and on the summary (YouTube tab).
translator = pipeline(
    task="translation",
    model="Helsinki-NLP/opus-mt-iir-en",
)
31
+
32
def transcribe(microphone, file_upload):
    """Transcribe an audio recording and translate the transcription.

    Args:
        microphone: File path of the microphone recording, or ``None`` when
            nothing was recorded.
        file_upload: File path of the uploaded audio file, or ``None`` when
            nothing was uploaded.

    Returns:
        A ``(transcription, translation)`` tuple of strings. When both inputs
        are given, the microphone recording wins and a warning is prepended to
        the transcription. When neither is given, the first element carries
        the error message and the second is empty.
    """
    if microphone is None and file_upload is None:
        # The interface wires this function to two output components, so the
        # error path must also yield two values; a lone string would not
        # match the declared outputs.
        return "ERROR: You have to either use the microphone or upload an audio file", ""

    warn_output = ""
    if microphone is not None and file_upload is not None:
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    # The microphone recording takes precedence over the uploaded file.
    file = microphone if microphone is not None else file_upload

    text = asr(file)["text"]

    # The translation pipeline returns a list with one dict per input.
    translate = translator(text)[0]["translation_text"]

    return warn_output + text, translate
51
+
52
+ def _return_yt_html_embed(yt_url):
53
+ video_id = yt_url.split("?v=")[-1]
54
+ HTML_str = (
55
+ f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
56
+ " </center>"
57
+ )
58
+ return HTML_str
59
+
60
+
61
def yt_transcribe(yt_url):
    """Transcribe, summarize and translate the audio of a YouTube video.

    The audio is downloaded into a temporary directory that is private to
    this call and removed afterwards. Concurrent requests therefore cannot
    overwrite each other's download, and no stale ``audio.mp3`` is left in
    the working directory.

    Args:
        yt_url: URL of the YouTube video.

    Returns:
        A ``(html_embed, transcription, summary, translation)`` tuple of
        strings. The translation is produced from the summary, not from the
        full transcription.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    import tempfile

    yt = pt.YouTube(yt_url)
    html_embed_str = _return_yt_html_embed(yt_url)

    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        raise ValueError("No audio-only stream is available for this YouTube video.")

    with tempfile.TemporaryDirectory() as tmp_dir:
        # download() returns the path of the file it wrote.
        audio_path = stream.download(output_path=tmp_dir, filename="audio.mp3")
        text = asr(audio_path)["text"]

    # Both pipelines return a list with one dict per input.
    summary = summarizer(text)[0]["summary_text"]
    translate = translator(summary)[0]["translation_text"]

    return html_embed_str, text, summary, translate
76
+
77
# Container that hosts the tabbed interfaces.
demo = gr.Blocks()

# Markdown blurb displayed under the title of the microphone/upload tab.
_mf_description = (
    "Transcribe and Translate long-form microphone or audio inputs with the click of a button! Demo uses the the fine-tuned"
    " [Yasaman/whisper_fa](https://huggingface.co/Yasaman/whisper_fa) and 🤗 Transformers to transcribe audio files"
    " of arbitrary length. It also uses another model for the translation."
)

# Interface for the microphone/upload tab: two optional audio inputs are
# passed to transcribe(), which fills the two text boxes.
mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", optional=True),
        gr.inputs.Audio(source="upload", type="filepath", optional=True),
    ],
    outputs=[
        gr.Textbox(label="Transcribed text"),
        gr.Textbox(label="Translated text"),
    ],
    title="Whisper Demo: Transcribe and Translate Persian Audio",
    description=_mf_description,
    layout="horizontal",
    theme="huggingface",
    allow_flagging="never",
)
99
+
100
# Markdown blurb displayed under the title of the YouTube tab.
_yt_description = (
    "Transcribe, Summarize and Translate long-form YouTube videos with the click of a button! Demo uses the the fine-tuned "
    " [Yasaman/whisper_fa](https://huggingface.co/Yasaman/whisper_fa) and 🤗 Transformers to transcribe audio files of"
    " arbitrary length. It also uses other two models to first summarize and then translate the text input. You can try with the following example: "
    " [Video1](https://www.youtube.com/watch?v=qtRzP3KvQZk)"
)

# Interface for the YouTube tab. NOTE: this assignment rebinds the name
# `yt_transcribe` from the function to the Interface; `fn=` is evaluated
# first, so the Interface still wraps the original function.
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
    ],
    outputs=[
        "html",
        gr.Textbox(label="Transcribed text"),
        gr.Textbox(label="Summarized text"),
        gr.Textbox(label="Translated text"),
    ],
    title="Whisper Demo: Transcribe, Summarize and Translate YouTube",
    description=_yt_description,
    layout="horizontal",
    theme="huggingface",
    allow_flagging="never",
)
119
+
120
# Mount both interfaces as tabs inside the shared Blocks container.
_tab_interfaces = [mf_transcribe, yt_transcribe]
_tab_titles = [
    "Transcribe and Translate Audio",
    "Transcribe, Summarize and Translate YouTube",
]
with demo:
    gr.TabbedInterface(_tab_interfaces, _tab_titles)

# Start the app with request queuing enabled.
demo.launch(enable_queue=True)