Yasaman committed on
Commit
2387cd8
1 Parent(s): f36d3ff

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.ipynb +142 -0
  2. requirements.txt +4 -0
app.ipynb ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "python3",
10
+ "display_name": "Python 3"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ }
15
+ },
16
+ "cells": [
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": null,
20
+ "metadata": {
21
+ "id": "kPCLdTfJyktF"
22
+ },
23
+ "outputs": [],
24
+ "source": [
25
+ "import torch\n",
26
+ "\n",
27
+ "import gradio as gr\n",
28
+ "import pytube as pt\n",
29
+ "from transformers import pipeline\n",
30
+ "\n",
31
+ "asr = pipeline(\n",
32
+ " task=\"automatic-speech-recognition\",\n",
33
+ " model=\"Yasaman/whisper_fa\",\n",
34
+ " chunk_length_s=30,\n",
35
+ " device=\"cpu\",\n",
36
+ ")\n",
37
+ "\n",
38
+ "summarizer = pipeline(\n",
39
+ " \"summarization\",\n",
40
+ " model=\"alireza7/PEGASUS-persian-base-PN-summary\",\n",
41
+ ")\n",
42
+ "\n",
43
+ "translator = pipeline(\n",
44
+ " \"translation\", \n",
45
+ " model=\"Helsinki-NLP/opus-mt-iir-en\")\n",
46
+ "\n",
47
+ "def transcribe(microphone, file_upload):\n",
48
+ " warn_output = \"\"\n",
49
+ " if (microphone is not None) and (file_upload is not None):\n",
50
+ " warn_output = (\n",
51
+ " \"WARNING: You've uploaded an audio file and used the microphone. \"\n",
52
+ " \"The recorded file from the microphone will be used and the uploaded audio will be discarded.\\n\"\n",
53
+ " )\n",
54
+ "\n",
55
+ " elif (microphone is None) and (file_upload is None):\n",
56
+ " return \"ERROR: You have to either use the microphone or upload an audio file\"\n",
57
+ "\n",
58
+ " file = microphone if microphone is not None else file_upload\n",
59
+ "\n",
60
+ " text = asr(file)[\"text\"]\n",
61
+ "\n",
62
+ " translate = translator(text)\n",
63
+ " translate = translate[0][\"translation_text\"]\n",
64
+ "\n",
65
+ " return warn_output + text, translate\n",
66
+ "\n",
67
+ "def _return_yt_html_embed(yt_url):\n",
68
+ " video_id = yt_url.split(\"?v=\")[-1]\n",
69
+ " HTML_str = (\n",
70
+ " f'<center> <iframe width=\"500\" height=\"320\" src=\"https://www.youtube.com/embed/{video_id}\"> </iframe>'\n",
71
+ " \" </center>\"\n",
72
+ " )\n",
73
+ " return HTML_str\n",
74
+ "\n",
75
+ "\n",
76
+ "def yt_transcribe(yt_url):\n",
77
+ " yt = pt.YouTube(yt_url)\n",
78
+ " html_embed_str = _return_yt_html_embed(yt_url)\n",
79
+ " stream = yt.streams.filter(only_audio=True)[0]\n",
80
+ " stream.download(filename=\"audio.mp3\")\n",
81
+ "\n",
82
+ " text = asr(\"audio.mp3\")[\"text\"]\n",
83
+ "\n",
84
+ " summary = summarizer(text)\n",
85
+ " summary = summary[0][\"summary_text\"]\n",
86
+ " \n",
87
+ " translate = translator(summary)\n",
88
+ " translate = translate[0][\"translation_text\"]\n",
89
+ "\n",
90
+ " return html_embed_str, text, summary, translate\n",
91
+ "\n",
92
+ "demo = gr.Blocks()\n",
93
+ "\n",
94
+ "mf_transcribe = gr.Interface(\n",
95
+ " fn=transcribe,\n",
96
+ " inputs=[\n",
97
+ " gr.inputs.Audio(source=\"microphone\", type=\"filepath\", optional=True),\n",
98
+ " gr.inputs.Audio(source=\"upload\", type=\"filepath\", optional=True),\n",
99
+ " ],\n",
100
+ " outputs=[\n",
101
+ " gr.Textbox(label=\"Transcribed text\"),\n",
102
+ " gr.Textbox(label=\"Translated text\"),\n",
103
+ " ],\n",
104
+ " layout=\"horizontal\",\n",
105
+ " theme=\"huggingface\",\n",
106
+ " title=\"Whisper Demo: Transcribe and Translate Persian Audio\",\n",
107
+ " description=(\n",
108
+ "        \"Transcribe and Translate long-form microphone or audio inputs with the click of a button! Demo uses the fine-tuned\"\n",
109
+ " f\" [Yasaman/whisper_fa](https://huggingface.co/Yasaman/whisper_fa) and 🤗 Transformers to transcribe audio files\"\n",
110
+ " \" of arbitrary length. It also uses another model for the translation.\"\n",
111
+ " ),\n",
112
+ " allow_flagging=\"never\",\n",
113
+ ")\n",
114
+ "\n",
115
+ "yt_transcribe = gr.Interface(\n",
116
+ " fn=yt_transcribe,\n",
117
+ " inputs=[gr.inputs.Textbox(lines=1, placeholder=\"Paste the URL to a YouTube video here\", label=\"YouTube URL\")],\n",
118
+ " outputs=[\"html\",\n",
119
+ " gr.Textbox(label=\"Transcribed text\"),\n",
120
+ " gr.Textbox(label=\"Summarized text\"),\n",
121
+ " gr.Textbox(label=\"Translated text\"),\n",
122
+ " ],\n",
123
+ " layout=\"horizontal\",\n",
124
+ " theme=\"huggingface\",\n",
125
+ " title=\"Whisper Demo: Transcribe, Summarize and Translate YouTube\",\n",
126
+ " description=(\n",
127
+ "        \"Transcribe, Summarize and Translate long-form YouTube videos with the click of a button! Demo uses the fine-tuned \"\n",
128
+ " f\" [Yasaman/whisper_fa](https://huggingface.co/Yasaman/whisper_fa) and 🤗 Transformers to transcribe audio files of\"\n",
129
+ "        \" arbitrary length. It also uses two other models to first summarize and then translate the text input. You can try with the following example: \" \n",
130
+ " f\" [Video1](https://www.youtube.com/watch?v=qtRzP3KvQZk)\"\n",
131
+ " ),\n",
132
+ " allow_flagging=\"never\",\n",
133
+ ")\n",
134
+ "\n",
135
+ "with demo:\n",
136
+ " gr.TabbedInterface([mf_transcribe, yt_transcribe], [\"Transcribe and Translate Audio\", \"Transcribe, Summarize and Translate YouTube\"])\n",
137
+ "\n",
138
+ "demo.launch(enable_queue=True)"
139
+ ]
140
+ }
141
+ ]
142
+ }
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ transformers
2
+ torch
3
+ pytube
4
+ sentencepiece