import gradio as gr
import subprocess
import torch
from charts import spider_chart
from dictionaries import calculate_average, transform_dict
from icon import generate_icon
from transformers import pipeline
from timestamp import format_timestamp
from youtube import get_youtube_video_id

MODEL_NAME = "openai/whisper-medium"
BATCH_SIZE = 8

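#Run on GPU 0 when available; transformers pipelines accept a CUDA device index or "cpu"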
device = 0 if torch.cuda.is_available() else "cpu"

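#Whisper ASR pipeline; chunk_length_s=30 enables chunked inference over long audio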
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

#Formatting
MODEL_NAME1 = "jpdiazpardo/whisper-tiny-metal"  #checkpoint advertised in the description (the pipeline above loads MODEL_NAME)
description = ("Transcribe long-form audio inputs with the click of a button! The demo uses the"
        f" checkpoint [{MODEL_NAME1}](https://huggingface.co/{MODEL_NAME1}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length. Check out some of the 'cool' examples below.")

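#Each example row mirrors the inputs order: [yt_link, download_button, thumbnail, audio_file, return_timestamps, sentiment_analysis]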
examples = [["https://www.youtube.com/watch?v=W72Lnz1n-jw&ab_channel=Whitechapel-Topic", None, None,
             "examples/When a Demon Defiles a Witch.wav", True, True],
            ["https://www.youtube.com/watch?v=BnO3Io0KOl4&ab_channel=MotionlessInWhite-Topic", None, None,
             "examples/Immaculate Misconception.wav", True, True]]

linkedin = generate_icon("linkedin")
github = generate_icon("github")
article = ("<div style='text-align: center; max-width:800px; margin:10px auto;'>"
            f"<p>{linkedin} <a href='https://www.linkedin.com/in/juanpablodiazp/' target='_blank'>Juan Pablo Díaz Pardo</a><br>"
            f"{github} <a href='https://github.com/jpdiazpardo' target='_blank'>jpdiazpardo</a></p>"
            "</div>")

title = "Scream: Fine-Tuned Whisper model for automatic guttural speech recognition 🤟🤟🤟"

#-------------------------------------------------------------------------------------------------------------------------------

#Classifier for sentiment analysis; top_k=None returns a score for every emotion label
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=None)

def transcribe(*args):
    '''Transcribe the audio file and run sentiment analysis on the result.
    Positional args mirror the `inputs` list below:
    args[3] is the audio file path, args[4] the "Return timestamps" flag.'''
    outputs = pipe(args[3], batch_size=BATCH_SIZE, generate_kwargs={"task": 'transcribe'}, return_timestamps=True)
    text = outputs["text"]
    timestamps = outputs["chunks"]

    #If "Return timestamps" is checked, format each chunk as "[start -> end] text"
    if args[4]:
      spider_text = [f"{chunk['text']}" for chunk in timestamps] #Text for spider chart without timestamps
      timestamps = [f"[{format_timestamp(chunk['timestamp'][0])} -> {format_timestamp(chunk['timestamp'][1])}] {chunk['text']}" for chunk in timestamps]

    else:
      timestamps = [f"{chunk['text']}" for chunk in timestamps]
      spider_text = timestamps

    text = "<br>".join(str(feature) for feature in timestamps)
    text = f"<h4>Transcription</h4><div style='overflow-y: scroll; height: 150px;'>{text}</div>"

    spider_text = "\n".join(str(feature) for feature in spider_text)
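    #Classify each transcribed line, then average the per-emotion scores for the spider chart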
    trans_dict = [transform_dict(classifier.predict(t)[0]) for t in spider_text.split("\n")]
    av_dict = calculate_average(trans_dict)
    fig = spider_chart(av_dict)
    
    return args[3], text, fig, av_dict

def filter(choice):
    '''Enable whichever input matches the selected source (not wired to an event in this file).'''
    if choice == "YouTube":
        return yt_link.update(interactive=True), audio_input.update(interactive=False)
    elif choice == "Upload File":
        return yt_link.update(value=None, interactive=False), audio_input.update(interactive=True)
    else:
        return yt_link.update(interactive=False), audio_input.update(interactive=False)


#Embed template for the YouTube player; "YOUTUBE_ID" is substituted at download time
embed_html = ('<iframe src="https://www.youtube.com/embed/YOUTUBE_ID" '
              'title="YouTube video player" frameborder="0" allow="accelerometer; '
              'autoplay; clipboard-write; encrypted-media; gyroscope; '
              'picture-in-picture" allowfullscreen></iframe>')

def download(link):
    '''Convert the YouTube link to wav via the helper script and show the embedded player.'''
    subprocess.run(['python3', 'youtubetowav.py', link])
    return thumbnail.update(value=embed_html.replace("YOUTUBE_ID", get_youtube_video_id(link)), visible=True)

def hide_sa(value):
    '''Show or hide the sentiment analysis outputs based on the checkbox.'''
    return sa_plot.update(visible=value), sa_frequency.update(visible=value)
  
#Input components
yt_link = gr.Textbox(value=None, label="YouTube link", info="Optional: Copy and paste YouTube URL")
audio_input = gr.Audio(source="upload", type="filepath", label="Upload audio file for transcription")
download_button = gr.Button("Download")
thumbnail = gr.HTML(value=embed_html, visible=False)
sa_checkbox = gr.Checkbox(value=True, label="Sentiment analysis")

inputs = [yt_link, #0
          download_button, #1
          thumbnail, #2
          audio_input, #3
          gr.Checkbox(value=True, label="Return timestamps"), #4
          sa_checkbox] #5

#Output components
audio_out = gr.Audio(label="Processed Audio (vocals only)", type="filepath")  #gr.Audio takes no `info` kwarg
sa_plot = gr.Plot(label="Sentiment Analysis")
sa_frequency = gr.Label(label="Frequency")

outputs = [audio_out, gr.HTML(label="text"), sa_plot, sa_frequency]  #gr.outputs is deprecated; gr.HTML is the Blocks-era equivalent

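#Wire component events and render the Interface inside a Blocks layout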
with gr.Blocks() as demo:
  download_button.click(download, inputs=[yt_link], outputs=[thumbnail])
  sa_checkbox.change(hide_sa, inputs=[sa_checkbox], outputs=[sa_plot, sa_frequency])


  with gr.Column():
      gr.Interface(title=title, fn=transcribe, inputs=inputs, outputs=outputs,
                   description=description, cache_examples=True, allow_flagging="never",
                   article=article, examples=None)
            
demo.queue(concurrency_count=3)
demo.launch(debug=True)