yizhilll committed
Commit 5247bff
1 Parent(s): 1eaf59a

add demo loading code

Files changed (2)
  1. __pycache__/app.cpython-310.pyc +0 -0
  2. app.py +73 -4
__pycache__/app.cpython-310.pyc ADDED
Binary file (1.76 kB)
 
app.py CHANGED
@@ -1,7 +1,76 @@
  import gradio as gr

- def greet(name):
-     return "Hello " + name + "!!"

- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
- iface.launch()
+ from transformers import Wav2Vec2FeatureExtractor
+ from transformers import AutoModel
+ import torch
+ from torch import nn
+ import torchaudio
+ import torchaudio.transforms as T
+
+ # input cr: https://huggingface.co/spaces/thealphhamerc/audio-to-text/blob/main/app.py
+
+ inputs = [gr.components.Audio(type="filepath", label="Add music audio file"),
+           gr.inputs.Audio(source="microphone", optional=True, type="filepath"),
+           ]
+ outputs = [gr.components.Textbox()]
+ # outputs = [gr.components.Textbox(), transcription_df]
+ title = "Output the tags of a (music) audio"
+ description = "An example of using MERT-95M-public to conduct music tagging."
+ article = ""
+ audio_examples = [
+     ["input/example-1.wav"],
+     ["input/example-2.wav"],
+ ]
+
+ # Load the model
+ model = AutoModel.from_pretrained("m-a-p/MERT-v0-public", trust_remote_code=True)
+ # loading the corresponding preprocessor config
+ processor = Wav2Vec2FeatureExtractor.from_pretrained("m-a-p/MERT-v0-public", trust_remote_code=True)
+
+
+ def convert_audio(inputs, microphone):
+     if (microphone is not None):
+         inputs = microphone
+
+     waveform, sample_rate = torchaudio.load(inputs)
+
+     resample_rate = processor.sampling_rate
+
+     # make sure the sample_rate aligned
+     if resample_rate != sample_rate:
+         print(f'setting rate from {sample_rate} to {resample_rate}')
+         resampler = T.Resample(sample_rate, resample_rate)
+         waveform = resampler(waveform)
+
+     inputs = processor(waveform, sampling_rate=resample_rate, return_tensors="pt")
+     with torch.no_grad():
+         outputs = model(**inputs, output_hidden_states=True)
+
+     # take a look at the output shape, there are 13 layers of representation
+     # each layer performs differently in different downstream tasks, you should choose empirically
+     all_layer_hidden_states = torch.stack(outputs.hidden_states).squeeze()
+     # print(all_layer_hidden_states.shape)  # [13 layers, time steps, 768 feature_dim]
+     return str(all_layer_hidden_states.shape)
+
+
+ # iface = gr.Interface(fn=convert_audio, inputs="audio", outputs="text")
+ # iface.launch()
+
+ audio_chunked = gr.Interface(
+     fn=convert_audio,
+     inputs=inputs,
+     outputs=outputs,
+     allow_flagging="never",
+     title=title,
+     description=description,
+     article=article,
+     examples=audio_examples,
+ )
+
+
+ demo = gr.Blocks()
+ with demo:
+     gr.TabbedInterface([audio_chunked], [
+         "Audio File"])
+ # demo.queue(concurrency_count=1, max_size=5)
+ demo.launch(show_api=False)
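
Note: the diff's own comments point out that outputs.hidden_states stacks into 13 layers of representation whose usefulness varies by downstream task, while the demo only returns the tensor shape string. Below is a minimal sketch, not part of this commit, of how that [13 layers, time steps, 768 features] stack could be reduced to a single clip-level embedding for a tagging head; the dummy tensor and the 1x1-convolution aggregator are illustrative assumptions, not the author's implementation.

import torch
from torch import nn

# stand-in for the [13 layers, time steps, 768 features] stack built in convert_audio()
all_layer_hidden_states = torch.randn(13, 250, 768)

# collapse the time axis: one 768-dim vector per layer
time_reduced = all_layer_hidden_states.mean(dim=-2)               # [13, 768]

# learnable weighting across the 13 layers (a 1x1 conv acts as a weighted sum over layers);
# in a real tagging demo this aggregator would be trained jointly with a classifier head
aggregator = nn.Conv1d(in_channels=13, out_channels=1, kernel_size=1)
clip_embedding = aggregator(time_reduced.unsqueeze(0)).squeeze()  # [768]

print(clip_embedding.shape)  # torch.Size([768])

A trained classification head over such a clip_embedding, rather than returning str(all_layer_hidden_states.shape), is what the "Output the tags of a (music) audio" title would ultimately require.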