import gradio as gr
from msclap import CLAP

# Load Microsoft's CLAP model with the "clapcap" captioning head; CPU-only here.
clap_model = CLAP(version="clapcap", use_cuda=False)

def clap_inference(mic=None, file=None):
    # Prefer the microphone recording; fall back to the uploaded file.
    if mic is not None:
        audio = mic
    elif file is not None:
        audio = file
    else:
        return "You must either provide a mic recording or a file"
    # Generate a caption for the recording; the very low temperature keeps
    # beam-search decoding nearly deterministic.
    captions = clap_model.generate_caption(
        [audio],
        resample=True,
        beam_size=5,
        entry_length=67,
        temperature=0.01,
    )
    return captions[0]
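
# Quick check without the UI (the path below is a placeholder, not a file
# bundled with this demo):
#   clap_inference(file="sample.wav")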

def create_app():
    with gr.Blocks() as demo:
        gr.Markdown(
            """
            # CLAP demo for automatic audio captioning
            """
        )
        # Two audio inputs, microphone and file upload; either one is passed
        # to clap_inference as a file path.
        gr.Interface(
            fn=clap_inference,
            inputs=[
                gr.Audio(sources=["microphone"], type="filepath"),
                gr.Audio(sources=["upload"], type="filepath"),
            ],
            outputs="text",
        )
    return demo

def main():
    app = create_app()
    app.launch(debug=True)


if __name__ == "__main__":
    main()
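
# Usage sketch (assumes the dependencies are installed, e.g.
# `pip install msclap gradio`): save this script as, say, app.py and run
#   python app.py
# Gradio then prints a local URL where audio can be recorded or uploaded.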