Spaces:

datasciencesage
/

AudioModels

Sleeping

App Files Files Community

AudioModels / app.py

datasciencesage

app.py

8fa04cd 10 months ago

raw

history blame contribute delete

13.4 kB

	# import os
	# os.environ["KERAS_BACKEND"] = "jax"
	# os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
	# import logging
	# from pathlib import Path
	# import numpy as np
	# import librosa
	# import tensorflow_hub as hub
	# from flask import Flask, render_template, request, jsonify, session
	# from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
	# import keras
	# import torch
	# from werkzeug.utils import secure_filename
	# import traceback

	# # Configure logging
	# logging.basicConfig(
	# level=logging.INFO,
	# format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
	# handlers=[
	# logging.FileHandler('app.log'),
	# logging.StreamHandler()
	# ]
	# )
	# logger = logging.getLogger(__name__)

	# # Environment setup


	# class AudioProcessor:
	# _instance = None
	# _initialized = False

	# def __new__(cls):
	# if cls._instance is None:
	# cls._instance = super(AudioProcessor, cls).__new__(cls)
	# return cls._instance

	# def __init__(self):
	# if not AudioProcessor._initialized:
	# self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
	# self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
	# self.initialize_models()
	# AudioProcessor._initialized = True

	# def initialize_models(self):
	# try:
	# logger.info("Initializing models...")
	# # Initialize transcription model
	# model_id = "distil-whisper/distil-large-v3"
	# self.transcription_model = AutoModelForSpeechSeq2Seq.from_pretrained(
	# model_id, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
	# )
	# self.transcription_model.to(self.device)
	# self.processor = AutoProcessor.from_pretrained(model_id)

	# # Initialize classification model
	# self.classification_model = keras.saving.load_model("hf://datasciencesage/attentionaudioclassification")

	# # Initialize pipeline
	# self.pipe = pipeline(
	# "automatic-speech-recognition",
	# model=self.transcription_model,
	# tokenizer=self.processor.tokenizer,
	# feature_extractor=self.processor.feature_extractor,
	# max_new_tokens=128,
	# chunk_length_s=25,
	# batch_size=16,
	# torch_dtype=self.torch_dtype,
	# device=self.device,
	# )

	# # Initialize YAMNet model
	# self.yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1')

	# logger.info("Models initialized successfully")
	# except Exception as e:
	# logger.error(f"Error initializing models: {str(e)}")
	# raise

	# def load_wav_16k_mono(self, filename):
	# try:
	# wav, sr = librosa.load(filename, mono=True, sr=None)
	# if sr != 16000:
	# wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)
	# return wav
	# except Exception as e:
	# logger.error(f"Error loading audio file: {str(e)}")
	# raise

	# def get_features_yamnet_extract_embedding(self, wav_data):
	# try:
	# scores, embeddings, spectrogram = self.yamnet_model(wav_data)
	# return np.mean(embeddings.numpy(), axis=0)
	# except Exception as e:
	# logger.error(f"Error extracting YAMNet embeddings: {str(e)}")
	# raise

	# # Initialize Flask application
	# app = Flask(__name__)
	# app.secret_key = 'your_secret_key_here'
	# app.config['UPLOAD_FOLDER'] = Path('uploads')
	# app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024

	# # Create upload folder
	# app.config['UPLOAD_FOLDER'].mkdir(exist_ok=True)

	# # Initialize audio processor (will only happen once)
	# audio_processor = AudioProcessor()

	# @app.route('/')
	# def index():
	# session.clear()
	# return render_template('terminal.html')

	# @app.route('/process', methods=['POST'])
	# def process():
	# try:
	# data = request.json
	# command = data.get('command', '').strip().lower()

	# if command in ['classify', 'transcribe']:
	# session['operation'] = command
	# return jsonify({
	# 'result': f'root@math:~$ Upload a .mp3 file for {command} operation.',
	# 'upload': True
	# })
	# else:
	# return jsonify({
	# 'result': 'root@math:~$ Please specify an operation: "classify" or "transcribe".'
	# })
	# except Exception as e:
	# logger.error(f"Error in process route: {str(e)}\n{traceback.format_exc()}")
	# session.pop('operation', None)
	# return jsonify({'result': f'root@math:~$ Error: {str(e)}'})

	# @app.route('/upload', methods=['POST'])
	# def upload():
	# filepath = None
	# try:
	# operation = session.get('operation')
	# if not operation:
	# return jsonify({
	# 'result': 'root@math:~$ Please specify an operation first: "classify" or "transcribe".'
	# })

	# if 'file' not in request.files:
	# return jsonify({'result': 'root@math:~$ No file uploaded.'})

	# file = request.files['file']
	# if file.filename == '' or not file.filename.lower().endswith('.mp3'):
	# return jsonify({'result': 'root@math:~$ Please upload a valid .mp3 file.'})

	# filename = secure_filename(file.filename)
	# filepath = app.config['UPLOAD_FOLDER'] / filename

	# file.save(filepath)
	# wav_data = audio_processor.load_wav_16k_mono(filepath)

	# if operation == 'classify':
	# embeddings = audio_processor.get_features_yamnet_extract_embedding(wav_data)
	# embeddings = np.reshape(embeddings, (-1, 1024))
	# result = np.argmax(audio_processor.classification_model.predict(embeddings))
	# elif operation == 'transcribe':
	# result = audio_processor.pipe(str(filepath))['text']
	# else:
	# result = 'Invalid operation'

	# return jsonify({
	# 'result': f'root@math:~$ Result is: {result}\nroot@math:~$ Please specify an operation: "classify" or "transcribe".',
	# 'upload': False
	# })

	# except Exception as e:
	# logger.error(f"Error in upload route: {str(e)}\n{traceback.format_exc()}")
	# return jsonify({
	# 'result': f'root@math:~$ Error: {str(e)}\nroot@math:~$ Please specify an operation: "classify" or "transcribe".'
	# })
	# finally:
	# session.pop('operation', None)
	# if filepath and Path(filepath).exists():
	# try:
	# Path(filepath).unlink()
	# except Exception as e:
	# logger.error(f"Error deleting file {filepath}: {str(e)}")

	import os
	os.environ["KERAS_BACKEND"] = "jax"
	os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
	import logging
	import numpy as np
	import librosa
	import tensorflow_hub as hub
	from flask import Flask, render_template, request, jsonify, session
	from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
	import keras
	import torch
	import io
	import traceback

	# Route every log record to the console through a bare StreamHandler;
	# no log file is created by this configuration.
	logging.basicConfig(
	    handlers=[logging.StreamHandler()],
	    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
	    level=logging.INFO,
	)
	# Module-level logger used by the model wrapper and the Flask routes in this file.
	logger = logging.getLogger(__name__)

	class AudioProcessor:
	    """Singleton holder for the transcription, classification and YAMNet models.

	    ``__new__`` always hands back the same instance, and ``__init__`` loads the
	    models only until one load has succeeded, so constructing the class
	    repeatedly does not reload anything.

	    Attributes set on first successful initialization:
	        device: ``"cuda:0"`` when CUDA is available, otherwise ``"cpu"``.
	        torch_dtype: ``torch.float16`` on CUDA, ``torch.float32`` on CPU.
	        transcription_model / processor / pipe: the distil-whisper
	            speech-to-text model, its processor and the ASR pipeline built
	            from them.
	        classification_model: Keras model loaded from the Hugging Face Hub.
	        yamnet_model: YAMNet loaded from TensorFlow Hub.
	    """

	    _instance = None        # the single shared instance, created lazily
	    _initialized = False    # flips to True after the models loaded successfully

	    def __new__(cls):
	        """Return the shared instance, creating it on first use."""
	        if cls._instance is None:
	            cls._instance = super().__new__(cls)
	        return cls._instance

	    def __init__(self):
	        """Pick device/dtype and load the models, unless already done."""
	        if AudioProcessor._initialized:
	            return
	        use_cuda = torch.cuda.is_available()
	        self.device = "cuda:0" if use_cuda else "cpu"
	        self.torch_dtype = torch.float16 if use_cuda else torch.float32
	        self.initialize_models()
	        # Only mark as initialized once loading succeeded, so a failed load
	        # is retried the next time the class is constructed.
	        AudioProcessor._initialized = True

	    def initialize_models(self):
	        """Load every model used by the app and build the ASR pipeline.

	        Raises:
	            Exception: whatever the underlying loaders raise; the error is
	                logged with its traceback and re-raised unchanged.
	        """
	        try:
	            logger.info("Initializing models...")
	            # Initialize transcription model
	            model_id = "distil-whisper/distil-large-v3"
	            self.transcription_model = AutoModelForSpeechSeq2Seq.from_pretrained(
	                model_id,
	                torch_dtype=self.torch_dtype,
	                low_cpu_mem_usage=True,
	                use_safetensors=True,
	            )
	            self.transcription_model.to(self.device)
	            self.processor = AutoProcessor.from_pretrained(model_id)

	            # Initialize classification model
	            self.classification_model = keras.saving.load_model(
	                "hf://datasciencesage/attentionaudioclassification"
	            )

	            # Initialize pipeline
	            self.pipe = pipeline(
	                "automatic-speech-recognition",
	                model=self.transcription_model,
	                tokenizer=self.processor.tokenizer,
	                feature_extractor=self.processor.feature_extractor,
	                max_new_tokens=128,
	                chunk_length_s=25,
	                batch_size=16,
	                torch_dtype=self.torch_dtype,
	                device=self.device,
	            )

	            # Initialize YAMNet model
	            self.yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1')

	            logger.info("Models initialized successfully")
	        except Exception as e:
	            # logger.exception records the traceback alongside the message.
	            logger.exception("Error initializing models: %s", e)
	            raise

	    def load_wav_16k_mono(self, audio_data):
	        """Decode in-memory audio bytes to a mono waveform sampled at 16 kHz.

	        Args:
	            audio_data: raw bytes of an encoded audio file.

	        Returns:
	            The waveform array produced by librosa, resampled to 16000 Hz
	            when the file's native rate differs.

	        Raises:
	            Exception: any decoding/resampling error, logged and re-raised.
	        """
	        try:
	            # Load audio from bytes buffer instead of file
	            wav, sr = librosa.load(io.BytesIO(audio_data), mono=True, sr=None)
	            if sr != 16000:
	                wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)
	            return wav
	        except Exception as e:
	            logger.exception("Error loading audio data: %s", e)
	            raise

	    def get_features_yamnet_extract_embedding(self, wav_data):
	        """Run YAMNet on a waveform and average its embeddings over axis 0.

	        Args:
	            wav_data: waveform passed straight to the YAMNet model.

	        Returns:
	            The mean of the embeddings array along its first axis.

	        Raises:
	            Exception: any model error, logged and re-raised.
	        """
	        try:
	            # YAMNet returns (scores, embeddings, spectrogram); only the
	            # embeddings are used here.
	            _scores, embeddings, _spectrogram = self.yamnet_model(wav_data)
	            return np.mean(embeddings.numpy(), axis=0)
	        except Exception as e:
	            logger.exception("Error extracting YAMNet embeddings: %s", e)
	            raise

	# Initialize Flask application
	app = Flask(__name__)

	# Session cookies are signed with this key. Prefer a private value supplied via
	# the FLASK_SECRET_KEY environment variable; the historical placeholder is kept
	# only as a fallback so existing deployments keep working, but because it is
	# public anyone can forge a session cookie while it is in use.
	_PLACEHOLDER_SECRET_KEY = 'your_secret_key_here'
	app.secret_key = os.environ.get('FLASK_SECRET_KEY') or _PLACEHOLDER_SECRET_KEY
	if app.secret_key == _PLACEHOLDER_SECRET_KEY:
	    logger.warning(
	        "FLASK_SECRET_KEY is not set; falling back to the insecure placeholder secret key"
	    )

	# Cap request bodies (and therefore uploads) at 16 MiB.
	app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024

	# Initialize audio processor (will only happen once)
	audio_processor = AudioProcessor()

	@app.route('/')
	def index():
	    """Serve the terminal page after wiping the visitor's session."""
	    # Drop anything stored earlier (e.g. a previously selected operation).
	    session.clear()
	    page = render_template('terminal.html')
	    return page

	@app.route('/process', methods=['POST'])
	def process():
	    """Handle a terminal command and remember the chosen operation.

	    Expects a JSON body of the form ``{"command": "<text>"}``. When the
	    command (trimmed, case-insensitive) is ``classify`` or ``transcribe`` it
	    is stored in the session under ``'operation'`` and the client is told to
	    upload a file (``'upload': True``). Anything else, including a missing,
	    malformed or non-object body, gets the prompt asking for an operation.

	    Returns:
	        A JSON response with a ``'result'`` message and, on success, an
	        ``'upload'`` flag.
	    """
	    try:
	        # silent=True returns None instead of raising on a wrong mimetype or
	        # unparsable body, so bad requests fall through to the normal prompt.
	        payload = request.get_json(silent=True)
	        raw_command = payload.get('command', '') if isinstance(payload, dict) else ''
	        # Guard against non-string values such as numbers or null.
	        command = raw_command.strip().lower() if isinstance(raw_command, str) else ''

	        if command in ('classify', 'transcribe'):
	            session['operation'] = command
	            return jsonify({
	                'result': f'root@math:~$ Upload a .mp3 file for {command} operation.',
	                'upload': True
	            })

	        return jsonify({
	            'result': 'root@math:~$ Please specify an operation: "classify" or "transcribe".'
	        })
	    except Exception as e:
	        # Route-level boundary: log with traceback, reset state, report to the UI.
	        logger.exception("Error in process route: %s", e)
	        session.pop('operation', None)
	        return jsonify({'result': f'root@math:~$ Error: {str(e)}'})

	@app.route('/upload', methods=['POST'])
	def upload():
	    """Run the previously selected operation on an uploaded ``.mp3`` file.

	    The operation is read from the session (set by the ``/process`` route).
	    The uploaded file is read fully into memory, decoded to a 16 kHz mono
	    waveform, and then either classified (YAMNet embedding -> classifier ->
	    index of the largest output) or transcribed with the ASR pipeline.

	    The stored operation is always cleared afterwards, whether the request
	    succeeded, was rejected, or failed, so the client must pick an operation
	    again before the next upload.

	    Returns:
	        A JSON response whose ``'result'`` holds the outcome or an error
	        message; successful runs also carry ``'upload': False``.
	    """
	    try:
	        operation = session.get('operation')
	        if not operation:
	            return jsonify({
	                'result': 'root@math:~$ Please specify an operation first: "classify" or "transcribe".'
	            })

	        if 'file' not in request.files:
	            return jsonify({'result': 'root@math:~$ No file uploaded.'})

	        uploaded = request.files['file']
	        # The filename can be empty or None when the form part carries no name.
	        if not uploaded.filename or not uploaded.filename.lower().endswith('.mp3'):
	            return jsonify({'result': 'root@math:~$ Please upload a valid .mp3 file.'})

	        # Read file content into memory
	        audio_data = uploaded.read()
	        if not audio_data:
	            # A zero-byte upload cannot be decoded; reject it up front.
	            return jsonify({'result': 'root@math:~$ Please upload a valid .mp3 file.'})
	        wav_data = audio_processor.load_wav_16k_mono(audio_data)

	        if operation == 'classify':
	            embeddings = audio_processor.get_features_yamnet_extract_embedding(wav_data)
	            embeddings = np.reshape(embeddings, (-1, 1024))
	            result = np.argmax(audio_processor.classification_model.predict(embeddings))
	        elif operation == 'transcribe':
	            # The ASR pipeline does not accept file-like objects (only a
	            # path/URL string, raw bytes, an ndarray, or a dict), so hand it
	            # the waveform that was already decoded and resampled to 16 kHz.
	            result = audio_processor.pipe(
	                {'raw': wav_data, 'sampling_rate': 16000}
	            )['text']
	        else:
	            result = 'Invalid operation'

	        return jsonify({
	            'result': f'root@math:~$ Result is: {result}\nroot@math:~$ Please specify an operation: "classify" or "transcribe".',
	            'upload': False
	        })

	    except Exception as e:
	        # Route-level boundary: log with traceback and report to the UI.
	        logger.exception("Error in upload route: %s", e)
	        return jsonify({
	            'result': f'root@math:~$ Error: {str(e)}\nroot@math:~$ Please specify an operation: "classify" or "transcribe".'
	        })
	    finally:
	        # One upload per selected operation: always forget the choice.
	        session.pop('operation', None)

	# if __name__ == '__main__':
	# app.run(host='0.0.0.0', port=7860)