Spaces:
Sleeping
Sleeping
| # import os | |
| # os.environ["KERAS_BACKEND"] = "jax" | |
| # os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' | |
| # import logging | |
| # from pathlib import Path | |
| # import numpy as np | |
| # import librosa | |
| # import tensorflow_hub as hub | |
| # from flask import Flask, render_template, request, jsonify, session | |
| # from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline | |
| # import keras | |
| # import torch | |
| # from werkzeug.utils import secure_filename | |
| # import traceback | |
| # # Configure logging | |
| # logging.basicConfig( | |
| # level=logging.INFO, | |
| # format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', | |
| # handlers=[ | |
| # logging.FileHandler('app.log'), | |
| # logging.StreamHandler() | |
| # ] | |
| # ) | |
| # logger = logging.getLogger(__name__) | |
| # # Environment setup | |
| # class AudioProcessor: | |
| # _instance = None | |
| # _initialized = False | |
| # def __new__(cls): | |
| # if cls._instance is None: | |
| # cls._instance = super(AudioProcessor, cls).__new__(cls) | |
| # return cls._instance | |
| # def __init__(self): | |
| # if not AudioProcessor._initialized: | |
| # self.device = "cuda:0" if torch.cuda.is_available() else "cpu" | |
| # self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 | |
| # self.initialize_models() | |
| # AudioProcessor._initialized = True | |
| # def initialize_models(self): | |
| # try: | |
| # logger.info("Initializing models...") | |
| # # Initialize transcription model | |
| # model_id = "distil-whisper/distil-large-v3" | |
| # self.transcription_model = AutoModelForSpeechSeq2Seq.from_pretrained( | |
| # model_id, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, use_safetensors=True | |
| # ) | |
| # self.transcription_model.to(self.device) | |
| # self.processor = AutoProcessor.from_pretrained(model_id) | |
| # # Initialize classification model | |
| # self.classification_model = keras.saving.load_model("hf://datasciencesage/attentionaudioclassification") | |
| # # Initialize pipeline | |
| # self.pipe = pipeline( | |
| # "automatic-speech-recognition", | |
| # model=self.transcription_model, | |
| # tokenizer=self.processor.tokenizer, | |
| # feature_extractor=self.processor.feature_extractor, | |
| # max_new_tokens=128, | |
| # chunk_length_s=25, | |
| # batch_size=16, | |
| # torch_dtype=self.torch_dtype, | |
| # device=self.device, | |
| # ) | |
| # # Initialize YAMNet model | |
| # self.yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1') | |
| # logger.info("Models initialized successfully") | |
| # except Exception as e: | |
| # logger.error(f"Error initializing models: {str(e)}") | |
| # raise | |
| # def load_wav_16k_mono(self, filename): | |
| # try: | |
| # wav, sr = librosa.load(filename, mono=True, sr=None) | |
| # if sr != 16000: | |
| # wav = librosa.resample(wav, orig_sr=sr, target_sr=16000) | |
| # return wav | |
| # except Exception as e: | |
| # logger.error(f"Error loading audio file: {str(e)}") | |
| # raise | |
| # def get_features_yamnet_extract_embedding(self, wav_data): | |
| # try: | |
| # scores, embeddings, spectrogram = self.yamnet_model(wav_data) | |
| # return np.mean(embeddings.numpy(), axis=0) | |
| # except Exception as e: | |
| # logger.error(f"Error extracting YAMNet embeddings: {str(e)}") | |
| # raise | |
| # # Initialize Flask application | |
| # app = Flask(__name__) | |
| # app.secret_key = 'your_secret_key_here' | |
| # app.config['UPLOAD_FOLDER'] = Path('uploads') | |
| # app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 | |
| # # Create upload folder | |
| # app.config['UPLOAD_FOLDER'].mkdir(exist_ok=True) | |
| # # Initialize audio processor (will only happen once) | |
| # audio_processor = AudioProcessor() | |
| # @app.route('/') | |
| # def index(): | |
| # session.clear() | |
| # return render_template('terminal.html') | |
| # @app.route('/process', methods=['POST']) | |
| # def process(): | |
| # try: | |
| # data = request.json | |
| # command = data.get('command', '').strip().lower() | |
| # if command in ['classify', 'transcribe']: | |
| # session['operation'] = command | |
| # return jsonify({ | |
| # 'result': f'root@math:~$ Upload a .mp3 file for {command} operation.', | |
| # 'upload': True | |
| # }) | |
| # else: | |
| # return jsonify({ | |
| # 'result': 'root@math:~$ Please specify an operation: "classify" or "transcribe".' | |
| # }) | |
| # except Exception as e: | |
| # logger.error(f"Error in process route: {str(e)}\n{traceback.format_exc()}") | |
| # session.pop('operation', None) | |
| # return jsonify({'result': f'root@math:~$ Error: {str(e)}'}) | |
| # @app.route('/upload', methods=['POST']) | |
| # def upload(): | |
| # filepath = None | |
| # try: | |
| # operation = session.get('operation') | |
| # if not operation: | |
| # return jsonify({ | |
| # 'result': 'root@math:~$ Please specify an operation first: "classify" or "transcribe".' | |
| # }) | |
| # if 'file' not in request.files: | |
| # return jsonify({'result': 'root@math:~$ No file uploaded.'}) | |
| # file = request.files['file'] | |
| # if file.filename == '' or not file.filename.lower().endswith('.mp3'): | |
| # return jsonify({'result': 'root@math:~$ Please upload a valid .mp3 file.'}) | |
| # filename = secure_filename(file.filename) | |
| # filepath = app.config['UPLOAD_FOLDER'] / filename | |
| # file.save(filepath) | |
| # wav_data = audio_processor.load_wav_16k_mono(filepath) | |
| # if operation == 'classify': | |
| # embeddings = audio_processor.get_features_yamnet_extract_embedding(wav_data) | |
| # embeddings = np.reshape(embeddings, (-1, 1024)) | |
| # result = np.argmax(audio_processor.classification_model.predict(embeddings)) | |
| # elif operation == 'transcribe': | |
| # result = audio_processor.pipe(str(filepath))['text'] | |
| # else: | |
| # result = 'Invalid operation' | |
| # return jsonify({ | |
| # 'result': f'root@math:~$ Result is: {result}\nroot@math:~$ Please specify an operation: "classify" or "transcribe".', | |
| # 'upload': False | |
| # }) | |
| # except Exception as e: | |
| # logger.error(f"Error in upload route: {str(e)}\n{traceback.format_exc()}") | |
| # return jsonify({ | |
| # 'result': f'root@math:~$ Error: {str(e)}\nroot@math:~$ Please specify an operation: "classify" or "transcribe".' | |
| # }) | |
| # finally: | |
| # session.pop('operation', None) | |
| # if filepath and Path(filepath).exists(): | |
| # try: | |
| # Path(filepath).unlink() | |
| # except Exception as e: | |
| # logger.error(f"Error deleting file {filepath}: {str(e)}") | |
| import os | |
| os.environ["KERAS_BACKEND"] = "jax" | |
| os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' | |
| import logging | |
| import numpy as np | |
| import librosa | |
| import tensorflow_hub as hub | |
| from flask import Flask, render_template, request, jsonify, session | |
| from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline | |
| import keras | |
| import torch | |
| import io | |
| import traceback | |
# Log to the terminal only: one stream handler on the root logger, no log file.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger used by the model wrapper and the request handlers below.
logger = logging.getLogger(__name__)
class AudioProcessor:
    """Singleton wrapper around the models used by the web app.

    Holds a speech-to-text pipeline, a Keras classification model and a
    YAMNet model. ``__new__`` always hands back the same instance and
    ``__init__`` loads the models only the first time it runs, so repeated
    ``AudioProcessor()`` calls do not reload anything.
    """

    # The single shared instance, created on first construction.
    _instance = None
    # Flipped to True once the models have been loaded successfully.
    _initialized = False

    def __new__(cls):
        """Return the shared instance, creating it on the first call."""
        if cls._instance is not None:
            return cls._instance
        cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        """Select device/dtype and load the models (first construction only)."""
        if AudioProcessor._initialized:
            return
        # Use the first GPU with half precision when CUDA is available,
        # otherwise fall back to CPU with full precision.
        has_cuda = torch.cuda.is_available()
        self.device = "cuda:0" if has_cuda else "cpu"
        self.torch_dtype = torch.float16 if has_cuda else torch.float32
        self.initialize_models()
        AudioProcessor._initialized = True

    def initialize_models(self):
        """Load every model and store it on the instance.

        Sets ``transcription_model``, ``processor``, ``classification_model``,
        ``pipe`` and ``yamnet_model``.

        Raises:
            Exception: any loading error is logged and re-raised unchanged.
        """
        try:
            logger.info("Initializing models...")

            # Speech-to-text model and its matching processor.
            model_id = "distil-whisper/distil-large-v3"
            load_options = {
                "torch_dtype": self.torch_dtype,
                "low_cpu_mem_usage": True,
                "use_safetensors": True,
            }
            self.transcription_model = AutoModelForSpeechSeq2Seq.from_pretrained(
                model_id, **load_options
            )
            self.transcription_model.to(self.device)
            self.processor = AutoProcessor.from_pretrained(model_id)

            # Classification model fetched through the "hf://" scheme.
            self.classification_model = keras.saving.load_model(
                "hf://datasciencesage/attentionaudioclassification"
            )

            # Speech-recognition pipeline built on the model loaded above.
            asr_options = {
                "model": self.transcription_model,
                "tokenizer": self.processor.tokenizer,
                "feature_extractor": self.processor.feature_extractor,
                "max_new_tokens": 128,
                "chunk_length_s": 25,
                "batch_size": 16,
                "torch_dtype": self.torch_dtype,
                "device": self.device,
            }
            self.pipe = pipeline("automatic-speech-recognition", **asr_options)

            # YAMNet model, used to turn a waveform into embeddings.
            self.yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1')
            logger.info("Models initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing models: {str(e)}")
            raise

    def load_wav_16k_mono(self, audio_data):
        """Decode in-memory audio bytes into a mono waveform at 16 kHz.

        Args:
            audio_data: raw bytes of an audio file (read from a buffer, not
                from disk).

        Returns:
            The decoded samples, resampled to 16000 Hz when the file's own
            rate differs.

        Raises:
            Exception: any decoding error is logged and re-raised unchanged.
        """
        try:
            # sr=None keeps the file's native rate so we only resample if needed.
            samples, native_rate = librosa.load(io.BytesIO(audio_data), mono=True, sr=None)
            if native_rate == 16000:
                return samples
            return librosa.resample(samples, orig_sr=native_rate, target_sr=16000)
        except Exception as e:
            logger.error(f"Error loading audio data: {str(e)}")
            raise

    def get_features_yamnet_extract_embedding(self, wav_data):
        """Run YAMNet on a waveform and average its embeddings over axis 0.

        Args:
            wav_data: waveform passed straight to the YAMNet model.

        Returns:
            The mean of the model's embedding output along the first axis
            (presumably one row per audio frame - confirm against YAMNet docs).

        Raises:
            Exception: any model error is logged and re-raised unchanged.
        """
        try:
            # The model returns three outputs; only the embeddings are used.
            _scores, frame_embeddings, _spectrogram = self.yamnet_model(wav_data)
            return np.mean(frame_embeddings.numpy(), axis=0)
        except Exception as e:
            logger.error(f"Error extracting YAMNet embeddings: {str(e)}")
            raise
# Initialize Flask application
app = Flask(__name__)
# The secret key signs the session cookie that carries the chosen operation
# between /process and /upload. Prefer a real secret from the environment;
# the historical placeholder is kept only as a fallback so existing
# deployments keep starting, and its use is logged loudly.
app.secret_key = os.environ.get('FLASK_SECRET_KEY', 'your_secret_key_here')
if app.secret_key == 'your_secret_key_here':
    logger.warning(
        "FLASK_SECRET_KEY is not set; using the insecure placeholder secret key"
    )
# Reject request bodies larger than 16 MiB.
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024
# Initialize audio processor (models are loaded once, at import time)
audio_processor = AudioProcessor()
# The route decorator was missing, so this view was never registered with
# Flask; restored to serve the site root.
@app.route('/')
def index():
    """Serve the terminal page, starting from a clean session.

    Returns:
        The rendered ``terminal.html`` template.
    """
    # Drop any operation left over from a previous visit.
    session.clear()
    return render_template('terminal.html')
# The route decorator was missing, so this view was never registered with
# Flask; restored as a POST-only endpoint.
@app.route('/process', methods=['POST'])
def process():
    """Handle a terminal command and remember the requested operation.

    Expects a JSON object with a ``command`` field. When the command is
    ``classify`` or ``transcribe`` it is stored in the session under
    ``operation`` and the client is told to upload a file (``upload: True``).
    Any other input, including a missing or malformed JSON body, gets the
    prompt asking for a valid operation.

    Returns:
        A JSON response with a ``result`` message and, on success, ``upload``.
    """
    try:
        # silent=True yields None instead of raising on a missing/invalid
        # JSON body, so bad requests get the normal prompt rather than an error.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        command = str(data.get('command') or '').strip().lower()
        if command in ('classify', 'transcribe'):
            session['operation'] = command
            return jsonify({
                'result': f'root@math:~$ Upload a .mp3 file for {command} operation.',
                'upload': True
            })
        return jsonify({
            'result': 'root@math:~$ Please specify an operation: "classify" or "transcribe".'
        })
    except Exception as e:
        logger.error(f"Error in process route: {str(e)}")
        # Forget any half-set operation so the next command starts clean.
        session.pop('operation', None)
        return jsonify({'result': f'root@math:~$ Error: {str(e)}'})
# The route decorator was missing, so this view was never registered with
# Flask; restored as a POST-only endpoint.
@app.route('/upload', methods=['POST'])
def upload():
    """Run the previously selected operation on an uploaded .mp3 file.

    Reads the operation stored in the session by ``process``, validates the
    uploaded file, decodes it in memory to a 16 kHz mono waveform and then
    either classifies it (YAMNet embedding -> classification model -> index
    of the highest score) or transcribes it with the speech pipeline.

    Returns:
        A JSON response whose ``result`` holds the outcome or an error
        message. The stored operation is always cleared afterwards, so the
        client must issue a new command before the next upload.
    """
    try:
        operation = session.get('operation')
        if not operation:
            return jsonify({
                'result': 'root@math:~$ Please specify an operation first: "classify" or "transcribe".'
            })
        if 'file' not in request.files:
            return jsonify({'result': 'root@math:~$ No file uploaded.'})
        file = request.files['file']
        if file.filename == '' or not file.filename.lower().endswith('.mp3'):
            return jsonify({'result': 'root@math:~$ Please upload a valid .mp3 file.'})

        # Read file content into memory; nothing is written to disk.
        audio_data = file.read()
        if not audio_data:
            # An empty body cannot be decoded; answer with the same
            # validation message instead of surfacing a decoder error.
            return jsonify({'result': 'root@math:~$ Please upload a valid .mp3 file.'})
        wav_data = audio_processor.load_wav_16k_mono(audio_data)

        if operation == 'classify':
            embeddings = audio_processor.get_features_yamnet_extract_embedding(wav_data)
            # The classifier is fed a batch of 1024-wide rows.
            embeddings = np.reshape(embeddings, (-1, 1024))
            scores = audio_processor.classification_model.predict(embeddings)
            # Plain int so the message shows a bare class index.
            result = int(np.argmax(scores))
        elif operation == 'transcribe':
            # The pipeline does not accept file-like objects (only paths,
            # bytes, arrays or dicts), so hand it the waveform that was
            # already decoded above together with its sampling rate.
            result = audio_processor.pipe(
                {"raw": wav_data, "sampling_rate": 16000}
            )['text']
        else:
            result = 'Invalid operation'

        return jsonify({
            'result': f'root@math:~$ Result is: {result}\nroot@math:~$ Please specify an operation: "classify" or "transcribe".',
            'upload': False
        })
    except Exception as e:
        logger.error(f"Error in upload route: {str(e)}")
        return jsonify({
            'result': f'root@math:~$ Error: {str(e)}\nroot@math:~$ Please specify an operation: "classify" or "transcribe".'
        })
    finally:
        # One upload per command: always reset the stored operation.
        session.pop('operation', None)
| # if __name__ == '__main__': | |
| # app.run(host='0.0.0.0', port=7860) |