import os
import re
import tempfile
from datetime import datetime, timezone

import asyncio
import threading

import gradio as gr
import torch
import pandas as pd
import soundfile as sf
import torchaudio
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
import torch.nn.functional as F
import uvicorn
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

from src.transcription import SpeechEncoder
from src.sentiment import TextEncoder
from src.multimodal import MultimodalSentimentClassifier

# Configuration for Hugging Face Spaces
HF_SPACE = os.getenv("HF_SPACE", "false").lower() == "true"

# Preload the models (shared between Gradio and the API)
print("Chargement des modèles...")

# Lighter-weight French model
processor_ctc = Wav2Vec2Processor.from_pretrained(
    "LeBenchmark/wav2vec2-FR-2K-small",
    cache_dir="./models" if not HF_SPACE else None
)
model_ctc = Wav2Vec2ForCTC.from_pretrained(
    "LeBenchmark/wav2vec2-FR-2K-small",
    cache_dir="./models" if not HF_SPACE else None
)
speech_enc = SpeechEncoder()
text_enc = TextEncoder()
# Multimodal classifier (fuses audio and text features); used by /api/predict
model_mm = MultimodalSentimentClassifier()
print("Modèles chargés avec succès!")


# ===== SHARED FUNCTIONS =====

def transcribe_ctc(wav_path: str) -> str:
    """Audio transcription with Wav2Vec2."""
    try:
        waveform, sr = torchaudio.load(wav_path)
        if sr != 16000:
            waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
        if waveform.size(0) > 1:
            # Downmix to mono
            waveform = waveform.mean(dim=0, keepdim=True)

        inputs = processor_ctc(
            waveform.squeeze().numpy(),
            sampling_rate=16000,
            return_tensors="pt",
            padding=True
        )
        with torch.no_grad():
            logits = model_ctc(**inputs).logits
        pred_ids = torch.argmax(logits, dim=-1)
        transcription = processor_ctc.batch_decode(pred_ids)[0].lower()
        return transcription
    except Exception as e:
        raise RuntimeError(f"Erreur transcription: {e}") from e


def analyze_audio(audio_path):
    """Audio analysis for the Gradio interface."""
    if audio_path is None:
        return "Aucun audio fourni", "", pd.DataFrame(), {}

    try:
        # Read and preprocess
        data, sr = sf.read(audio_path)
        arr = data.T if data.ndim > 1 else data  # (channels, samples) if multi-channel
        wav = torch.from_numpy(arr).float()
        if wav.dim() == 1:
            wav = wav.unsqueeze(0)
        if sr != 16000:
            wav = torchaudio.transforms.Resample(sr, 16000)(wav)
            sr = 16000
        if wav.size(0) > 1:
            # Downmix to mono
            wav = wav.mean(dim=0, keepdim=True)

        # Transcription
        inputs = processor_ctc(wav.squeeze().numpy(), sampling_rate=sr, return_tensors="pt")
        with torch.no_grad():
            logits = model_ctc(**inputs).logits
        pred_ids = torch.argmax(logits, dim=-1)
        transcription = processor_ctc.batch_decode(pred_ids)[0].lower()

        # Overall sentiment
        sent_dict = TextEncoder.analyze_sentiment(transcription)
        label, conf = max(sent_dict.items(), key=lambda x: x[1])
        emojis = {"positif": "😊", "neutre": "😐", "négatif": "☹️"}
        emoji = emojis.get(label, "")

        # Per-sentence segmentation
        segments = [s.strip() for s in re.split(r'[.?!]', transcription) if s.strip()]
        seg_results = []
        for seg in segments:
            sd = TextEncoder.analyze_sentiment(seg)
            l, c = max(sd.items(), key=lambda x: x[1])
            seg_results.append({
                "Segment": seg,
                "Sentiment": l.capitalize(),
                "Confiance (%)": round(c * 100, 1)
            })
        seg_df = pd.DataFrame(seg_results)

        # History entry
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        history_entry = {
            "Horodatage": timestamp,
            "Transcription": transcription,
            "Sentiment": label.capitalize(),
            "Confiance (%)": round(conf * 100, 1)
        }
" f"{emoji}" f"

{label.upper()}

" f"
" f"

Confiance : {conf*100:.1f}%

" ) return transcription, summary_html, seg_df, history_entry except Exception as e: error_msg = f"Erreur lors de l'analyse: {str(e)}" return error_msg, "", pd.DataFrame(), {} # ===== API FASTAPI ===== app = FastAPI( title="API Multimodale de Transcription & Sentiment", description="API pour l'analyse de sentiment audio en français", version="1.0", docs_url="/api/docs", redoc_url="/api/redoc" ) # Configuration CORS app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"], ) @app.get("/api/") async def root(): """Endpoint racine avec informations sur l'API""" return { "message": "API Multimodale de Transcription & Sentiment", "version": "1.0", "endpoints": { "docs": "/api/docs", "predict": "/api/predict", "health": "/api/health" }, "supported_formats": ["wav", "flac", "mp3"] } @app.get("/api/health") async def health_check(): """Vérification de l'état de l'API""" return { "status": "healthy", "models_loaded": True, "timestamp": "2024-01-01T00:00:00Z" } @app.post("/api/predict") async def predict(file: UploadFile = File(...)): """Analyse de sentiment audio""" # 1. Vérifier le type de fichier if not file.filename or not file.filename.lower().endswith((".wav", ".flac", ".mp3")): raise HTTPException( status_code=400, detail="Seuls les fichiers audio WAV/FLAC/MP3 sont acceptés." ) # 2. Vérifier la taille du fichier (max 50MB) content = await file.read() if len(content) > 50 * 1024 * 1024: # 50MB raise HTTPException( status_code=400, detail="Fichier trop volumineux. Taille maximale: 50MB" ) # 3. Sauvegarder temporairement import tempfile suffix = os.path.splitext(file.filename)[1] with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp: tmp.write(content) tmp_path = tmp.name try: # 4. Transcription transcription = transcribe_ctc(tmp_path) if not transcription.strip(): return JSONResponse({ "transcription": "", "sentiment": {"négatif": 0.33, "neutre": 0.34, "positif": 0.33}, "warning": "Aucune transcription détectée" }) # 5. Features multimodales try: audio_feat = speech_enc.extract_features(tmp_path) text_feat = text_enc.extract_features([transcription]) # 6. 
            # 6. Classification
            logits = model_mm.classifier(torch.cat([audio_feat, text_feat], dim=1))
            probs = F.softmax(logits, dim=1).squeeze().tolist()
            labels = ["négatif", "neutre", "positif"]
            sentiment = {labels[i]: round(probs[i], 3) for i in range(len(labels))}
        except Exception as e:
            # Fall back to text-only analysis
            print(f"Erreur multimodal, fallback textuel: {e}")
            sent_dict = TextEncoder.analyze_sentiment(transcription)
            sentiment = {k: round(v, 3) for k, v in sent_dict.items()}

        return JSONResponse({
            "transcription": transcription,
            "sentiment": sentiment,
            "filename": file.filename,
            "file_size": len(content)
        })

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur lors de l'analyse: {str(e)}")
    finally:
        # Clean up the temporary file
        try:
            os.remove(tmp_path)
        except OSError:
            pass


@app.post("/api/predict_text")
async def predict_text(text: str):
    """Text-only sentiment analysis."""
    try:
        sent_dict = TextEncoder.analyze_sentiment(text)
        sentiment = {k: round(v, 3) for k, v in sent_dict.items()}
        return JSONResponse({
            "text": text,
            "sentiment": sentiment
        })
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur analyse textuelle: {str(e)}")


# ===== GRADIO INTERFACE =====

def export_history_csv(history):
    """Export the analysis history to a CSV file."""
    if not history:
        return None
    df = pd.DataFrame(history)
    path = "history.csv"
    df.to_csv(path, index=False)
    return path


# Gradio interface
demo = gr.Blocks(
    theme=gr.themes.Monochrome(primary_hue="purple"),
    title="Analyse de Sentiment Audio - Hugging Face Space"
)

with demo:
    gr.Markdown("""
    # 🎤 Analyse de Sentiment Audio

    Ce Space permet d'analyser le sentiment d'extraits audio en français en combinant :
    - **Transcription audio** avec Wav2Vec2
    - **Analyse de sentiment** avec BERT multilingue
    - **API REST** pour intégration
    """)

    gr.HTML("""
    <p><b>Étape 1 :</b> Enregistrez votre voix ou téléversez un fichier audio (format WAV recommandé).</p>
    <p><b>Étape 2 :</b> Cliquez sur le bouton Analyser pour lancer la transcription et l'analyse.</p>
    <p><b>Étape 3 :</b> Visualisez les résultats : transcription, sentiment, et analyse détaillée.</p>
    <p><b>Étape 4 :</b> Exportez l'historique des analyses au format CSV si besoin.</p>
""") # Section API with gr.Accordion("🔌 API REST", open=False): gr.Markdown(""" ### Endpoints disponibles : - **`/api/predict`** - Analyse audio (POST) - **`/api/predict_text`** - Analyse textuelle (POST) - **`/api/health`** - Vérification état (GET) - **`/api/docs`** - Documentation Swagger ### Exemple d'utilisation : ```bash curl -X POST "https://huggingface.co/spaces//sentiment-audio-analyzer/api/predict" \ -F "file=@audio.wav" ``` """) with gr.Row(): with gr.Column(scale=2): audio_in = gr.Audio( sources=["microphone", "upload"], type="filepath", label="Audio Input" ) btn = gr.Button("🔍 Analyser", variant="primary") export_btn = gr.Button("📊 Exporter CSV") with gr.Column(scale=3): chat = gr.Chatbot(label="Historique des échanges") transcription_out = gr.Textbox(label="Transcription", interactive=False) summary_out = gr.HTML(label="Sentiment") seg_out = gr.Dataframe(label="Détail par segment") hist_out = gr.Dataframe(label="Historique") state_chat = gr.State([]) state_hist = gr.State([]) def chat_callback(audio_path, chat_history, hist_state): transcription, summary, seg_df, hist_entry = analyze_audio(audio_path) user_msg = "[Audio reçu]" bot_msg = f"**Transcription :** {transcription}\n**Sentiment :** {summary}" chat_history = chat_history + [(user_msg, bot_msg)] if hist_entry: hist_state = hist_state + [hist_entry] return chat_history, transcription, summary, seg_df, hist_state btn.click( fn=chat_callback, inputs=[audio_in, state_chat, state_hist], outputs=[chat, transcription_out, summary_out, seg_out, state_hist] ) export_btn.click( fn=export_history_csv, inputs=[state_hist], outputs=[gr.File(label="Télécharger CSV")] ) # ===== INTÉGRATION GRADIO + FASTAPI ===== # Monter l'API FastAPI dans Gradio app = gr.mount_gradio_app(app, demo, path="/") # Configuration pour Hugging Face Spaces if __name__ == "__main__": uvicorn.run( app, host="0.0.0.0" if HF_SPACE else "127.0.0.1", port=7860, log_level="info" )