import os
import re
import tempfile
from datetime import datetime
import asyncio
import threading

import gradio as gr
import torch
import pandas as pd
import soundfile as sf
import torchaudio
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
import torch.nn.functional as F
import uvicorn
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

from src.transcription import SpeechEncoder
from src.sentiment import TextEncoder
from src.multimodal import MultimodalSentimentClassifier
# Configuration for Hugging Face Spaces
HF_SPACE = os.getenv("HF_SPACE", "false").lower() == "true"

# Preload the models (shared between Gradio and the API)
print("Loading models...")

# Lighter French model
processor_ctc = Wav2Vec2Processor.from_pretrained(
    "LeBenchmark/wav2vec2-FR-2K-small",
    cache_dir="./models" if not HF_SPACE else None
)
model_ctc = Wav2Vec2ForCTC.from_pretrained(
    "LeBenchmark/wav2vec2-FR-2K-small",
    cache_dir="./models" if not HF_SPACE else None
)
speech_enc = SpeechEncoder()
text_enc = TextEncoder()
# Multimodal classifier used by /api/predict (a no-argument constructor is assumed here)
model_mm = MultimodalSentimentClassifier()
print("Models loaded successfully!")
# ===== SHARED FUNCTIONS =====
def transcribe_ctc(wav_path: str) -> str:
    """Audio transcription with Wav2Vec2"""
    try:
        waveform, sr = torchaudio.load(wav_path)
        if sr != 16000:
            waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
        if waveform.size(0) > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        inputs = processor_ctc(
            waveform.squeeze().numpy(),
            sampling_rate=16000,
            return_tensors="pt",
            padding=True
        )
        with torch.no_grad():
            logits = model_ctc(**inputs).logits
        pred_ids = torch.argmax(logits, dim=-1)
        transcription = processor_ctc.batch_decode(pred_ids)[0].lower()
        return transcription
    except Exception as e:
        raise RuntimeError(f"Transcription error: {e}") from e
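
# Example use of the helper above (hypothetical file path, illustration only):
#   text = transcribe_ctc("recording.wav")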
def analyze_audio(audio_path):
    """Audio analysis for the Gradio UI"""
    if audio_path is None:
        return "No audio provided", "", pd.DataFrame(), {}
    try:
        # Read and preprocess
        data, sr = sf.read(audio_path)
        arr = data.T if data.ndim > 1 else data  # soundfile returns (samples, channels)
        wav = torch.from_numpy(arr).float()
        if wav.dim() == 1:
            wav = wav.unsqueeze(0)  # (1, samples)
        if sr != 16000:
            wav = torchaudio.transforms.Resample(sr, 16000)(wav)
            sr = 16000
        if wav.size(0) > 1:
            wav = wav.mean(dim=0, keepdim=True)  # downmix to mono
        # Transcription
        inputs = processor_ctc(wav.squeeze().numpy(), sampling_rate=sr, return_tensors="pt")
        with torch.no_grad():
            logits = model_ctc(**inputs).logits
        pred_ids = torch.argmax(logits, dim=-1)
        transcription = processor_ctc.batch_decode(pred_ids)[0].lower()
        # Overall sentiment
        sent_dict = text_enc.analyze_sentiment(transcription)
        label, conf = max(sent_dict.items(), key=lambda x: x[1])
        emojis = {"positif": "😊", "neutre": "😐", "négatif": "☹️"}
        emoji = emojis.get(label, "")

        # Per-sentence segmentation
        segments = [s.strip() for s in re.split(r'[.?!]', transcription) if s.strip()]
        seg_results = []
        for seg in segments:
            sd = text_enc.analyze_sentiment(seg)
            l, c = max(sd.items(), key=lambda x: x[1])
            seg_results.append({"Segment": seg, "Sentiment": l.capitalize(), "Confidence (%)": round(c * 100, 1)})
        seg_df = pd.DataFrame(seg_results)

        # History entry
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        history_entry = {
            "Timestamp": timestamp,
            "Transcription": transcription,
            "Sentiment": label.capitalize(),
            "Confidence (%)": round(conf * 100, 1)
        }
        # Render the summary card
        summary_html = (
            f"<div style='display:flex;align-items:center;'>"
            f"<span style='font-size:3rem;margin-right:10px;'>{emoji}</span>"
            f"<h2 style='color:#6a0dad;'>{label.upper()}</h2>"
            f"</div>"
            f"<p><strong>Confidence:</strong> {conf * 100:.1f}%</p>"
        )
        return transcription, summary_html, seg_df, history_entry
    except Exception as e:
        error_msg = f"Analysis error: {e}"
        return error_msg, "", pd.DataFrame(), {}
# ===== FASTAPI API =====
app = FastAPI(
    title="Multimodal Transcription & Sentiment API",
    description="API for French audio sentiment analysis",
    version="1.0",
    docs_url="/api/docs",
    redoc_url="/api/redoc"
)

# CORS configuration
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
@app.get("/api")  # route path assumed; keeps "/" free for the Gradio UI mounted below
async def root():
    """Root endpoint with API information"""
    return {
        "message": "Multimodal Transcription & Sentiment API",
        "version": "1.0",
        "endpoints": {
            "docs": "/api/docs",
            "predict": "/api/predict",
            "health": "/api/health"
        },
        "supported_formats": ["wav", "flac", "mp3"]
    }
@app.get("/api/health")
async def health_check():
    """API health check"""
    return {
        "status": "healthy",
        "models_loaded": True,
        "timestamp": datetime.utcnow().isoformat() + "Z"
    }
@app.post("/api/predict")
async def predict(file: UploadFile = File(...)):
    """Audio sentiment analysis"""
    # 1. Check the file type
    if not file.filename or not file.filename.lower().endswith((".wav", ".flac", ".mp3")):
        raise HTTPException(
            status_code=400,
            detail="Only WAV/FLAC/MP3 audio files are accepted."
        )

    # 2. Check the file size (max 50MB)
    content = await file.read()
    if len(content) > 50 * 1024 * 1024:  # 50MB
        raise HTTPException(
            status_code=400,
            detail="File too large. Maximum size: 50MB"
        )

    # 3. Save to a temporary file
    suffix = os.path.splitext(file.filename)[1]
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        tmp.write(content)
        tmp_path = tmp.name

    try:
        # 4. Transcription
        transcription = transcribe_ctc(tmp_path)
        if not transcription.strip():
            return JSONResponse({
                "transcription": "",
                "sentiment": {"négatif": 0.33, "neutre": 0.34, "positif": 0.33},
                "warning": "No transcription detected"
            })

        # 5. Multimodal features
        try:
            audio_feat = speech_enc.extract_features(tmp_path)
            text_feat = text_enc.extract_features([transcription])
            # 6. Classification
            logits = model_mm.classifier(torch.cat([audio_feat, text_feat], dim=1))
            probs = F.softmax(logits, dim=1).squeeze().tolist()
            labels = ["négatif", "neutre", "positif"]
            sentiment = {labels[i]: round(probs[i], 3) for i in range(len(labels))}
        except Exception as e:
            # Fall back to text-only analysis
            print(f"Multimodal error, falling back to text-only analysis: {e}")
            sent_dict = text_enc.analyze_sentiment(transcription)
            sentiment = {k: round(v, 3) for k, v in sent_dict.items()}

        return JSONResponse({
            "transcription": transcription,
            "sentiment": sentiment,
            "filename": file.filename,
            "file_size": len(content)
        })
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Analysis error: {e}")
    finally:
        # Clean up the temporary file
        try:
            os.remove(tmp_path)
        except OSError:
            pass
@app.post("/api/predict_text")
async def predict_text(text: str):
    """Text-only sentiment analysis"""
    try:
        sent_dict = text_enc.analyze_sentiment(text)
        sentiment = {k: round(v, 3) for k, v in sent_dict.items()}
        return JSONResponse({
            "text": text,
            "sentiment": sentiment
        })
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Text analysis error: {e}")
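
# Example call to the endpoint above (a sketch; `text` is read as a query parameter,
# and the host/port assume a local run on 7860):
#   curl -X POST "http://127.0.0.1:7860/api/predict_text?text=bonjour"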
# ===== GRADIO INTERFACE =====
def export_history_csv(history):
    if not history:
        return None
    df = pd.DataFrame(history)
    path = "history.csv"
    df.to_csv(path, index=False)
    return path
# Gradio interface
demo = gr.Blocks(
    theme=gr.themes.Monochrome(primary_hue="purple"),
    title="Audio Sentiment Analysis - Hugging Face Space"
)
with demo:
    gr.Markdown("""
# 🎤 Audio Sentiment Analysis

This Space analyzes the sentiment of French audio clips by combining:
- **Audio transcription** with Wav2Vec2
- **Sentiment analysis** with multilingual BERT
- **REST API** for integration
""")
gr.HTML(""" | |
<div style="display: flex; flex-direction: column; gap: 10px; margin-bottom: 20px;"> | |
<div style="background-color: #f3e8ff; padding: 12px 20px; border-radius: 12px; border-left: 5px solid #8e44ad;"> | |
<strong>Étape 1 :</strong> Enregistrez votre voix ou téléversez un fichier audio (format WAV recommandé). | |
</div> | |
<div style="background-color: #e0f7fa; padding: 12px 20px; border-radius: 12px; border-left: 5px solid #0097a7;"> | |
<strong>Étape 2 :</strong> Cliquez sur le bouton <em><b>Analyser</b></em> pour lancer la transcription et l'analyse. | |
</div> | |
<div style="background-color: #fff3e0; padding: 12px 20px; border-radius: 12px; border-left: 5px solid #fb8c00;"> | |
<strong>Étape 3 :</strong> Visualisez les résultats : transcription, sentiment, et analyse détaillée. | |
</div> | |
<div style="background-color: #e8f5e9; padding: 12px 20px; border-radius: 12px; border-left: 5px solid #43a047;"> | |
<strong>Étape 4 :</strong> Exportez l'historique des analyses au format CSV si besoin. | |
</div> | |
</div> | |
""") | |
    # API section
    with gr.Accordion("🔌 REST API", open=False):
        gr.Markdown("""
### Available endpoints:
- **`/api/predict`** - Audio analysis (POST)
- **`/api/predict_text`** - Text analysis (POST)
- **`/api/health`** - Health check (GET)
- **`/api/docs`** - Swagger documentation

### Example usage:
```bash
curl -X POST "https://<username>-sentiment-audio-analyzer.hf.space/api/predict" -F "file=@audio.wav"
```
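
### Python example (a sketch; the host below is a placeholder for the Space's direct URL, which differs from the huggingface.co page URL):
```python
import requests

# Hypothetical Space host; replace with your own Space's direct URL.
url = "https://<username>-sentiment-audio-analyzer.hf.space/api/predict"
with open("audio.wav", "rb") as f:
    resp = requests.post(url, files={"file": f})
print(resp.json())  # {"transcription": "...", "sentiment": {...}, ...}
```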
""") | |
    with gr.Row():
        with gr.Column(scale=2):
            audio_in = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Audio Input"
            )
            btn = gr.Button("🔍 Analyze", variant="primary")
            export_btn = gr.Button("📊 Export CSV")
        with gr.Column(scale=3):
            chat = gr.Chatbot(label="Conversation history")
            transcription_out = gr.Textbox(label="Transcription", interactive=False)
            summary_out = gr.HTML(label="Sentiment")
            seg_out = gr.Dataframe(label="Per-segment detail")
            hist_out = gr.Dataframe(label="History")

    state_chat = gr.State([])
    state_hist = gr.State([])
    def chat_callback(audio_path, chat_history, hist_state):
        transcription, summary, seg_df, hist_entry = analyze_audio(audio_path)
        user_msg = "[Audio received]"
        bot_msg = f"**Transcription:** {transcription}\n**Sentiment:** {summary}"
        chat_history = chat_history + [(user_msg, bot_msg)]
        if hist_entry:
            hist_state = hist_state + [hist_entry]
        # Also update the chat state and the visible history table
        return chat_history, chat_history, transcription, summary, seg_df, hist_state, pd.DataFrame(hist_state)

    btn.click(
        fn=chat_callback,
        inputs=[audio_in, state_chat, state_hist],
        outputs=[chat, state_chat, transcription_out, summary_out, seg_out, state_hist, hist_out]
    )

    csv_file = gr.File(label="Download CSV")
    export_btn.click(
        fn=export_history_csv,
        inputs=[state_hist],
        outputs=[csv_file]
    )
# ===== GRADIO + FASTAPI INTEGRATION =====
# Mount the Gradio app onto the FastAPI app
app = gr.mount_gradio_app(app, demo, path="/")

# Entry point for Hugging Face Spaces
if __name__ == "__main__":
    uvicorn.run(
        app,
        host="0.0.0.0" if HF_SPACE else "127.0.0.1",
        port=7860,
        log_level="info"
    )
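
# Local run (a sketch; assumes this file is saved as app.py):
#   python app.py
# The Gradio UI is then served at http://127.0.0.1:7860/ and the interactive API docs at /api/docs.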