"""Gradio app: French speech transcription (Wav2Vec2) plus sentiment analysis.

The module loads the models at import time, builds the Gradio ``demo``
Blocks interface, and launches it when executed as a script.
"""

import os
import re
from datetime import datetime

import gradio as gr
import pandas as pd
import soundfile as sf
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

from src.transcription import SpeechEncoder
from src.sentiment import TextEncoder

# Configuration for Hugging Face Spaces
HF_SPACE = os.getenv("HF_SPACE", "false").lower() == "true"

# Lighter French CTC model
CTC_MODEL_NAME = "LeBenchmark/wav2vec2-FR-2K-small"
# Local cache directory is only used outside of a Space
CTC_CACHE_DIR = "./models" if not HF_SPACE else None
# Sampling rate every waveform is resampled to before transcription
TARGET_SAMPLE_RATE = 16000

# Sentence delimiters used for the per-segment analysis
_SENTENCE_SPLIT_RE = re.compile(r"[.?!]")

SENTIMENT_EMOJIS = {"positif": "😊", "neutre": "😐", "négatif": "☹️"}

# Model preloading
print("Chargement des modèles...")
processor_ctc = Wav2Vec2Processor.from_pretrained(
    CTC_MODEL_NAME,
    cache_dir=CTC_CACHE_DIR,
)
model_ctc = Wav2Vec2ForCTC.from_pretrained(
    CTC_MODEL_NAME,
    cache_dir=CTC_CACHE_DIR,
)
# NOTE(review): these two instances are never referenced below; they are kept
# because constructing them may have side effects (e.g. loading weights) -
# confirm against src.transcription / src.sentiment.
speech_enc = SpeechEncoder()
text_enc = TextEncoder()
print("Modèles chargés avec succès!")


def _load_mono_waveform(audio_path):
    """Read *audio_path* and return ``(wav, sample_rate)`` as mono 16 kHz.

    ``wav`` is a float32 tensor of shape ``(1, frames)``.
    """
    data, sr = sf.read(audio_path)
    # soundfile yields (frames,) for mono and (frames, channels) otherwise.
    arr = data.T if data.ndim > 1 else data
    wav = torch.from_numpy(arr).float()
    # Only mono input needs a channel axis; adding one unconditionally would
    # hide the real channel dimension and skip the down-mix below.
    if wav.ndim == 1:
        wav = wav.unsqueeze(0)
    if wav.size(0) > 1:
        wav = wav.mean(dim=0, keepdim=True)
    if sr != TARGET_SAMPLE_RATE:
        wav = torchaudio.transforms.Resample(sr, TARGET_SAMPLE_RATE)(wav)
        sr = TARGET_SAMPLE_RATE
    return wav, sr


def _transcribe(wav, sr):
    """Return the lower-cased greedy CTC transcription of a mono waveform."""
    inputs = processor_ctc(
        wav.squeeze(0).numpy(), sampling_rate=sr, return_tensors="pt"
    )
    with torch.no_grad():
        logits = model_ctc(**inputs).logits
    pred_ids = torch.argmax(logits, dim=-1)
    return processor_ctc.batch_decode(pred_ids)[0].lower()


def _top_sentiment(text):
    """Return the ``(label, score)`` pair with the highest score for *text*."""
    # NOTE(review): called on the class exactly as the original code did;
    # presumably a static/class method - confirm in src.sentiment.
    scores = TextEncoder.analyze_sentiment(text)
    return max(scores.items(), key=lambda item: item[1])


def _render_summary(label, conf):
    """Build the HTML snippet showing the emoji, label and confidence."""
    emoji = SENTIMENT_EMOJIS.get(label, "")
    return (
        "<div style='text-align:center;'>"
        f"<div style='font-size:3rem;'>{emoji}</div>"
        f"<h3>{label.upper()}</h3>"
        f"<p>Confiance : {conf * 100:.1f}%</p>"
        "</div>"
    )


# Analysis pipeline
def analyze_audio(audio_path):
    """Transcribe an audio file and analyse the sentiment of the text.

    Args:
        audio_path: Path of the audio file, or ``None`` when nothing was
            recorded or uploaded.

    Returns:
        A 4-tuple ``(transcription, summary_html, segments_df, history_entry)``.
        When no audio is given or any step fails, the first element carries
        the message and the remaining ones are empty (``""``, an empty
        DataFrame and ``{}``).
    """
    if audio_path is None:
        return "Aucun audio fourni", "", pd.DataFrame(), {}

    # Broad catch on purpose: this is the UI boundary and the error text is
    # shown to the user instead of crashing the event handler.
    try:
        wav, sr = _load_mono_waveform(audio_path)
        transcription = _transcribe(wav, sr)

        # Overall sentiment
        label, conf = _top_sentiment(transcription)

        # Per-sentence breakdown
        seg_results = []
        for piece in _SENTENCE_SPLIT_RE.split(transcription):
            seg = piece.strip()
            if not seg:
                continue
            seg_label, seg_conf = _top_sentiment(seg)
            seg_results.append(
                {
                    "Segment": seg,
                    "Sentiment": seg_label.capitalize(),
                    "Confiance (%)": round(seg_conf * 100, 1),
                }
            )
        seg_df = pd.DataFrame(seg_results)

        # History entry
        history_entry = {
            "Horodatage": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "Transcription": transcription,
            "Sentiment": label.capitalize(),
            "Confiance (%)": round(conf * 100, 1),
        }

        summary_html = _render_summary(label, conf)
        return transcription, summary_html, seg_df, history_entry
    except Exception as e:
        error_msg = f"Erreur lors de l'analyse: {str(e)}"
        return error_msg, "", pd.DataFrame(), {}


# CSV export
def export_history_csv(history):
    """Write *history* (a list of dict entries) to ``history.csv``.

    Returns:
        The path of the written file, or ``None`` when *history* is empty.
    """
    if not history:
        return None
    df = pd.DataFrame(history)
    path = "history.csv"
    df.to_csv(path, index=False)
    return path


# Static UI texts, kept at column 0 so Markdown never sees indented lines.
INTRO_MD = """\
# 🎤 Analyse de Sentiment Audio

Ce Space permet d'analyser le sentiment d'extraits audio en français en combinant :
- **Transcription audio** avec Wav2Vec2
- **Analyse de sentiment** avec BERT multilingue
"""

# Each step is its own paragraph; HTML ignores bare newlines.
STEPS_HTML = """\
<div>
<p>Étape 1 : Enregistrez votre voix ou téléversez un fichier audio (format WAV recommandé).</p>
<p>Étape 2 : Cliquez sur le bouton Analyser pour lancer la transcription et l'analyse.</p>
<p>Étape 3 : Visualisez les résultats : transcription, sentiment, et analyse détaillée.</p>
<p>Étape 4 : Exportez l'historique des analyses au format CSV si besoin.</p>
</div>
"""

API_MD = """\
### Endpoints disponibles :
- **`/api/predict`** - Analyse audio (POST)
- **`/api/predict_text`** - Analyse textuelle (POST)
- **`/api/health`** - Vérification état (GET)
- **`/api/docs`** - Documentation Swagger

### Exemple d'utilisation :
```bash
curl -X POST "https://huggingface.co/spaces/<username>/sentiment-audio-analyzer/api/predict" \\
  -F "file=@audio.wav"
```
"""

# Gradio interface
demo = gr.Blocks(
    theme=gr.themes.Monochrome(primary_hue="purple"),
    title="Analyse de Sentiment Audio - Hugging Face Space",
)

with demo:
    gr.Markdown(INTRO_MD)
    gr.HTML(STEPS_HTML)

    # API section
    with gr.Accordion("🔌 API REST", open=False):
        gr.Markdown(API_MD)

    with gr.Row():
        with gr.Column(scale=2):
            audio_in = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Audio Input",
            )
            btn = gr.Button("🔍 Analyser", variant="primary")
            export_btn = gr.Button("📊 Exporter CSV")
        with gr.Column(scale=3):
            chat = gr.Chatbot(label="Historique des échanges")
            transcription_out = gr.Textbox(label="Transcription", interactive=False)
            summary_out = gr.HTML(label="Sentiment")
            seg_out = gr.Dataframe(label="Détail par segment")
            hist_out = gr.Dataframe(label="Historique")

    csv_out = gr.File(label="Télécharger CSV")

    state_chat = gr.State([])  # list of (user, bot) pairs
    state_hist = gr.State([])  # list of dict entries

    def chat_callback(audio_path, chat_history, hist_state):
        """Run the analysis and return the updated values for every output."""
        transcription, summary, seg_df, hist_entry = analyze_audio(audio_path)
        user_msg = "[Audio reçu]"
        bot_msg = f"**Transcription :** {transcription}\n**Sentiment :** {summary}"
        # Build new lists instead of mutating the State values in place.
        chat_history = chat_history + [(user_msg, bot_msg)]
        if hist_entry:
            hist_state = hist_state + [hist_entry]
        return (
            chat_history,  # shown in the Chatbot
            chat_history,  # persisted, so the next call sees earlier turns
            transcription,
            summary,
            seg_df,
            hist_state,
            pd.DataFrame(hist_state),  # fills the history table
        )

    btn.click(
        fn=chat_callback,
        inputs=[audio_in, state_chat, state_hist],
        outputs=[
            chat,
            state_chat,
            transcription_out,
            summary_out,
            seg_out,
            state_hist,
            hist_out,
        ],
    )
    export_btn.click(
        fn=export_history_csv,
        inputs=[state_hist],
        outputs=[csv_out],
    )

# Launch configuration for Hugging Face Spaces
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0" if HF_SPACE else "127.0.0.1",
        server_port=7860,
        share=False,
    )