import os import re from datetime import datetime import gradio as gr import torch import pandas as pd import soundfile as sf import torchaudio from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC from src.transcription import SpeechEncoder from src.sentiment import TextEncoder # Configuration pour Hugging Face Spaces HF_SPACE = os.getenv("HF_SPACE", "false").lower() == "true" # Préchargement des modèles print("Chargement des modèles...") # Modèle français plus léger processor_ctc = Wav2Vec2Processor.from_pretrained( "LeBenchmark/wav2vec2-FR-2K-small", cache_dir="./models" if not HF_SPACE else None ) model_ctc = Wav2Vec2ForCTC.from_pretrained( "LeBenchmark/wav2vec2-FR-2K-small", cache_dir="./models" if not HF_SPACE else None ) speech_enc = SpeechEncoder() text_enc = TextEncoder() print("Modèles chargés avec succès!") # Pipeline d'analyse def analyze_audio(audio_path): if audio_path is None: return "Aucun audio fourni", "", pd.DataFrame(), {} try: # Lecture et prétraitement data, sr = sf.read(audio_path) arr = data.T if data.ndim > 1 else data wav = torch.from_numpy(arr).unsqueeze(0).float() if sr != 16000: wav = torchaudio.transforms.Resample(sr, 16000)(wav) sr = 16000 if wav.size(0) > 1: wav = wav.mean(dim=0, keepdim=True) # Transcription inputs = processor_ctc(wav.squeeze().numpy(), sampling_rate=sr, return_tensors="pt") with torch.no_grad(): logits = model_ctc(**inputs).logits pred_ids = torch.argmax(logits, dim=-1) transcription = processor_ctc.batch_decode(pred_ids)[0].lower() # Sentiment principal sent_dict = TextEncoder.analyze_sentiment(transcription) label, conf = max(sent_dict.items(), key=lambda x: x[1]) emojis = {"positif": "😊", "neutre": "😐", "négatif": "☹️"} emoji = emojis.get(label, "") # Segmentation par phrase segments = [s.strip() for s in re.split(r'[.?!]', transcription) if s.strip()] seg_results = [] for seg in segments: sd = TextEncoder.analyze_sentiment(seg) l, c = max(sd.items(), key=lambda x: x[1]) seg_results.append({"Segment": seg, "Sentiment": l.capitalize(), "Confiance (%)": round(c*100,1)}) seg_df = pd.DataFrame(seg_results) # Historique entry timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") history_entry = { "Horodatage": timestamp, "Transcription": transcription, "Sentiment": label.capitalize(), "Confiance (%)": round(conf*100,1) } # Rendu summary_html = ( f"
Confiance : {conf*100:.1f}%
" ) return transcription, summary_html, seg_df, history_entry except Exception as e: error_msg = f"Erreur lors de l'analyse: {str(e)}" return error_msg, "", pd.DataFrame(), {} # Export CSV def export_history_csv(history): if not history: return None df = pd.DataFrame(history) path = "history.csv" df.to_csv(path, index=False) return path # Interface Gradio demo = gr.Blocks( theme=gr.themes.Monochrome(primary_hue="purple"), title="Analyse de Sentiment Audio - Hugging Face Space" ) with demo: gr.Markdown(""" # 🎤 Analyse de Sentiment Audio Ce Space permet d'analyser le sentiment d'extraits audio en français en combinant : - **Transcription audio** avec Wav2Vec2 - **Analyse de sentiment** avec BERT multilingue """) gr.HTML("""