| import os |
| import re |
| import numpy as np |
| import torch |
| import spaces |
| import gradio as gr |
|
|
| |
# Hugging Face access token taken from the environment; empty string when unset.
HF_TOKEN = os.getenv("HF_TOKEN", "")

# Sample rate (Hz) that audio is resampled to before it is handed to Whisper.
WHISPER_SR = 16000

# Dropdown label -> Hugging Face model id. Insertion order is the order in
# which the choices are listed in the UI.
ASR_MODELS = dict((
    ("whisper-small (schnell)", "openai/whisper-small"),
    ("whisper-medium", "openai/whisper-medium"),
    ("whisper-large-v3 (empfohlen)", "openai/whisper-large-v3"),
))

# Process-wide caches so that models are only loaded once.
_asr_cache: dict = dict()  # populated by get_asr()
_diar_pipe = None          # populated by get_diar()
|
|
|
|
| |
|
|
def get_asr(model_key: str, device: str, dtype: torch.dtype):
    """Return a cached ``(processor, model)`` pair for a Whisper checkpoint.

    Args:
        model_key: Key into ``ASR_MODELS`` (the label shown in the UI).
        device: Device the model is moved to, e.g. ``"cuda"`` or ``"cpu"``.
        dtype: Torch dtype the weights are loaded with.

    Returns:
        Tuple ``(processor, model)``; the model is in eval mode.

    Raises:
        KeyError: If ``model_key`` is not a key of ``ASR_MODELS``.
    """
    # Imported inside the function, so transformers is only loaded on first use.
    from transformers import AutoProcessor, WhisperForConditionalGeneration

    model_id = ASR_MODELS[model_key]
    # The cache key includes device and dtype: a model that was loaded for
    # CPU/float32 must not be handed out to a caller that will feed it
    # CUDA/float16 inputs (and vice versa).
    cache_key = (model_id, str(device), dtype)
    if cache_key not in _asr_cache:
        processor = AutoProcessor.from_pretrained(model_id)
        model = WhisperForConditionalGeneration.from_pretrained(
            model_id, torch_dtype=dtype,
        ).to(device)
        model.eval()
        _asr_cache[cache_key] = (processor, model)
    return _asr_cache[cache_key]
|
|
|
|
def get_diar(device: str):
    """Return the shared pyannote speaker-diarization pipeline.

    The pipeline is loaded on the first call and kept in the module-level
    ``_diar_pipe`` afterwards.

    Args:
        device: ``"cuda"`` moves the pipeline to the GPU; any other value
            leaves it where it is.

    Returns:
        The loaded pipeline object.

    Raises:
        EnvironmentError: If ``HF_TOKEN`` is empty and the pipeline has not
            been loaded yet.
    """
    global _diar_pipe
    if _diar_pipe is None:
        if not HF_TOKEN:
            raise EnvironmentError(
                "HF_TOKEN nicht gesetzt. Füge ihn in den Space-Settings unter "
                "'Settings -> Variables and secrets' hinzu."
            )
        # Imported here so pyannote is only loaded when diarization is used.
        from pyannote.audio import Pipeline as PyannotePipeline
        _diar_pipe = PyannotePipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            token=HF_TOKEN,
        )
    # Done on every call rather than only after loading, so a pipeline that
    # was first created during a CPU call still ends up on the GPU later.
    if device == "cuda":
        _diar_pipe = _diar_pipe.to(torch.device("cuda"))
    return _diar_pipe
|
|
|
|
| |
|
|
def resample(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    """Resample a mono signal by linear interpolation.

    Args:
        audio: 1-D array of samples.
        orig_sr: Sample rate of ``audio`` in Hz.
        target_sr: Desired sample rate in Hz.

    Returns:
        ``audio`` itself (unchanged, same dtype) when both rates are equal,
        otherwise a new float32 array with
        ``int(len(audio) * target_sr / orig_sr)`` samples.
    """
    if orig_sr == target_sr:
        return audio
    new_len = int(len(audio) * target_sr / orig_sr)
    if new_len == 0:
        # np.interp raises on an empty sample-point array, so answer the
        # degenerate case (no input, or input too short) directly.
        return np.zeros(0, dtype=np.float32)
    return np.interp(
        np.linspace(0, len(audio) - 1, new_len),
        np.arange(len(audio)),
        audio,
    ).astype(np.float32)
|
|
|
|
def chunk_audio(audio: np.ndarray, sr: int, chunk_s: int = 28, overlap_s: int = 2) -> list:
    """Split ``audio`` into overlapping windows.

    Args:
        audio: 1-D array of samples.
        sr: Sample rate of ``audio`` in Hz.
        chunk_s: Maximum window length in seconds.
        overlap_s: Seconds shared by two consecutive windows.

    Returns:
        ``[audio]`` when the signal fits into one window, otherwise a list of
        slices; window ``i`` starts at sample ``i * sr * (chunk_s - overlap_s)``.

    Raises:
        ValueError: If the signal needs several windows but ``chunk_s`` is not
            greater than ``overlap_s``.
    """
    chunk_len = sr * chunk_s
    if len(audio) <= chunk_len:
        return [audio]

    step = sr * (chunk_s - overlap_s)
    if step <= 0:
        raise ValueError("chunk_s must be greater than overlap_s")

    chunks = []
    for start in range(0, len(audio), step):
        chunks.append(audio[start:start + chunk_len])
        # Once a window reaches the end of the signal, any further window
        # would lie entirely inside it and only repeat audio.
        if start + chunk_len >= len(audio):
            break
    return chunks
|
|
|
|
| |
|
|
def transcribe_audio(audio_16k, processor, model, device, dtype, language="de"):
    """Transcribe 16 kHz audio with Whisper, window by window.

    Args:
        audio_16k: 1-D sample array at ``WHISPER_SR``.
        processor: Whisper processor (feature extraction and decoding).
        model: Whisper model exposing ``generate``.
        device: Device the input features are moved to.
        dtype: Dtype the input features are cast to.
        language: Language code passed to ``generate``. ``None`` or ``"auto"``
            passes ``None``, which is assumed to leave language detection to
            the model (confirm against the transformers docs).

    Returns:
        Tuple ``(text, segments)``. ``text`` is all segment texts joined by
        spaces. ``segments`` is a list of
        ``{"timestamp": (start_s, end_s), "text": str}`` with times relative
        to the start of ``audio_16k``.
    """
    # chunk_audio lets consecutive windows overlap by 2 s, so window i starts
    # at i * (chunk_s - overlap_s) seconds, not at the sum of previous lengths.
    chunk_s, overlap_s = 28, 2
    stride_s = float(chunk_s - overlap_s)
    chunks = chunk_audio(audio_16k, WHISPER_SR, chunk_s)

    # Special tokens that are not timestamps (language, task, ...).
    special_token = re.compile(r"<\|(?![\d.]+\|)[^>]+\|>")
    ts_pattern = re.compile(r"<\|([\d.]+)\|>")
    if language == "auto":
        language = None

    full_text, all_chunks = [], []

    # NOTE: audio inside the overlap is decoded in both neighbouring windows,
    # so words near a window boundary can appear twice in the output.
    for index, chunk in enumerate(chunks):
        offset = index * stride_s
        inputs = processor(chunk, sampling_rate=WHISPER_SR, return_tensors="pt")
        input_features = inputs.input_features.to(device=device, dtype=dtype)

        with torch.no_grad():
            predicted_ids = model.generate(
                input_features,
                return_timestamps=True,
                language=language,
            )

        decoded = processor.batch_decode(
            predicted_ids,
            decode_with_timestamps=True,
            skip_special_tokens=False,
        )[0]
        decoded = special_token.sub("", decoded).strip()

        # Splitting on a pattern with one capture group alternates
        # text, timestamp, text, ...; odd positions are always timestamps.
        # Deciding by position keeps numeric-only text such as "2024" from
        # being mistaken for a timestamp.
        pieces = ts_pattern.split(decoded)

        segment_start = offset
        open_segment = None  # last segment still waiting for its end time
        for position, piece in enumerate(pieces):
            if position % 2:
                try:
                    stamp = offset + float(piece)
                except ValueError:
                    # "[\d.]+" also matches things like "1.2.3"; skip them.
                    continue
                if open_segment is not None:
                    seg_start = open_segment["timestamp"][0]
                    if stamp > seg_start:
                        open_segment["timestamp"] = (seg_start, stamp)
                    open_segment = None
                segment_start = stamp
                continue

            text = piece.strip()
            if not text:
                continue
            # Provisional end of start + 1 s; replaced as soon as the closing
            # timestamp is seen, kept when the model emitted none.
            open_segment = {
                "timestamp": (segment_start, segment_start + 1.0),
                "text": text,
            }
            all_chunks.append(open_segment)
            full_text.append(text)

    return " ".join(full_text).strip(), all_chunks
|
|
|
|
| |
|
|
def unwrap_diarization(result):
    """Dig the annotation object out of whatever the diarization call returned.

    An object counts as an annotation when it has an ``itertracks`` attribute.
    The lookup tries, in order: the object itself, a fixed list of attribute
    names, the fields of a namedtuple-like object, and the mapping keys
    ``"diarization"`` / ``"annotation"``. If nothing matches, ``result`` is
    returned unchanged.
    """
    def has_tracks(candidate):
        return hasattr(candidate, "itertracks")

    # 1. Already an annotation.
    if has_tracks(result):
        return result

    # 2. Wrapper object exposing the annotation as an attribute.
    attribute_names = (
        "speaker_diarization",
        "exclusive_speaker_diarization",
        "diarization",
        "annotation",
        "output",
    )
    for name in attribute_names:
        candidate = getattr(result, name, None)
        if has_tracks(candidate):
            return candidate

    # 3. Namedtuple-like container: take the first field that qualifies.
    if hasattr(result, "_fields"):
        for field_value in result:
            if has_tracks(field_value):
                return field_value

    # 4. Mapping-style access.
    for key in ("diarization", "annotation"):
        try:
            candidate = result[key]
            if has_tracks(candidate):
                return candidate
        except (KeyError, TypeError, IndexError):
            continue

    # 5. Nothing recognised; hand back the input.
    return result
|
|
|
|
def merge_with_speakers(chunks, diarization):
    """Attach a speaker label to every transcript segment.

    Args:
        chunks: Iterable of ``{"timestamp": (start, end), "text": str}``.
        diarization: Diarization result; passed through
            ``unwrap_diarization`` before use.

    Returns:
        List of ``(start, end, speaker, text)`` tuples. ``speaker`` is the
        label of the turn with the largest temporal overlap, or
        ``"Unbekannt"`` when no turn overlaps the segment.
    """
    diarization = unwrap_diarization(diarization)

    # Read the speaker turns once instead of once per segment. Skipped when
    # there are no segments, so nothing is asked of the diarization object.
    turns = []
    if chunks:
        turns = [
            (turn.start, turn.end, speaker)
            for turn, _, speaker in diarization.itertracks(yield_label=True)
        ]

    merged = []
    for chunk in chunks:
        start, end = chunk["timestamp"]
        # A missing (falsy) end time is replaced by a one-second segment.
        end = end or (start + 1.0)
        best_speaker, best_overlap = "Unbekannt", 0.0
        for turn_start, turn_end, speaker in turns:
            overlap = max(0.0, min(end, turn_end) - max(start, turn_start))
            if overlap > best_overlap:
                best_overlap, best_speaker = overlap, speaker
        merged.append((start, end, best_speaker, chunk["text"].strip()))
    return merged
|
|
|
|
def format_diarized(segments):
    """Render speaker-labelled segments as readable text.

    Consecutive segments of the same speaker are merged into one paragraph of
    the form ``"[speaker] <start>s\\n<texts joined by spaces>"``; paragraphs
    are separated by a blank line. Returns ``""`` for empty input.
    """
    if not segments:
        return ""

    def render(label, began_at, pieces):
        return f"[{label}] {began_at:.1f}s\n{' '.join(pieces)}"

    paragraphs = []
    active_label, began_at, pieces = None, 0.0, []
    for seg_start, _seg_end, label, words in segments:
        if label == active_label:
            # Same speaker keeps talking: extend the running paragraph.
            pieces.append(words)
            continue
        # Speaker changed: close the running paragraph (if any), start anew.
        if active_label is not None:
            paragraphs.append(render(active_label, began_at, pieces))
        active_label, began_at, pieces = label, seg_start, [words]

    if active_label and pieces:
        paragraphs.append(render(active_label, began_at, pieces))
    return "\n\n".join(paragraphs)
|
|
|
|
| |
|
|
# System prompt (German) sent with every summarisation request. It asks for a
# Markdown summary of a developer sync call, grouped into four categories.
SYSTEM_PROMPT = """Du bist ein strukturierter technischer Projektmanager und Assistent. Deine Aufgabe ist es, das folgende Transkript eines Entwickler-Sync-Calls (z.B. aus Microsoft Teams) präzise und übersichtlich zusammenzufassen.
Besonderheiten des Transkripts:
* Es kann sich um rohe, unstrukturierte Sprache mit vielen Füllwörtern ("Ja", "Genau", "Ähm") handeln.
* Das Transkript kann einseitig sein (nur ein Sprecher wurde aufgenommen). Falls das der Fall ist, rekonstruiere den fehlenden Kontext logisch aus den Antworten des aufgenommenen Sprechers.
* Gedankensprünge sind normal. Bündele die Informationen thematisch, nicht chronologisch.
Gewünschtes Ausgabeformat (in Markdown):
Bitte strukturiere deine Antwort in die folgenden Kategorien. Lasse Kategorien weg, falls es im Text keine passenden Informationen dazu gibt.
* 🎯 Kernpunkte & Entscheidungen: Was war der Hauptgrund des Gesprächs? Welche Entscheidungen wurden getroffen?
* 💻 Code & Technik: Welche Repositories, Branches, Tools (z.B. SonarQube, Maven) oder spezifischen technischen Probleme wurden besprochen?
* ✅ Action Items (To-Dos): Wer macht was als Nächstes? Bitte ordne die Aufgaben klar zu (z.B. "Sprecher 1 kümmert sich um...", "Kollege soll Feedback geben zu...").
* 📅 Orga, Termine & Dailys: Wurden Meetings verschoben? Gibt es Absprachen für das nächste Daily oder private/teaminterne Events?
Tonfall: Sachlich, klar und direkt."""
|
|
|
|
def summarize(transcript: str, diarized: str) -> str:
    """Summarise a meeting transcript with a hosted chat model.

    Args:
        transcript: Raw transcript text.
        diarized: Speaker-labelled transcript; preferred over ``transcript``
            when it holds real content.

    Returns:
        The model's answer, or a warning message (prefixed with "⚠️") when
        the token is missing, there is nothing to summarise, or the request
        fails.
    """
    if not HF_TOKEN:
        return "⚠️ HF_TOKEN fehlt – Zusammenfassung nicht möglich."

    transcript = (transcript or "").strip()
    diarized = (diarized or "").strip()

    # The diarization textbox can hold a status message instead of a
    # transcript ("Diarisierung fehlgeschlagen: ...", "Fehler: ...",
    # "(Keine Sprecher erkannt.)"). Those must not be summarised.
    status_prefixes = ("Diarisierung", "Fehler:", "(Keine Sprecher")
    if diarized and not diarized.startswith(status_prefixes):
        text = diarized
    else:
        text = transcript

    if not text:
        return "⚠️ Kein Transkript vorhanden – bitte zuerst transkribieren."

    # Deliberately broad: any failure (import, network, API) is reported to
    # the user as text instead of crashing the UI handler.
    try:
        from huggingface_hub import InferenceClient
        client = InferenceClient(
            provider="novita",
            api_key=HF_TOKEN,
        )
        response = client.chat.completions.create(
            model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": f"Hier ist das Transkript:\n\n{text}"},
            ],
            max_tokens=2048,
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"⚠️ Zusammenfassung fehlgeschlagen: {e}"
|
|
|
|
| |
|
|
# NOTE(review): spaces.GPU(duration=60) presumably reserves a ZeroGPU slot for
# up to 60 s per call; confirm against the `spaces` package documentation.
@spaces.GPU(duration=60)
def run_pipeline(audio_array, sample_rate, model_key, use_diar):
    """Run transcription and, optionally, speaker diarization.

    Args:
        audio_array: Mono samples at ``sample_rate``.
        sample_rate: Sample rate of ``audio_array`` in Hz.
        model_key: Key into ``ASR_MODELS``.
        use_diar: Whether to run diarization after transcription.

    Returns:
        Tuple ``(raw_transcript, labeled)``. ``labeled`` is ``""`` when
        diarization is off, the speaker-labelled transcript on success, or a
        message describing why diarization did not produce output.
    """
    if torch.cuda.is_available():
        device, dtype = "cuda", torch.float16
    else:
        device, dtype = "cpu", torch.float32

    speech_16k = resample(audio_array, sample_rate, WHISPER_SR)
    processor, model = get_asr(model_key, device, dtype)
    raw_transcript, chunks = transcribe_audio(speech_16k, processor, model, device, dtype)

    if use_diar:
        # Diarization runs on the audio at its original sample rate; a
        # failure here never discards the transcript obtained above.
        try:
            waveform = torch.tensor(audio_array).unsqueeze(0).float()
            diar_input = dict(waveform=waveform, sample_rate=sample_rate)
            pipeline = get_diar(device)
            annotation = pipeline(diar_input)
            labeled = format_diarized(merge_with_speakers(chunks, annotation))
        except EnvironmentError as err:
            return raw_transcript, f"Fehler: {err}"
        except Exception as err:
            return raw_transcript, f"Diarisierung fehlgeschlagen: {err}"
        return raw_transcript, labeled or "(Keine Sprecher erkannt.)"

    return raw_transcript, ""
|
|
|
|
| |
|
|
# Longest accepted recording in seconds (20 minutes).
MAX_DURATION_S = 1200


def transcribe(audio, model_key, use_diar):
    """Gradio handler: validate and normalise audio, then run the pipeline.

    This is a generator, so the UI can show a status line before the result.

    Args:
        audio: ``(sample_rate, samples)`` tuple, or ``None`` when nothing was
            recorded or uploaded.
        model_key: Key into ``ASR_MODELS``.
        use_diar: Whether speaker diarization should run.

    Yields:
        ``(transcript_text, diarized_text)`` pairs: a single message pair for
        rejected input, otherwise a status pair followed by the result.
    """
    if audio is None:
        yield "Kein Audio eingegeben.", ""
        return

    sample_rate, audio_data = audio
    audio_data = np.asarray(audio_data)
    if audio_data.size == 0:
        # An empty buffer has no peak to inspect and nothing to transcribe.
        yield "Kein Audio eingegeben.", ""
        return

    source_dtype = audio_data.dtype
    if audio_data.ndim > 1:
        # Down-mix to mono; assumes (samples, channels) layout - TODO confirm.
        audio_data = audio_data.mean(axis=1)
    audio_data = audio_data.astype(np.float32)

    if np.issubdtype(source_dtype, np.integer):
        # Integer PCM: scale by the full range of the source type. This stays
        # correct for quiet recordings and for widths other than 16 bit.
        info = np.iinfo(source_dtype)
        audio_data /= float(max(abs(info.min), info.max))
    elif np.abs(audio_data).max() > 1.0:
        # Float samples outside [-1, 1] are treated as 16-bit-scaled values.
        audio_data /= 32768.0

    duration_s = len(audio_data) / sample_rate
    if duration_s > MAX_DURATION_S:
        yield (
            f"Audio ist {duration_s:.0f}s lang – maximal {MAX_DURATION_S}s (20 Min.) erlaubt.",
            "",
        )
        return

    yield "GPU wird angefordert ...", ""
    transcript, labeled = run_pipeline(audio_data, sample_rate, model_key, use_diar)
    yield transcript, labeled
|
|
|
|
| |
|
|
# Stylesheet handed to demo.launch(css=...): colour variables, the header
# banner, the warning box, form controls, buttons and the footer.
CSS = """
:root {
    --t-purple: #6264A7;
    --t-purple-dark: #464775;
    --t-purple-light: #E8EBFA;
    --t-purple-mid: #9EA2D4;
    --t-bg: #F0F2F8;
    --t-card: #FFFFFF;
    --t-text: #242424;
    --t-muted: #616161;
    --t-border: #E1E4F0;
}

body, .gradio-container {
    background: var(--t-bg) !important;
    font-family: "Segoe UI", system-ui, -apple-system, sans-serif !important;
}

.yapper-header {
    background: linear-gradient(135deg, var(--t-purple-dark) 0%, var(--t-purple) 65%, var(--t-purple-mid) 100%);
    border-radius: 12px;
    padding: 24px 28px;
    margin-bottom: 16px;
    box-shadow: 0 4px 20px rgba(70,71,117,.28);
    display: flex;
    align-items: center;
    gap: 18px;
    color: #fff;
}
.yapper-header .icon { font-size: 2.8rem; line-height: 1; }
.yapper-header h1 { margin: 0 !important; font-size: 1.7rem !important; font-weight: 700 !important; letter-spacing: -.3px !important; color: #fff !important; }
.yapper-header p { margin: 4px 0 0 !important; font-size: .87rem !important; opacity: .9 !important; color: #fff !important; }
.yapper-header .badge {
    margin-left: auto;
    background: rgba(255,255,255,.2);
    border-radius: 20px;
    padding: 5px 14px;
    font-size: .75rem;
    font-weight: 600;
    letter-spacing: .4px;
    white-space: nowrap;
}

.warn-box {
    background: #FFF4CE;
    border: 1px solid #F9C642;
    border-left: 4px solid #F9C642;
    border-radius: 6px;
    padding: 10px 14px;
    font-size: .85rem;
    color: #7A5800;
    margin-bottom: 12px;
}
.warn-box a { color: var(--t-purple); }

label > span {
    font-weight: 600 !important;
    font-size: .77rem !important;
    text-transform: uppercase !important;
    letter-spacing: .5px !important;
    color: var(--t-purple-dark) !important;
}

input, select, textarea {
    border-color: var(--t-border) !important;
    border-radius: 6px !important;
}
input:focus, select:focus, textarea:focus {
    border-color: var(--t-purple) !important;
    box-shadow: 0 0 0 2px var(--t-purple-light) !important;
    outline: none !important;
}

button.primary {
    background: var(--t-purple) !important;
    border-radius: 6px !important;
    border: none !important;
    font-weight: 600 !important;
    font-size: .95rem !important;
    box-shadow: 0 2px 8px rgba(98,100,167,.35) !important;
    transition: background .15s, box-shadow .15s !important;
}
button.primary:hover {
    background: var(--t-purple-dark) !important;
    box-shadow: 0 4px 16px rgba(98,100,167,.45) !important;
}
button.secondary {
    background: var(--t-purple-light) !important;
    color: var(--t-purple-dark) !important;
    border: 1.5px solid var(--t-purple-mid) !important;
    border-radius: 6px !important;
    font-weight: 600 !important;
    font-size: .95rem !important;
    transition: background .15s !important;
}
button.secondary:hover {
    background: var(--t-purple-mid) !important;
    color: #fff !important;
}

input[type=checkbox]:checked { accent-color: var(--t-purple) !important; }

textarea {
    font-size: .88rem !important;
    line-height: 1.65 !important;
    color: var(--t-text) !important;
    background: #FAFBFF !important;
}

.yapper-footer {
    font-size: .75rem;
    color: var(--t-muted);
    border-top: 1px solid var(--t-border);
    padding-top: 10px;
    margin-top: 6px;
    display: flex;
    gap: 20px;
    flex-wrap: wrap;
}
"""
|
|
| |
|
|
# UI definition: header, optional token warning, input column, output column,
# summary area, footer, and the two button handlers.
with gr.Blocks(title="Yapper (lite) - Meeting Transcriber") as demo:

    gr.HTML("""
    <div class="yapper-header">
    <div class="icon">🎙️</div>
    <div>
    <h1 style="margin:0;font-size:1.7rem;font-weight:700;color:#ffffff;letter-spacing:-.3px;">Yapper (lite)</h1>
    <p style="margin:4px 0 0;font-size:.87rem;color:#ffffff;opacity:.9;">Transkription &amp; Speaker-Diarisierung &middot; für eure Teams-Meetings</p>
    </div>
    <div class="badge">&#9889; ZeroGPU</div>
    </div>
    """)

    # Without a token neither diarization nor summarisation can run; tell the
    # user how to provide one.
    if not HF_TOKEN:
        gr.HTML("""
        <div class="warn-box">
        <strong>&#9888;&#65039; Kein HF_TOKEN gefunden.</strong>
        Diarisierung ist deaktiviert &ndash;
        füge das Token unter <em>Settings &rarr; Variables and secrets</em> als <code>HF_TOKEN</code> hinzu
        und akzeptiere die Lizenzen für
        <a href="https://huggingface.co/pyannote/speaker-diarization-3.1" target="_blank">speaker-diarization-3.1</a>
        und
        <a href="https://huggingface.co/pyannote/segmentation-3.0" target="_blank">segmentation-3.0</a>.
        </div>
        """)

    with gr.Row(equal_height=False):

        # Left column: inputs and actions.
        with gr.Column(scale=1, min_width=300):
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="numpy",
                label="Audio-Eingabe",
            )
            model_dd = gr.Dropdown(
                choices=list(ASR_MODELS.keys()),
                value="whisper-large-v3 (empfohlen)",
                label="Transkriptionsmodell",
            )
            # NOTE(review): lang_dd is not listed in the inputs of either
            # click handler below, so the selected language currently has no
            # effect on transcription.
            lang_dd = gr.Dropdown(
                choices=["de", "en", "auto"],
                value="de",
                label="Sprache",
            )
            # Diarization needs the token, so the checkbox is locked without it.
            diar_cb = gr.Checkbox(
                value=bool(HF_TOKEN),
                label="Speaker-Diarisierung aktivieren (pyannote)",
                interactive=bool(HF_TOKEN),
            )
            run_btn = gr.Button("▶ Transkribieren", variant="primary", size="lg")
            sum_btn = gr.Button("🧠 Zusammenfassen (Llama-4-Maverick)", variant="secondary", size="lg")

        # Right column: transcription results.
        with gr.Column(scale=2):
            transcript_out = gr.Textbox(
                label="📝 Rohtranskript (Whisper)",
                lines=8,
                placeholder="Das Transkript erscheint hier nach der Verarbeitung ...",
            )
            diar_out = gr.Textbox(
                label="👥 Sprecher-Transkript (pyannote)",
                lines=8,
                placeholder="Wird befüllt wenn Diarisierung aktiviert ist.",
            )

    with gr.Row():
        with gr.Column():
            summary_out = gr.Markdown(
                label="📋 Meeting-Zusammenfassung",
                value="",
            )

    gr.HTML("""
    <div class="yapper-footer">
    <span>&#9889; ZeroGPU &middot; H200</span>
    <span>&#127908; Whisper large-v3</span>
    <span>&#128101; pyannote speaker-diarization-3.1</span>
    <span>&#129504; meta-llama/Llama-4-Maverick</span>
    <span>&#128338; Quota: 1.500 Sek/Tag (PRO)</span>
    </div>
    """)

    # transcribe is a generator: its first yield shows a status line, its
    # last yield fills both textboxes.
    run_btn.click(
        fn=transcribe,
        inputs=[audio_input, model_dd, diar_cb],
        outputs=[transcript_out, diar_out],
    )

    # The summary is built from whatever the two textboxes currently contain.
    sum_btn.click(
        fn=summarize,
        inputs=[transcript_out, diar_out],
        outputs=[summary_out],
    )


demo.launch(css=CSS)