import os
import re
import numpy as np
import torch
import spaces
import gradio as gr
# ── Configuration ─────────────────────────────────────────────────────────────
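# HF_TOKEN is needed for the gated pyannote models and for the hosted summarization call in summarize().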
HF_TOKEN = os.environ.get("HF_TOKEN", "")
WHISPER_SR = 16_000
ASR_MODELS = {
"whisper-small (schnell)": "openai/whisper-small",
"whisper-medium": "openai/whisper-medium",
"whisper-large-v3 (empfohlen)": "openai/whisper-large-v3",
}
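# Module-level caches so repeated GPU calls reuse already-loaded models.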
_asr_cache: dict = {}
_diar_pipe = None
# ── Model Loading ──────────────────────────────────────────────────────────────
def get_asr(model_key: str, device: str, dtype: torch.dtype):
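    """Return a cached (processor, model) pair for the selected Whisper checkpoint.

    Each model is loaded once per process, moved to the requested device/dtype,
    and put into eval mode.
    """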
from transformers import AutoProcessor, WhisperForConditionalGeneration
model_id = ASR_MODELS[model_key]
if model_id not in _asr_cache:
processor = AutoProcessor.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(
model_id, torch_dtype=dtype,
).to(device)
model.eval()
_asr_cache[model_id] = (processor, model)
return _asr_cache[model_id]
def get_diar(device: str):
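    """Lazily load the pyannote speaker-diarization pipeline (cached in _diar_pipe).

    Requires HF_TOKEN because pyannote/speaker-diarization-3.1 is a gated model;
    the pipeline is moved to the GPU when device == "cuda".
    """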
global _diar_pipe
if _diar_pipe is None:
if not HF_TOKEN:
raise EnvironmentError(
"HF_TOKEN nicht gesetzt. FΓΌge ihn in den Space-Settings unter "
"'Settings -> Variables and secrets' hinzu."
)
from pyannote.audio import Pipeline as PyannotePipeline
_diar_pipe = PyannotePipeline.from_pretrained(
"pyannote/speaker-diarization-3.1",
token=HF_TOKEN,
)
if device == "cuda":
_diar_pipe = _diar_pipe.to(torch.device("cuda"))
return _diar_pipe
# ── Audio helpers ──────────────────────────────────────────────────────────────
def resample(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
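    """Resample audio via simple linear interpolation (np.interp), no extra dependency.

    Sufficient for speech-to-text preprocessing; a dedicated resampler
    (e.g. torchaudio) would give higher fidelity.
    """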
if orig_sr == target_sr:
return audio
new_len = int(len(audio) * target_sr / orig_sr)
return np.interp(
np.linspace(0, len(audio) - 1, new_len),
np.arange(len(audio)),
audio,
).astype(np.float32)
def chunk_audio(audio: np.ndarray, sr: int, chunk_s: int = 28, overlap_s: int = 2) -> list:
    """Split audio into windows of chunk_s seconds that overlap by overlap_s seconds.

    Returns a list of (start_sample, chunk) pairs so callers can recover the
    absolute start time of each window; 28 s keeps every window inside
    Whisper's 30 s context.
    """
    chunk_len = sr * chunk_s
    if len(audio) <= chunk_len:
        return [(0, audio)]
    step = sr * (chunk_s - overlap_s)
    return [(start, audio[start: start + chunk_len]) for start in range(0, len(audio), step)]
# ── Transcription logic ────────────────────────────────────────────────────────
def transcribe_audio(audio_16k, processor, model, device, dtype, language="de"):
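    """Transcribe 16 kHz mono audio with Whisper in overlapping ~28 s windows.

    Returns the full text plus per-segment dicts with absolute start times,
    which merge_with_speakers() later aligns with the diarization output.
    language uses Whisper's codes ("de", "en", ...); "auto" enables detection.
    """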
    full_text, all_chunks = [], []
    for start_sample, chunk in chunk_audio(audio_16k, WHISPER_SR):
        # Whisper's timestamps are relative to the window start, so derive the
        # absolute offset from the window's start sample instead of summing
        # chunk lengths (which would drift by the overlap on every window).
        offset = start_sample / WHISPER_SR
inputs = processor(chunk, sampling_rate=WHISPER_SR, return_tensors="pt")
input_features = inputs.input_features.to(device=device, dtype=dtype)
with torch.no_grad():
predicted_ids = model.generate(
input_features,
return_timestamps=True,
language="de",
)
result = processor.batch_decode(
predicted_ids,
decode_with_timestamps=True,
skip_special_tokens=False,
)[0]
result = re.sub(r"<\|(?![\d.]+\|)[^>]+\|>", "", result).strip()
ts_pattern = re.compile(r"<\|([\d.]+)\|>")
tokens = ts_pattern.split(result)
segment_start = offset
for token in tokens:
try:
segment_start = offset + float(token)
except ValueError:
text = token.strip()
if text:
all_chunks.append({"timestamp": (segment_start, segment_start + 1.0), "text": text})
full_text.append(text)
return " ".join(full_text).strip(), all_chunks
# ── Speaker diarization ────────────────────────────────────────────────────────
def unwrap_diarization(result):
"""Robust gegen verschiedene pyannote RΓΌckgabetypen (Annotation, DiarizeOutput, NamedTuple...)."""
    # Already an Annotation? Done.
if hasattr(result, "itertracks"):
return result
    # Attribute-based access (DiarizeOutput, SlidingWindowFeature, ...)
for attr in ("speaker_diarization", "exclusive_speaker_diarization", "diarization", "annotation", "output"):
val = getattr(result, attr, None)
if val is not None and hasattr(val, "itertracks"):
return val
    # NamedTuple: take the first field that exposes itertracks
if hasattr(result, "_fields"):
for val in result:
if hasattr(val, "itertracks"):
return val
    # Dict-style access
for key in ("diarization", "annotation"):
try:
val = result[key]
if hasattr(val, "itertracks"):
return val
except (KeyError, TypeError, IndexError):
pass
    # Last resort: return as-is and hope for the best
return result
def merge_with_speakers(chunks, diarization):
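    """Assign each transcript segment the speaker whose diarization turn overlaps it most.

    Segments with no overlapping turn keep the fallback label "Unbekannt".
    """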
diarization = unwrap_diarization(diarization)
merged = []
for chunk in chunks:
start, end = chunk["timestamp"]
end = end or (start + 1.0)
best_speaker, best_overlap = "Unbekannt", 0.0
for turn, _, speaker in diarization.itertracks(yield_label=True):
overlap = max(0.0, min(end, turn.end) - max(start, turn.start))
if overlap > best_overlap:
best_overlap, best_speaker = overlap, speaker
merged.append((start, end, best_speaker, chunk["text"].strip()))
return merged
def format_diarized(segments):
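    """Format (start, end, speaker, text) segments as text blocks, merging
    consecutive segments of the same speaker under one "[speaker] start" line."""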
if not segments:
return ""
lines, cur_speaker, cur_start, cur_texts = [], None, 0.0, []
for start, _end, speaker, text in segments:
if speaker != cur_speaker:
if cur_speaker is not None:
lines.append(f"[{cur_speaker}] {cur_start:.1f}s\n{' '.join(cur_texts)}")
cur_speaker, cur_start, cur_texts = speaker, start, [text]
else:
cur_texts.append(text)
if cur_speaker and cur_texts:
lines.append(f"[{cur_speaker}] {cur_start:.1f}s\n{' '.join(cur_texts)}")
return "\n\n".join(lines)
# ── Summarization ─────────────────────────────────────────────────────────────
SYSTEM_PROMPT = """Du bist ein strukturierter technischer Projektmanager und Assistent. Deine Aufgabe ist es, das folgende Transkript eines Entwickler-Sync-Calls (z.B. aus Microsoft Teams) prΓ€zise und ΓΌbersichtlich zusammenzufassen.
Besonderheiten des Transkripts:
* Es kann sich um rohe, unstrukturierte Sprache mit vielen FΓΌllwΓΆrtern ("Ja", "Genau", "Γ„hm") handeln.
* Das Transkript kann einseitig sein (nur ein Sprecher wurde aufgenommen). Falls das der Fall ist, rekonstruiere den fehlenden Kontext logisch aus den Antworten des aufgenommenen Sprechers.
* GedankensprΓΌnge sind normal. BΓΌndele die Informationen thematisch, nicht chronologisch.
GewΓΌnschtes Ausgabeformat (in Markdown):
Bitte strukturiere deine Antwort in die folgenden Kategorien. Lasse Kategorien weg, falls es im Text keine passenden Informationen dazu gibt.
* 🎯 Kernpunkte & Entscheidungen: Was war der Hauptgrund des GesprÀchs? Welche Entscheidungen wurden getroffen?
* πŸ’» Code & Technik: Welche Repositories, Branches, Tools (z.B. SonarQube, Maven) oder spezifischen technischen Probleme wurden besprochen?
* βœ… Action Items (To-Dos): Wer macht was als NΓ€chstes? Bitte ordne die Aufgaben klar zu (z.B. "Sprecher 1 kΓΌmmert sich um...", "Kollege soll Feedback geben zu...").
* πŸ“… Orga, Termine & Dailys: Wurden Meetings verschoben? Gibt es Absprachen fΓΌr das nΓ€chste Daily oder private/teaminterne Events?
Tonfall: Sachlich, klar und direkt."""
def summarize(transcript: str, diarized: str) -> str:
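    """Summarize the meeting transcript with a hosted chat model via Hugging Face Inference Providers.

    Prefers the speaker-labelled transcript when it exists and is not an error
    message; returns a warning string instead of raising on failures.
    """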
if not HF_TOKEN:
return "⚠️ HF_TOKEN fehlt – Zusammenfassung nicht mΓΆglich."
# Nutze diarisiertes Transkript wenn verfΓΌgbar, sonst Roh-Transkript
text = diarized.strip() if diarized.strip() and not diarized.startswith("Diarisierung") else transcript.strip()
if not text:
return "⚠️ Kein Transkript vorhanden – bitte zuerst transkribieren."
try:
from huggingface_hub import InferenceClient
client = InferenceClient(
provider="novita",
api_key=HF_TOKEN,
)
response = client.chat.completions.create(
model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
messages=[
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": f"Hier ist das Transkript:\n\n{text}"},
],
max_tokens=2048,
)
return response.choices[0].message.content
except Exception as e:
return f"⚠️ Zusammenfassung fehlgeschlagen: {e}"
# ── Main pipeline ──────────────────────────────────────────────────────────────
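# spaces.GPU requests a ZeroGPU allocation per call; duration is the time budget in seconds for that call.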
@spaces.GPU(duration=60)
def run_pipeline(audio_array, sample_rate, model_key, language, use_diar):
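    """Resample, transcribe and optionally diarize one recording on the GPU.

    Returns (raw transcript, speaker-labelled transcript); the second value is
    an empty string or an error message when diarization is off or fails.
    """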
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32
audio_16k = resample(audio_array, sample_rate, WHISPER_SR)
processor, model = get_asr(model_key, device, dtype)
    raw_transcript, chunks = transcribe_audio(audio_16k, processor, model, device, dtype, language)
if not use_diar:
return raw_transcript, ""
try:
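        # pyannote pipelines accept in-memory audio as a dict with a (channels, samples) tensor and the sample rate.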
waveform = torch.tensor(audio_array).unsqueeze(0).float()
diar_input = {"waveform": waveform, "sample_rate": sample_rate}
diar = get_diar(device)
diarization = diar(diar_input)
segments = merge_with_speakers(chunks, diarization)
labeled = format_diarized(segments)
return raw_transcript, labeled or "(Keine Sprecher erkannt.)"
except EnvironmentError as e:
return raw_transcript, f"Fehler: {e}"
except Exception as e:
return raw_transcript, f"Diarisierung fehlgeschlagen: {e}"
# ── Gradio handlers ───────────────────────────────────────────────────────────
MAX_DURATION_S = 1200 # 20 Min. Audio
def transcribe(audio, model_key, language, use_diar):
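    """Gradio handler: validate and normalize the input audio, then run the pipeline.

    Implemented as a generator so the UI can show a status message while the
    ZeroGPU slot is being acquired.
    """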
if audio is None:
yield "Kein Audio eingegeben.", ""
return
sample_rate, audio_data = audio
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)  # downmix stereo to mono
    audio_data = audio_data.astype(np.float32)
    if np.abs(audio_data).max() > 1.0:
        audio_data /= 32768.0  # int16 PCM range -> [-1.0, 1.0]
duration_s = len(audio_data) / sample_rate
if duration_s > MAX_DURATION_S:
yield (
f"Audio ist {duration_s:.0f}s lang – maximal {MAX_DURATION_S}s (20 Min.) erlaubt.",
""
)
return
yield "GPU wird angefordert ...", ""
    transcript, labeled = run_pipeline(audio_data, sample_rate, model_key, language, use_diar)
yield transcript, labeled
# ── Teams CSS ─────────────────────────────────────────────────────────────────
CSS = """
:root {
--t-purple: #6264A7;
--t-purple-dark: #464775;
--t-purple-light: #E8EBFA;
--t-purple-mid: #9EA2D4;
--t-bg: #F0F2F8;
--t-card: #FFFFFF;
--t-text: #242424;
--t-muted: #616161;
--t-border: #E1E4F0;
}
body, .gradio-container {
background: var(--t-bg) !important;
font-family: "Segoe UI", system-ui, -apple-system, sans-serif !important;
}
.yapper-header {
background: linear-gradient(135deg, var(--t-purple-dark) 0%, var(--t-purple) 65%, var(--t-purple-mid) 100%);
border-radius: 12px;
padding: 24px 28px;
margin-bottom: 16px;
box-shadow: 0 4px 20px rgba(70,71,117,.28);
display: flex;
align-items: center;
gap: 18px;
color: #fff;
}
.yapper-header .icon { font-size: 2.8rem; line-height: 1; }
.yapper-header h1 { margin: 0 !important; font-size: 1.7rem !important; font-weight: 700 !important; letter-spacing: -.3px !important; color: #fff !important; }
.yapper-header p { margin: 4px 0 0 !important; font-size: .87rem !important; opacity: .9 !important; color: #fff !important; }
.yapper-header .badge {
margin-left: auto;
background: rgba(255,255,255,.2);
border-radius: 20px;
padding: 5px 14px;
font-size: .75rem;
font-weight: 600;
letter-spacing: .4px;
white-space: nowrap;
}
.warn-box {
background: #FFF4CE;
border: 1px solid #F9C642;
border-left: 4px solid #F9C642;
border-radius: 6px;
padding: 10px 14px;
font-size: .85rem;
color: #7A5800;
margin-bottom: 12px;
}
.warn-box a { color: var(--t-purple); }
label > span {
font-weight: 600 !important;
font-size: .77rem !important;
text-transform: uppercase !important;
letter-spacing: .5px !important;
color: var(--t-purple-dark) !important;
}
input, select, textarea {
border-color: var(--t-border) !important;
border-radius: 6px !important;
}
input:focus, select:focus, textarea:focus {
border-color: var(--t-purple) !important;
box-shadow: 0 0 0 2px var(--t-purple-light) !important;
outline: none !important;
}
button.primary {
background: var(--t-purple) !important;
border-radius: 6px !important;
border: none !important;
font-weight: 600 !important;
font-size: .95rem !important;
box-shadow: 0 2px 8px rgba(98,100,167,.35) !important;
transition: background .15s, box-shadow .15s !important;
}
button.primary:hover {
background: var(--t-purple-dark) !important;
box-shadow: 0 4px 16px rgba(98,100,167,.45) !important;
}
button.secondary {
background: var(--t-purple-light) !important;
color: var(--t-purple-dark) !important;
border: 1.5px solid var(--t-purple-mid) !important;
border-radius: 6px !important;
font-weight: 600 !important;
font-size: .95rem !important;
transition: background .15s !important;
}
button.secondary:hover {
background: var(--t-purple-mid) !important;
color: #fff !important;
}
input[type=checkbox]:checked { accent-color: var(--t-purple) !important; }
textarea {
font-size: .88rem !important;
line-height: 1.65 !important;
color: var(--t-text) !important;
background: #FAFBFF !important;
}
.yapper-footer {
font-size: .75rem;
color: var(--t-muted);
border-top: 1px solid var(--t-border);
padding-top: 10px;
margin-top: 6px;
display: flex;
gap: 20px;
flex-wrap: wrap;
}
"""
# ── UI ────────────────────────────────────────────────────────────────────────
with gr.Blocks(title="Yapper (lite) - Meeting Transcriber", css=CSS) as demo:
gr.HTML("""
<div class="yapper-header">
<div class="icon">πŸŽ™οΈ</div>
<div>
<h1 style="margin:0;font-size:1.7rem;font-weight:700;color:#ffffff;letter-spacing:-.3px;">Yapper (lite)</h1>
<p style="margin:4px 0 0;font-size:.87rem;color:#ffffff;opacity:.9;">Transkription &amp; Speaker-Diarisierung &nbsp;&middot;&nbsp; fΓΌr eure Teams-Meetings</p>
</div>
<div class="badge">&#x26A1; ZeroGPU</div>
</div>
""")
if not HF_TOKEN:
gr.HTML("""
<div class="warn-box">
<strong>&#x26A0;&#xFE0F; Kein HF_TOKEN gefunden.</strong>
Diarisierung ist deaktiviert &ndash;
        füge das Token unter <em>Settings &rarr; Variables and secrets</em> als <code>HF_TOKEN</code> hinzu
        und akzeptiere die Lizenzen für
<a href="https://huggingface.co/pyannote/speaker-diarization-3.1" target="_blank">speaker-diarization-3.1</a>
und
<a href="https://huggingface.co/pyannote/segmentation-3.0" target="_blank">segmentation-3.0</a>.
</div>
""")
with gr.Row(equal_height=False):
with gr.Column(scale=1, min_width=300):
audio_input = gr.Audio(
sources=["microphone", "upload"],
type="numpy",
label="Audio-Eingabe",
)
model_dd = gr.Dropdown(
choices=list(ASR_MODELS.keys()),
value="whisper-large-v3 (empfohlen)",
label="Transkriptionsmodell",
)
lang_dd = gr.Dropdown(
choices=["de", "en", "auto"],
value="de",
label="Sprache",
)
diar_cb = gr.Checkbox(
value=bool(HF_TOKEN),
label="Speaker-Diarisierung aktivieren (pyannote)",
interactive=bool(HF_TOKEN),
)
run_btn = gr.Button("β–Ά Transkribieren", variant="primary", size="lg")
sum_btn = gr.Button("🧠 Zusammenfassen (Llama-4-Maverick)", variant="secondary", size="lg")
with gr.Column(scale=2):
transcript_out = gr.Textbox(
label="πŸ“ Rohtranskript (Whisper)",
lines=8,
placeholder="Das Transkript erscheint hier nach der Verarbeitung ...",
)
diar_out = gr.Textbox(
label="πŸ‘₯ Sprecher-Transkript (pyannote)",
lines=8,
placeholder="Wird befΓΌllt wenn Diarisierung aktiviert ist.",
)
    # ── Summary ──
with gr.Row():
with gr.Column():
summary_out = gr.Markdown(
label="πŸ“‹ Meeting-Zusammenfassung",
value="",
)
gr.HTML("""
<div class="yapper-footer">
<span>&#x26A1; ZeroGPU &middot; H200</span>
<span>&#x1F3A4; Whisper large-v3</span>
<span>&#x1F465; pyannote speaker-diarization-3.1</span>
<span>&#x1F9E0; openai/Llama-4-Maverick</span>
<span>&#x1F552; Quota: 1.500 Sek/Tag (PRO)</span>
</div>
""")
run_btn.click(
fn=transcribe,
        inputs=[audio_input, model_dd, lang_dd, diar_cb],
outputs=[transcript_out, diar_out],
)
sum_btn.click(
fn=summarize,
inputs=[transcript_out, diar_out],
outputs=[summary_out],
)
demo.launch()