Spaces:
Sleeping
Sleeping
File size: 3,553 Bytes
0cf3992 b97d874 0cf3992 af99d45 0cf3992 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 | import re
from core.state import AgenticState
from loguru import logger
def ts_to_seconds(ts: str) -> int:
    """Convert a colon-separated timestamp into a whole number of seconds.

    Both ``"MM:SS"`` and ``"HH:MM:SS"`` are accepted, so timestamps from
    videos longer than an hour no longer fail. Fields do not need to be
    zero-padded (``"1:02"`` is 62 seconds).

    Args:
        ts: Timestamp text made of two or three integer fields separated
            by ``":"``.

    Returns:
        The total number of seconds represented by ``ts``.

    Raises:
        ValueError: If ``ts`` does not have two or three fields, or if a
            field is not an integer.
    """
    parts = ts.split(":")
    if len(parts) not in (2, 3):
        raise ValueError(f"expected 'MM:SS' or 'HH:MM:SS', got {ts!r}")

    # Fold left-to-right in base 60: each earlier field is worth 60x the next.
    seconds = 0
    for part in parts:
        seconds = seconds * 60 + int(part)
    return seconds
# Bracketed caption noise such as "[Music]" or "[applause]".
_NOISE_RE = re.compile(r"\[(music|applause|laughter)\]", re.IGNORECASE)
# Runs of two or more sentence terminators ("...", "?!", "!!").
_REPEATED_PUNCT_RE = re.compile(r"[.!?]{2,}")
# Leading "[m:ss - m:ss]" time range at the start of a line.
_TIMESTAMP_RE = re.compile(r"\[(\d+:\d+)\s*-\s*(\d+:\d+)\]")
# Leading "Speaker Name:" label at the start of a line.
_SPEAKER_RE = re.compile(r"([A-Z][a-zA-Z ]{2,}):")

# Colloquial contractions expanded to their written form.
_ASR_FIXES = {
    "gonna": "going to",
    "wanna": "want to",
    "kinda": "kind of",
    "ya": "you",
}

# User-facing explanation stored in ``state.errors`` when no transcript exists.
_MISSING_TRANSCRIPT_MESSAGE = (
    "\n"
    "No raw transcript from Node 2.\n"
    "YouTube blocks IP addresses from cloud providers "
    "(Hugging Face Spaces, Streamlit Cloud, etc.).\n"
    "💡 Solution: Run this app locally with:\n"
    "```streamlit run app.py```\n"
    "or run:\n"
    "```docker-compose up -d``` "
)


def _normalize_transcript_text(raw_text: str) -> str:
    """Strip caption noise, collapse repeated punctuation and expand contractions."""
    text = _NOISE_RE.sub("", raw_text)
    text = _REPEATED_PUNCT_RE.sub(".", text)
    for wrong, right in _ASR_FIXES.items():
        text = re.sub(rf"\b{re.escape(wrong)}\b", right, text, flags=re.IGNORECASE)
    return text


def _parse_transcript_lines(text: str) -> tuple:
    """Split normalized text into content lines, timestamp entries and speaker segments.

    Returns:
        A ``(cleaned_lines, timestamp_map, speaker_segments)`` tuple.
        ``start_line`` / ``end_line`` in every speaker segment are inclusive
        indices into ``cleaned_lines``.
    """
    cleaned_lines = []
    timestamp_map = []
    speaker_segments = []
    current_speaker = "Unknown"
    segment_start = 0

    for raw_line in text.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        ts_match = _TIMESTAMP_RE.match(line)
        if ts_match:
            start_ts, end_ts = ts_match.groups()
            timestamp_map.append(
                {
                    "start": ts_to_seconds(start_ts),
                    "end": ts_to_seconds(end_ts),
                    "pretty": f"{start_ts}-{end_ts}",
                }
            )
            line = line[ts_match.end():].strip()

        speaker_match = _SPEAKER_RE.match(line)
        if speaker_match:
            speaker = speaker_match.group(1).strip()
            if speaker != current_speaker:
                # Index of the line about to be appended. Using the
                # cleaned-line count (not the raw-line index, which also
                # counts skipped blank lines) keeps every segment in the
                # same coordinate system as the final one.
                next_index = len(cleaned_lines)
                # Skip empty segments, e.g. the initial "Unknown" speaker
                # when the very first line already carries a label.
                if next_index > segment_start:
                    speaker_segments.append(
                        {
                            "speaker": current_speaker,
                            "start_line": segment_start,
                            "end_line": next_index - 1,
                        }
                    )
                current_speaker = speaker
                segment_start = next_index
            line = line[speaker_match.end():].strip()

        cleaned_lines.append(line)

    # Close the segment that was still open when the input ended.
    if cleaned_lines:
        speaker_segments.append(
            {
                "speaker": current_speaker,
                "start_line": segment_start,
                "end_line": len(cleaned_lines) - 1,
            }
        )

    return cleaned_lines, timestamp_map, speaker_segments


@logger.catch
async def node_3_transcript_cleaning_and_normalization(state: AgenticState) -> AgenticState:
    """
    Node 3: Transcript Cleaning & Normalization

    Reads ``state.raw_transcript_text``. If it is empty, a
    ``missing_transcript`` entry is appended to ``state.errors`` and the
    state is returned unchanged otherwise.

    On success the following attributes are written:

    * ``state.cleaned_transcript`` - content lines joined with newlines,
      with timestamp ranges and speaker labels stripped.
    * ``state.cleaned_timestamp_map`` - one dict per ``[m:ss - m:ss]``
      prefix found, with ``start`` / ``end`` in seconds and a ``pretty``
      label.
    * ``state.speaker_segments`` - dicts with ``speaker``, ``start_line``
      and ``end_line``; the line numbers are inclusive indices into the
      lines of ``cleaned_transcript``.

    Args:
        state: Pipeline state carrying the raw transcript from Node 2.

    Returns:
        The same ``state`` object, updated in place.

    NOTE(review): the function is wrapped in ``logger.catch``; with loguru's
    defaults an unexpected exception is logged and ``None`` is returned
    instead of the state - confirm callers tolerate that.
    """
    logger.info("🚀 Node 3: Cleaning transcript...")

    raw_text = state.raw_transcript_text
    if not raw_text:
        state.errors.append(
            {"type": "missing_transcript", "message": _MISSING_TRANSCRIPT_MESSAGE}
        )
        logger.error("""No raw transcript from Node 2. """)
        return state

    cleaned_lines, timestamp_map, speaker_segments = _parse_transcript_lines(
        _normalize_transcript_text(raw_text)
    )
    cleaned_transcript = "\n".join(cleaned_lines)

    state.cleaned_transcript = cleaned_transcript
    state.cleaned_timestamp_map = timestamp_map
    state.speaker_segments = speaker_segments

    logger.info(
        "✅ Node 3 complete | chars={char_count} | segments={segment_count}",
        char_count=len(cleaned_transcript),
        segment_count=len(speaker_segments),
    )
    return state
|