File size: 3,553 Bytes
0cf3992
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b97d874
 
 
 
 
 
 
0cf3992
af99d45
0cf3992
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import re
from core.state import AgenticState
from loguru import logger



def ts_to_seconds(ts: str) -> int:
    """Convert a colon-separated timestamp into a whole number of seconds.

    Accepts ``"MM:SS"`` (e.g. ``"1:30"`` -> 90) and ``"HH:MM:SS"``
    (e.g. ``"1:02:03"`` -> 3723).

    Args:
        ts: Timestamp text made of two or three integer fields joined by ":".

    Returns:
        The total number of seconds represented by ``ts``.

    Raises:
        ValueError: If ``ts`` does not have two or three fields, or if a
            field is not a valid integer.
    """
    fields = ts.split(":")
    if len(fields) not in (2, 3):
        raise ValueError(f"Unsupported timestamp format: {ts!r}")

    # Fold left to right in base 60 so the same loop handles both
    # minutes:seconds and hours:minutes:seconds.
    total = 0
    for field in fields:
        total = total * 60 + int(field)
    return total



@logger.catch
async def node_3_transcript_cleaning_and_normalization(state: AgenticState) -> AgenticState:
    """
    Node 3: Transcript Cleaning & Normalization

    Reads ``state.raw_transcript_text`` and, when it is non-empty:

    * strips ``[music]`` / ``[applause]`` / ``[laughter]`` markers,
    * collapses runs of ``.``, ``!`` and ``?`` into a single ``.``,
    * expands a few informal contractions (gonna, wanna, kinda, ya),
    * extracts leading ``[M:S - M:S]`` timestamps into a timestamp map,
    * extracts leading ``Speaker Name:`` labels into speaker segments.

    Results are written to ``state.cleaned_transcript``,
    ``state.cleaned_timestamp_map`` and ``state.speaker_segments``.

    When the raw transcript is missing or empty, an error record of type
    ``"missing_transcript"`` is appended to ``state.errors`` and the state is
    returned otherwise untouched.

    ``start_line`` / ``end_line`` in each speaker segment are inclusive
    indices into the lines of ``state.cleaned_transcript`` (blank input lines
    are dropped and do not count).

    Args:
        state: Pipeline state carrying the raw transcript from the previous node.

    Returns:
        The same ``state`` object, updated in place.
    """

    logger.info("🚀 Node 3: Cleaning transcript...")

    raw_text = state.raw_transcript_text

    if not raw_text:
        state.errors.append(
            {"type": "missing_transcript", "message": """
            No raw transcript from Node 2. 
            YouTube blocks IP addresses from cloud providers (Hugging Face Spaces, Streamlit Cloud, etc.).
            💡 Solution: Run this app locally with: 
                ```streamlit run app.py``` 
            or run: 
                ```docker-compose up -d``` """}
        )
        logger.error("""No raw transcript from Node 2. """)

        return state

    cleaned = raw_text

    # Remove noise
    cleaned = re.sub(r"\[(music|applause|laughter)\]", "", cleaned, flags=re.IGNORECASE)

    # Fix repeated punctuation
    cleaned = re.sub(r"[.!?]{2,}", ".", cleaned)

    # Common ASR corrections
    fixes = {
        "gonna": "going to",
        "wanna": "want to",
        "kinda": "kind of",
        "ya": "you",
    }

    for wrong, right in fixes.items():
        cleaned = re.sub(rf"\b{wrong}\b", right, cleaned, flags=re.IGNORECASE)

    cleaned_lines = []
    timestamp_map = []
    speaker_segments = []

    current_speaker = "Unknown"
    # Index (into cleaned_lines) of the first line of the current speaker's segment.
    segment_start = 0

    # Both patterns are compiled once, outside the per-line loop.
    timestamp_pattern = re.compile(r"\[(\d+:\d+)\s*-\s*(\d+:\d+)\]")
    speaker_pattern = re.compile(r"([A-Z][a-zA-Z ]{2,}):")

    for line in cleaned.split("\n"):

        line = line.strip()

        if not line:
            continue

        ts_match = timestamp_pattern.match(line)

        if ts_match:

            start_ts = ts_match.group(1)
            end_ts = ts_match.group(2)

            timestamp_map.append(
                {
                    "start": ts_to_seconds(start_ts),
                    "end": ts_to_seconds(end_ts),
                    "pretty": f"{start_ts}-{end_ts}",
                }
            )

            # Drop the timestamp prefix so only the spoken text remains.
            line = line[ts_match.end():].strip()

        speaker_match = speaker_pattern.match(line)

        if speaker_match:

            speaker = speaker_match.group(1).strip()

            if speaker != current_speaker:

                # Position this line will take in cleaned_lines. Using this
                # (instead of the raw-line index) keeps every segment in the
                # same index space as the final segment appended below.
                next_index = len(cleaned_lines)

                # Close the previous speaker's segment only if it actually
                # contains lines; this avoids an empty "Unknown" segment with
                # end_line == -1 when the very first line names a speaker.
                if next_index > segment_start:
                    speaker_segments.append(
                        {
                            "speaker": current_speaker,
                            "start_line": segment_start,
                            "end_line": next_index - 1,
                        }
                    )

                current_speaker = speaker
                segment_start = next_index

            # Drop the "Speaker:" prefix from the stored line.
            line = line[speaker_match.end():].strip()

        cleaned_lines.append(line)

    # Close the trailing segment for whoever was speaking last.
    if cleaned_lines:
        speaker_segments.append(
            {
                "speaker": current_speaker,
                "start_line": segment_start,
                "end_line": len(cleaned_lines) - 1,
            }
        )

    cleaned_transcript = "\n".join(cleaned_lines)

    state.cleaned_transcript = cleaned_transcript
    state.cleaned_timestamp_map = timestamp_map
    state.speaker_segments = speaker_segments

    logger.info(
        "✅ Node 3 complete | chars={char_count} | segments={segment_count}",
        char_count=len(cleaned_transcript),
        segment_count=len(speaker_segments)
    )

    return state