import logging
import os
import tempfile
from nltk.tokenize import sent_tokenize
import nltk.data
import re
from utils import clean_transcript, consolidate_similar_items, chunk_text, load_labels
import config
from models import load_diarization
import wave
import gc
import torch
import time


logger = logging.getLogger(__name__)

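# The diarization model is loaded lazily and cached at module level so it is
# only initialized once per process.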
_diarize_model = None

def process_transcription(audio_content: bytes, whisper_model, enable_diarization=False): 
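    """Transcribe raw audio bytes with the given Whisper model.

    When enable_diarization is True and a diarization model can be loaded,
    each transcribed segment is assigned the speaker whose diarization turn
    overlaps it the most. Returns (transcript, info, diarized_segments);
    diarized_segments is None when diarization is disabled or unavailable.
    """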
    start = time.time()
    if not whisper_model:
        raise ValueError("Whisper model not loaded.")

    temp_file_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
            temp_file_path = temp_file.name
            temp_file.write(audio_content)
    
        segments_gen, info = whisper_model.transcribe(temp_file_path, beam_size=5)

        segments = list(segments_gen)

        transcript = " ".join([seg.text.strip() for seg in segments])

        
        global _diarize_model

        if not enable_diarization:
            return transcript, info, None

        if _diarize_model is None:
            _diarize_model = load_diarization(config)
            
        if _diarize_model is None:
            logger.warning("Diarization model not available, returning transcript without speakers")
            return transcript, info, None
        
        with wave.open(temp_file_path, 'rb') as wav:
            frames = wav.getnframes()
            rate = wav.getframerate()
            # Calculate audio duration in seconds
            audio_duration = frames / float(rate)


        if audio_duration < 3.0:
            logger.info(f"Audio too short ({audio_duration:.2f}s), skipping diarization")
            diarized_segments = [{"speaker": "SPEAKER_0", "text": transcript}]
            diarized_transcript = f"[SPEAKER_0]: {transcript}"
            return diarized_transcript, info, diarized_segments
        

        logger.info("Running speaker diarization")
        diarization = _diarize_model(temp_file_path)

        # Extract diarization segments
        diarize_segments = []
        for turn, _, speaker in diarization.itertracks(yield_label=True):
            diarize_segments.append({
                "speaker": f"SPEAKER_{speaker.replace('SPEAKER_', '')}",
                "start": turn.start,
                "end": turn.end
            })
        
        diarized_segments = []

        for segment in segments:
            # Find best matching speaker based on time overlap
            best_speaker = None
            max_overlap = 0
            seg_start = segment.start
            seg_end = segment.end
            
            for diar_seg in diarize_segments:
                diar_start = diar_seg["start"]
                diar_end = diar_seg["end"]
                # Calculate overlap
                overlap_start = max(seg_start, diar_start)
                overlap_end = min(seg_end, diar_end)
                
                if overlap_end > overlap_start:
                    overlap = overlap_end - overlap_start
                    if overlap > max_overlap:
                        max_overlap = overlap
                        best_speaker = diar_seg["speaker"]
            
            # If no overlap found, assign to the closest speaker
            if best_speaker is None:
                min_distance = float('inf')
                for diar_seg in diarize_segments:
                    # Distance to start of segment
                    dist_start = abs(seg_start - diar_seg["start"])
                    # Distance to end of segment
                    dist_end = abs(seg_end - diar_seg["end"])
                    # Take the minimum
                    dist = min(dist_start, dist_end)
                    
                    if dist < min_distance:
                        min_distance = dist
                        best_speaker = diar_seg["speaker"]
            
            diarized_segments.append({
                "speaker": best_speaker or "SPEAKER_UNKNOWN",
                "text": segment.text,
                "start": segment.start,
                "end": segment.end
            })

        # Format diarized transcript
        diarized_transcript = ""
        current_speaker = None
        
        for segment in diarized_segments:
            speaker = segment["speaker"]
            text = segment["text"].strip()
            
            if not text:
                continue
                
            if speaker != current_speaker:
                diarized_transcript += f"\n[{speaker}]: {text}"
                current_speaker = speaker
            else:
                diarized_transcript += f" {text}"
        
        # Clean up memory
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        
        end = time.time()

        logger.info("Transcription with diarization took %.0f ms", (end - start) * 1000)
        
        return diarized_transcript, info, diarized_segments

    finally:
        if temp_file_path and os.path.exists(temp_file_path):
            os.remove(temp_file_path)

def process_summary(text: str, summarizer_pipeline, nlp_spacy, config):
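    """Build a structured, emoji-formatted meeting summary from a transcript.

    The transcript is cleaned, split into sentences, and classified with
    regex heuristics into categories (title, participants, action items,
    decisions, deadlines, questions, intro, conclusion, topics). An overall
    summary is produced by running the summarizer pipeline over text chunks.
    Returns the formatted summary as a single string.
    """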
    if not summarizer_pipeline:
        raise ValueError("Summarizer model not loaded.")

    # Clean the transcript
    cleaned_transcript = clean_transcript(text)
    
    processed_text = cleaned_transcript
    
    doc = None

    if nlp_spacy:
        try:
            doc = nlp_spacy(processed_text)
            sentences = [sent.text.strip() for sent in doc.sents]
            processed_text = " ".join(sentences)
        except Exception as e:
            logger.error(f"SpaCy processing failed: {e}", exc_info=True)
    
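    # Buckets filled in by the sentence-level heuristics below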
    categories = {
        "meeting_title": [],
        "intro": [],
        "topics": [],
        "decisions": [],
        "action_items": [],
        "questions": [],
        "deadlines": [],
        "participants": [],
        "overall_summary": [],
        "conclusion": []
    }

    # Extract the meeting title
    title_pattern = r'(meeting|call|session|discussion) (about|on|for|regarding) ([^.]+)'
    title_matches = re.findall(title_pattern, processed_text, re.IGNORECASE)

    if title_matches:
        categories["meeting_title"].append(title_matches[0][2].strip())

    if doc:
        sentences = [sent.text.strip() for sent in doc.sents]
    else:
        try:
            # Load the punkt tokenizer from NLTK's data path instead of a hard-coded local file
            tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
            sentences = tokenizer.tokenize(processed_text)
        except Exception as e:
            logger.error(f"NLTK tokenization failed: {e}", exc_info=True)
            sentences = sent_tokenize(processed_text)
    
    # Find participants
    people = set()
    if doc:
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                person = ent.text.strip()
                if len(person) > 2:
                    people.add(person)
    
    if people:
        categories["participants"] = list(people)

    try:
        # Chunk the text to stay within the BART model's input length limit
        logger.info("Summarizing transcript preview: %s", processed_text[:100])
        chunks = chunk_text(processed_text)
        parts_summaries = []

        for chunk in chunks:
            result = summarizer_pipeline(chunk, max_length=150, min_length=30, do_sample=False)
            if result and isinstance(result, list) and len(result) > 0:
                part_summary = result[0].get('summary_text', '')
                if part_summary:
                    parts_summaries.append(part_summary)
        
        overall_summary = " ".join(parts_summaries)
        if len(parts_summaries) > 1:
            # Condense the per-chunk summaries with a second summarization pass
            overall_summary = summarizer_pipeline(overall_summary, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
        categories["overall_summary"] = [overall_summary]

    except Exception as e:
        logger.error(f"Summarization failed: {e}", exc_info=True)
        categories["overall_summary"] = ["Failed to generate overall summary."]
    
    # Process each sentence
    for i, sentence in enumerate(sentences):
        sentence = sentence.strip()
        if not sentence:
            continue
            
        # Check for action items
        if (re.search(r'(need to|will|shall|must|should|have to|assigned to|responsible for|task|action item|to-do|follow up|take care of)', 
                      sentence, re.IGNORECASE) and 
            re.search(r'(we|you|I|they|he|she|team|group|department)', sentence, re.IGNORECASE)):

            categories["action_items"].append(sentence)
            continue
            
        # Check for decisions
        if re.search(r'(decided|agreed|conclusion|resolved|approved|rejected|consensus|finalized|confirmed|determined)', 
                    sentence, re.IGNORECASE):
            categories["decisions"].append(sentence)
            continue
            
        # Check for deadlines/timing with stronger patterns
        if re.search(r'(by|due|deadline|schedule|date|tomorrow|next week|month|calendar|remind|upcoming|on|at|until)', 
                    sentence, re.IGNORECASE) and re.search(r'(time|day|week|month|year|hour|minute)', sentence, re.IGNORECASE):
            categories["deadlines"].append(sentence)
            continue
            
        # Check for questions/issues
        if (re.search(r'(\?|issue|problem|concern|question|clarif|wonder|how|what|when|where|why|who)', 
                     sentence, re.IGNORECASE) and 
            not re.search(r'(answer|answered|resolved|solved)', sentence, re.IGNORECASE)):
            categories["questions"].append(sentence)
            continue
            
        # Check for intro statements
        if i < len(sentences) // 10:  # First 10% of sentences
            if re.search(r'(welcome|begin|start|agenda|today|discuss|meeting|introduce|opening|good morning|hello|topic)', 
                        sentence, re.IGNORECASE):
                categories["intro"].append(sentence)
                continue
                
        # Check for conclusion statements
        if i > len(sentences) * 9 // 10:  # Last 10% of sentences
            if re.search(r'(conclude|end|wrap|summary|thank|next meeting|follow up|adjourn|goodbye|bye|closing)', 
                        sentence, re.IGNORECASE):
                categories["conclusion"].append(sentence)
                continue
                
        # Everything else is considered a topic if it has substance
        if len(sentence.split()) > 3:  # Avoid very short sentences
            categories["topics"].append(sentence)
            
    # Process categories to avoid repetition and consolidate related points
    for category in categories:
        if category in ["topics", "action_items", "decisions", "questions", "deadlines"]:
            categories[category] = consolidate_similar_items(categories[category])
    
    # Limit the number of topics to avoid overwhelming
    if len(categories["topics"]) > 10:
        # If we have a summarizer, try to generate a summary of topics
        try:
            topics_text = " ".join(categories["topics"])
            topics_summary = summarizer_pipeline(topics_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text']
            categories["topics"] = sent_tokenize(topics_summary)
        except Exception as e:
            logger.error(f"Topics summarization failed: {e}", exc_info=True)
            # Otherwise just take the first few and last few topics
            categories["topics"] = categories["topics"][:5] + categories["topics"][-5:]
    
    # Add emojis to formatted output
    formatted_summary = []
    
    # Format meeting title if available
    if categories.get("meeting_title"):
        formatted_summary.append(f"πŸ“ **Meeting Title:** {categories['meeting_title'][0]}")
        formatted_summary.append("")
    
    # Add overall summary
    if categories.get("overall_summary"):
        formatted_summary.append("πŸ“‹ **Executive Summary:**")
        formatted_summary.append(categories["overall_summary"][0])
        formatted_summary.append("")
    
    # Format participants
    if categories["participants"]:
        formatted_summary.append("πŸ‘₯ **Participants:**")
        formatted_summary.append(", ".join(categories["participants"]))
        formatted_summary.append("")
    
    # Format intro
    if categories["intro"]:
        formatted_summary.append("🎯 **Meeting Introduction:**")
        formatted_summary.append(" ".join(categories["intro"]))
        formatted_summary.append("")
    
    # Format main topics
    if categories["topics"]:
        formatted_summary.append("πŸ’‘ **Key Topics:**")
        for i, topic in enumerate(categories["topics"], 1):
            formatted_summary.append(f"{i}. {topic}")
        formatted_summary.append("")
    
    # Format decisions
    if categories["decisions"]:
        formatted_summary.append("βœ… **Decisions Made:**")
        for decision in categories["decisions"]:
            formatted_summary.append(f"β€’ {decision}")
        formatted_summary.append("")
    
    # Format action items
    if categories["action_items"]:
        formatted_summary.append("πŸ“‹ **Action Items:**")
        for item in categories["action_items"]:
            formatted_summary.append(f"β€’ {item}")
        formatted_summary.append("")
    
    # Format questions
    if categories["questions"]:
        formatted_summary.append("❓ **Questions & Concerns:**")
        for question in categories["questions"]:
            formatted_summary.append(f"β€’ {question}")
        formatted_summary.append("")
    
    # Format deadlines
    if categories["deadlines"]:
        formatted_summary.append("⏰ **Deadlines & Timing:**")
        for deadline in categories["deadlines"]:
            formatted_summary.append(f"β€’ {deadline}")
        formatted_summary.append("")
    
    # Format conclusion
    if categories["conclusion"]:
        formatted_summary.append("🏁 **Conclusion:**")
        formatted_summary.append(" ".join(categories["conclusion"]))
    
    return "\n".join(formatted_summary)


def create_enhanced_summary_prompt(transcript: str, language_name: str) -> str:
    """
    Creates a single, dynamic and insistent prompt that instructs the AI 
    to output its findings in the specified language.
    """
    return f"""
    You are an expert AI assistant. Your task is to analyze the following meeting transcript and extract key information into a structured JSON object.

    **Primary Goal:** Analyze the provided transcript and generate a structured summary.

    **CRITICAL LANGUAGE INSTRUCTION:** All text in your final JSON response must be written in the following language: **{language_name}**. There are no exceptions.

    **ANALYSIS INSTRUCTIONS:**
    1.  Read the entire transcript to understand its context.
    2.  Identify a concise title for the meeting.
    3.  Identify all participants mentioned.
    4.  Write a brief paragraph summarizing the core themes and outcomes.
    5.  List all clear and agreed-upon decisions.
    6.  Extract all clear action items, identifying the task, who it was assigned to, the due date if mentioned, and the context.

    **OUTPUT INSTRUCTIONS:**
    - Respond ONLY with a valid JSON object.
    - The JSON must use these exact keys: "meeting_title", "participants", "meeting_summary", "decisions_made", "action_items".
    - **Language Check:** Before you finalize your response, verify that every single string value within the JSON is written in **{language_name}**.

    **TRANSCRIPT TO ANALYZE:**
    \"\"\"
    {transcript}
    \"\"\"
    """


def format_summary_to_markdown(summary_json: dict, language_code: str = "en") -> str:
    """Converts the structured JSON summary using labels from language files."""
    # Load the correct set of labels
    labels = load_labels(language_code)
    
    summary_data = {k.lower().replace(" ", "_"): v for k, v in summary_json.items()}

    # Use the loaded labels for all user-facing text
    if not summary_data.get("meeting_summary") and not summary_data.get("decisions_made") and not summary_data.get("action_items"):
        return labels["no_content"]

    markdown_parts = []

    if title := summary_data.get("meeting_title"):
        markdown_parts.append(f"### {title}\n")

    if summary := summary_data.get("meeting_summary"):
        markdown_parts.append(labels["summary_header"])
        markdown_parts.append(summary)
        markdown_parts.append("")

    if decisions := summary_data.get("decisions_made"):
        markdown_parts.append(labels["decisions_header"])
        for decision in decisions:
            markdown_parts.append(f"- {decision}")
        markdown_parts.append("")

    if action_items := summary_data.get("action_items"):
        markdown_parts.append(labels["actions_header"])
        for item in action_items:
            task = item.get('task', 'N/A')
            assigned_to = item.get('assigned_to', labels["not_specified"])
            due_date = item.get('due_date', labels["not_specified"])
            context = item.get('context', '')

            markdown_parts.append(f"- **{labels['task_label']}**: {task}")
            markdown_parts.append(f"  - **{labels['assigned_to_label']}**: {assigned_to}")
            markdown_parts.append(f"  - **{labels['due_date_label']}**: {due_date}")
            if context:
                markdown_parts.append(f"  - **{labels['context_label']}**: {context}")
        markdown_parts.append("")

    return "\n".join(markdown_parts)