# Spaces: devml33 / awalit (Sleeping) - File size: 17,541 Bytes

import logging
import os
import tempfile
from nltk.tokenize import sent_tokenize
import pickle
import re
from utils import clean_transcript, consolidate_similar_items, chunk_text, load_labels
import config
from models import load_diarization
import wave
import gc
import torch
import time


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Cached diarization model. It starts empty and is filled by
# process_transcription() on the first request that enables diarization,
# using load_diarization(config).
_diarize_model = None

def _audio_duration_seconds(path, info):
    """Return the audio length in seconds, or None when it cannot be determined.

    The upload is always written to disk with a ".wav" suffix, whatever its
    real container is, so the ``wave`` module may not be able to parse it.
    In that case fall back to ``info.duration`` when the transcription info
    exposes one.
    NOTE(review): assumes ``info.duration`` is expressed in seconds - confirm
    against the transcription backend in use.
    """
    try:
        with wave.open(path, "rb") as wav:
            return wav.getnframes() / float(wav.getframerate())
    except (wave.Error, EOFError):
        return getattr(info, "duration", None)


def _pick_speaker(seg_start, seg_end, speaker_turns):
    """Return the speaker whose turn best matches the interval, or None.

    The turn with the largest positive time overlap wins (first one on ties).
    When no turn overlaps, the turn separated by the smallest silent gap is
    used instead.
    """
    best_speaker = None
    best_overlap = 0
    nearest_speaker = None
    nearest_gap = float("inf")

    for turn in speaker_turns:
        # Positive: length of the shared interval.
        # Zero or negative: the two intervals are disjoint and the absolute
        # value is the gap between them.
        overlap = min(seg_end, turn["end"]) - max(seg_start, turn["start"])
        if overlap > 0:
            if overlap > best_overlap:
                best_overlap = overlap
                best_speaker = turn["speaker"]
        elif -overlap < nearest_gap:
            nearest_gap = -overlap
            nearest_speaker = turn["speaker"]

    return best_speaker if best_speaker is not None else nearest_speaker


def _format_diarized_transcript(diarized_segments):
    """Render segments as "[SPEAKER]: text" lines, merging consecutive
    segments that share a speaker and skipping segments with empty text.

    Every speaker change starts with a newline, so the result begins with
    "\\n" when there is at least one non-empty segment (kept for backward
    compatibility with existing callers).
    """
    pieces = []
    current_speaker = None
    for segment in diarized_segments:
        text = segment["text"].strip()
        if not text:
            continue
        speaker = segment["speaker"]
        if speaker != current_speaker:
            pieces.append(f"\n[{speaker}]: {text}")
            current_speaker = speaker
        else:
            pieces.append(f" {text}")
    return "".join(pieces)


def process_transcription(audio_content: bytes, whisper_model, enable_diarization=False):
    """Transcribe audio bytes and optionally attribute the text to speakers.

    The bytes are written to a temporary ".wav" file (always removed before
    returning), transcribed with ``whisper_model.transcribe(path,
    beam_size=5)`` and the stripped segment texts are joined with spaces.

    Args:
        audio_content: Raw bytes of the uploaded audio.
        whisper_model: Object exposing ``transcribe(path, beam_size=...)``
            that returns ``(segments_iterable, info)``; each segment must
            provide ``text``, ``start`` and ``end``.
        enable_diarization: When true, run speaker diarization as well.

    Returns:
        A ``(transcript, info, segments)`` tuple:
        * diarization disabled or diarization model unavailable:
          ``(plain_transcript, info, None)``;
        * audio known to be shorter than 3 seconds: a single ``SPEAKER_0``
          segment covering the whole transcript;
        * otherwise: the speaker-labelled transcript and a list of dicts
          with ``speaker``, ``text``, ``start`` and ``end``.

    Raises:
        ValueError: If ``whisper_model`` is falsy.
    """
    global _diarize_model

    start = time.time()
    if not whisper_model:
        raise ValueError("Whisper model not loaded.")

    temp_file_path = None
    try:
        # delete=False: the file must outlive the "with" block so the models
        # can open it by path; the "finally" clause below removes it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
            temp_file_path = temp_file.name
            temp_file.write(audio_content)

        segments_gen, info = whisper_model.transcribe(temp_file_path, beam_size=5)
        # Materialize once: the segments are needed for the plain transcript
        # and again for speaker alignment.
        segments = list(segments_gen)
        transcript = " ".join(seg.text.strip() for seg in segments)

        if not enable_diarization:
            return transcript, info, None

        # Lazily load the diarization model and cache it at module level.
        if _diarize_model is None:
            _diarize_model = load_diarization(config)

        if _diarize_model is None:
            logger.warning("Diarization model not available, returning transcript without speakers")
            return transcript, info, None

        audio_duration = _audio_duration_seconds(temp_file_path, info)

        # Very short clips are attributed to a single speaker without running
        # the diarization model. An unknown duration skips this shortcut.
        if audio_duration is not None and audio_duration < 3.0:
            logger.info(f"Audio too short ({audio_duration:.2f}s), skipping diarization")
            diarized_segments = [{
                "speaker": "SPEAKER_0",
                "text": transcript,
                # Same keys as the segments produced by the full path.
                "start": 0.0,
                "end": audio_duration,
            }]
            diarized_transcript = f"[SPEAKER_0]: {transcript}"
            return diarized_transcript, info, diarized_segments

        logger.info("Running speaker diarization")
        diarization = _diarize_model(temp_file_path)

        # Flatten the diarization result into plain dicts.
        speaker_turns = [
            {
                "speaker": f"SPEAKER_{speaker.replace('SPEAKER_', '')}",
                "start": turn.start,
                "end": turn.end,
            }
            for turn, _, speaker in diarization.itertracks(yield_label=True)
        ]

        diarized_segments = [
            {
                "speaker": _pick_speaker(segment.start, segment.end, speaker_turns)
                or "SPEAKER_UNKNOWN",
                "text": segment.text,
                "start": segment.start,
                "end": segment.end,
            }
            for segment in segments
        ]

        diarized_transcript = _format_diarized_transcript(diarized_segments)

        # Release memory held by the models' intermediate buffers.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # The value must be passed through a %-placeholder; without one the
        # logging module reports a formatting error instead of the timing.
        logger.info("time : %.2f ms", (time.time() - start) * 10**3)

        return diarized_transcript, info, diarized_segments

    finally:
        if temp_file_path and os.path.exists(temp_file_path):
            os.remove(temp_file_path)

def _split_sentences(text):
    """Split ``text`` into sentences with NLTK, degrading to a regex split.

    ``sent_tokenize`` locates the Punkt resources through NLTK's own data
    search path, which replaces the previous machine-specific absolute path
    to a pickled tokenizer (unpickling files from disk is also unsafe if the
    file is not trusted). If the Punkt resources are missing, NLTK raises
    ``LookupError`` and a simple punctuation-based split is used instead.
    """
    try:
        return sent_tokenize(text)
    except LookupError:
        logger.warning("NLTK punkt data not found, falling back to regex sentence split")
        return [part for part in re.split(r'(?<=[.!?])\s+', text) if part]


def process_summary(text: str, summarizer_pipeline, nlp_spacy, config):
    """Build a formatted, rule-based meeting summary from a transcript.

    The transcript is cleaned with ``clean_transcript``, optionally parsed
    with spaCy (sentences and PERSON entities), summarized chunk by chunk
    with ``summarizer_pipeline`` and every sentence is assigned to at most
    one category (action item, decision, deadline, question, intro,
    conclusion, topic) using keyword patterns.

    Args:
        text: Raw transcript.
        summarizer_pipeline: Callable invoked as
            ``pipeline(text, max_length=..., min_length=..., do_sample=False)``
            returning a list whose first item has a ``summary_text`` key.
        nlp_spacy: Optional spaCy-style callable; when falsy or failing,
            sentences come from NLTK and no participants are extracted.
        config: Accepted for interface compatibility; not used here.

    Returns:
        The summary as a single string with one section per non-empty
        category, sections separated by blank lines.

    Raises:
        ValueError: If ``summarizer_pipeline`` is falsy.
    """
    if not summarizer_pipeline:
        raise ValueError("Summarizer model not loaded.")

    processed_text = clean_transcript(text)

    doc = None
    sentences = None
    if nlp_spacy:
        try:
            parsed = nlp_spacy(processed_text)
            # Evaluate the sentence split inside the try block so a failure
            # here cannot resurface later outside of it.
            sentences = [sent.text.strip() for sent in parsed.sents]
            doc = parsed
            processed_text = " ".join(sentences)
        except Exception as e:
            logger.error(f"SpaCy processing failed: {e}", exc_info=True)
            sentences = None

    categories = {
        "meeting_title": [],
        "intro": [],
        "topics": [],
        "decisions": [],
        "action_items": [],
        "questions": [],
        "deadlines": [],
        "participants": [],
        "overall_summary": [],
        "conclusion": []
    }

    # Meeting title: text following e.g. "meeting about ..." up to the next period.
    title_pattern = r'(meeting|call|session|discussion) (about|on|for|regarding) ([^.]+)'
    title_match = re.search(title_pattern, processed_text, re.IGNORECASE)
    if title_match:
        categories["meeting_title"].append(title_match.group(3).strip())

    if sentences is None:
        sentences = _split_sentences(processed_text)

    # Participants: PERSON entities longer than two characters.
    people = set()
    if doc is not None:
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                person = ent.text.strip()
                if len(person) > 2:
                    people.add(person)
    if people:
        # Sorted so the output does not depend on set iteration order.
        categories["participants"] = sorted(people)

    failure_message = "Failed to generate overall summary."
    try:
        # Log a short preview (first 100 characters) of the input.
        logger.info(processed_text[:100])
        # The text is chunked because of the summarizer's input-length limit.
        chunks = chunk_text(processed_text)
        parts_summaries = []
        for chunk in chunks:
            result = summarizer_pipeline(chunk, max_length=150, min_length=30, do_sample=False)
            if result and isinstance(result, list):
                part_summary = result[0].get('summary_text', '')
                if part_summary:
                    parts_summaries.append(part_summary)

        if parts_summaries:
            combined = " ".join(parts_summaries)
            overall_summary = summarizer_pipeline(
                combined, max_length=150, min_length=30, do_sample=False
            )[0]['summary_text']
            categories["overall_summary"] = [overall_summary]
        else:
            # Nothing to merge: do not call the summarizer on an empty string.
            logger.warning("Summarizer produced no partial summaries")
            categories["overall_summary"] = [failure_message]
    except Exception as e:
        logger.error(f"Summarization failed: {e}", exc_info=True)
        categories["overall_summary"] = [failure_message]

    # Keyword patterns, compiled once outside the sentence loop.
    flags = re.IGNORECASE
    action_re = re.compile(
        r'(need to|will|shall|must|should|have to|assigned to|responsible for|task|action item|to-do|follow up|take care of)',
        flags)
    # Word boundaries are required here: without them, and with IGNORECASE,
    # "I" matches any letter "i" and "he" matches inside "the".
    actor_re = re.compile(r'\b(we|you|I|they|he|she|teams?|groups?|departments?)\b', flags)
    decision_re = re.compile(
        r'(decided|agreed|conclusion|resolved|approved|rejected|consensus|finalized|confirmed|determined)',
        flags)
    # NOTE(review): the remaining patterns still match substrings (e.g. "on"
    # inside "confirmation"); they are left unchanged here.
    deadline_cue_re = re.compile(
        r'(by|due|deadline|schedule|date|tomorrow|next week|month|calendar|remind|upcoming|on|at|until)',
        flags)
    time_unit_re = re.compile(r'(time|day|week|month|year|hour|minute)', flags)
    question_re = re.compile(
        r'(\?|issue|problem|concern|question|clarif|wonder|how|what|when|where|why|who)',
        flags)
    answered_re = re.compile(r'(answer|answered|resolved|solved)', flags)
    intro_re = re.compile(
        r'(welcome|begin|start|agenda|today|discuss|meeting|introduce|opening|good morning|hello|topic)',
        flags)
    conclusion_re = re.compile(
        r'(conclude|end|wrap|summary|thank|next meeting|follow up|adjourn|goodbye|bye|closing)',
        flags)

    total = len(sentences)
    for i, sentence in enumerate(sentences):
        sentence = sentence.strip()
        if not sentence:
            continue

        # Each sentence lands in the first category that matches, in this order.
        if action_re.search(sentence) and actor_re.search(sentence):
            categories["action_items"].append(sentence)
            continue

        if decision_re.search(sentence):
            categories["decisions"].append(sentence)
            continue

        if deadline_cue_re.search(sentence) and time_unit_re.search(sentence):
            categories["deadlines"].append(sentence)
            continue

        if question_re.search(sentence) and not answered_re.search(sentence):
            categories["questions"].append(sentence)
            continue

        # Intro statements: only within the first 10% of sentences.
        if i < total // 10 and intro_re.search(sentence):
            categories["intro"].append(sentence)
            continue

        # Conclusion statements: only within the last 10% of sentences.
        if i > total * 9 // 10 and conclusion_re.search(sentence):
            categories["conclusion"].append(sentence)
            continue

        # Everything else is a topic if it has some substance. This check
        # must run for every sentence, i.e. inside the loop.
        if len(sentence.split()) > 3:
            categories["topics"].append(sentence)

    # Consolidate related points to avoid repetition.
    for category in ("topics", "action_items", "decisions", "questions", "deadlines"):
        categories[category] = consolidate_similar_items(categories[category])

    # Limit the number of topics to keep the output readable.
    if len(categories["topics"]) > 10:
        try:
            topics_text = " ".join(categories["topics"])
            topics_summary = summarizer_pipeline(
                topics_text, max_length=200, min_length=50, do_sample=False
            )[0]['summary_text']
            categories["topics"] = _split_sentences(topics_summary)
        except Exception as e:
            logger.error(f"Topics summarization failed: {e}", exc_info=True)
            # Fall back to the first five and last five topics.
            categories["topics"] = categories["topics"][:5] + categories["topics"][-5:]

    formatted_summary = []

    def add_bullets(header, items):
        """Append a bulleted section followed by a blank line, if non-empty."""
        if items:
            formatted_summary.append(header)
            formatted_summary.extend(f"• {entry}" for entry in items)
            formatted_summary.append("")

    if categories["meeting_title"]:
        formatted_summary.append(f"📝 **Meeting Title:** {categories['meeting_title'][0]}")
        formatted_summary.append("")

    if categories["overall_summary"]:
        formatted_summary.append("📋 **Executive Summary:**")
        formatted_summary.append(categories["overall_summary"][0])
        formatted_summary.append("")

    if categories["participants"]:
        formatted_summary.append("👥 **Participants:**")
        formatted_summary.append(", ".join(categories["participants"]))
        formatted_summary.append("")

    if categories["intro"]:
        formatted_summary.append("🎯 **Meeting Introduction:**")
        formatted_summary.append(" ".join(categories["intro"]))
        formatted_summary.append("")

    if categories["topics"]:
        formatted_summary.append("💡 **Key Topics:**")
        for number, topic in enumerate(categories["topics"], 1):
            formatted_summary.append(f"{number}. {topic}")
        formatted_summary.append("")

    add_bullets("✅ **Decisions Made:**", categories["decisions"])
    add_bullets("📋 **Action Items:**", categories["action_items"])
    add_bullets("❓ **Questions & Concerns:**", categories["questions"])
    add_bullets("⏰ **Deadlines & Timing:**", categories["deadlines"])

    # The conclusion is the last section and gets no trailing blank line.
    if categories["conclusion"]:
        formatted_summary.append("🏁 **Conclusion:**")
        formatted_summary.append(" ".join(categories["conclusion"]))

    return "\n".join(formatted_summary)


def create_enhanced_summary_prompt(transcript: str, language_name: str) -> str:
    """
    Build the prompt that asks the AI for a structured JSON meeting summary.

    The prompt embeds ``transcript`` between triple double quotes and names
    ``language_name`` twice as the language every JSON value must be
    written in.
    """
    # Only the two named fields below are placeholders; substituted values
    # are inserted verbatim and never re-parsed for braces.
    template = """
    You are an expert AI assistant. Your task is to analyze the following meeting transcript and extract key information into a structured JSON object.

    **Primary Goal:** Analyze the provided transcript and generate a structured summary.

    **CRITICAL LANGUAGE INSTRUCTION:** All text in your final JSON response must be written in the following language: **{language_name}**. There are no exceptions.

    **ANALYSIS INSTRUCTIONS:**
    1.  Read the entire transcript to understand its context.
    2.  Identify a concise title for the meeting.
    3.  Identify all participants mentioned.
    4.  Write a brief paragraph summarizing the core themes and outcomes.
    5.  List all clear and agreed-upon decisions.
    6.  Extract all clear action items, identifying the task, who it was assigned to, the due date if mentioned, and the context.

    **OUTPUT INSTRUCTIONS:**
    - Respond ONLY with a valid JSON object.
    - The JSON must use these exact keys: "meeting_title", "participants", "meeting_summary", "decisions_made", "action_items".
    - **Language Check:** Before you finalize your response, verify that every single string value within the JSON is written in **{language_name}**.

    **TRANSCRIPT TO ANALYZE:**
    \"\"\"
    {transcript}
    \"\"\"
    """
    return template.format(language_name=language_name, transcript=transcript)


def format_summary_to_markdown(summary_json: dict, language_code: str = "en") -> str:
    """Render a structured summary dict as Markdown with localized labels.

    Keys of ``summary_json`` are normalized (lower-cased, spaces replaced by
    underscores) before lookup. Labels come from ``load_labels(language_code)``.

    Args:
        summary_json: Dict that may contain ``meeting_title``,
            ``meeting_summary``, ``decisions_made`` and ``action_items``.
            Action items are normally dicts with ``task``, ``assigned_to``,
            ``due_date`` and ``context``; bare values are treated as the task.
        language_code: Language code passed to ``load_labels``.

    Returns:
        The Markdown text, or ``labels["no_content"]`` when the summary,
        decisions and action items are all missing or empty.
    """
    labels = load_labels(language_code)

    summary_data = {k.lower().replace(" ", "_"): v for k, v in summary_json.items()}

    content_keys = ("meeting_summary", "decisions_made", "action_items")
    if not any(summary_data.get(key) for key in content_keys):
        return labels["no_content"]

    markdown_parts = []

    if title := summary_data.get("meeting_title"):
        markdown_parts.append(f"### {title}\n")

    if summary := summary_data.get("meeting_summary"):
        markdown_parts.append(labels["summary_header"])
        markdown_parts.append(summary)
        markdown_parts.append("")

    if decisions := summary_data.get("decisions_made"):
        # A single decision given as a plain string must not be iterated
        # character by character.
        if isinstance(decisions, str):
            decisions = [decisions]
        markdown_parts.append(labels["decisions_header"])
        for decision in decisions:
            markdown_parts.append(f"- {decision}")
        markdown_parts.append("")

    if action_items := summary_data.get("action_items"):
        # Accept a single item that was not wrapped in a list.
        if isinstance(action_items, (str, dict)):
            action_items = [action_items]
        markdown_parts.append(labels["actions_header"])
        for item in action_items:
            if not isinstance(item, dict):
                # Bare value: use it as the task description.
                item = {"task": item}

            # "or" instead of a .get() default: a key that is present but
            # null/empty (common in model-generated JSON) must also fall
            # back to the placeholder rather than print "None".
            task = item.get('task') or 'N/A'
            assigned_to = item.get('assigned_to') or labels["not_specified"]
            due_date = item.get('due_date') or labels["not_specified"]
            context = item.get('context') or ''

            markdown_parts.append(f"- **{labels['task_label']}**: {task}")
            markdown_parts.append(f"  - **{labels['assigned_to_label']}**: {assigned_to}")
            markdown_parts.append(f"  - **{labels['due_date_label']}**: {due_date}")
            if context:
                markdown_parts.append(f"  - **{labels['context_label']}**: {context}")
        markdown_parts.append("")

    return "\n".join(markdown_parts)