devsu committed
Commit 164d23a · 1 Parent(s): ca50df8

Add initial implementation of Meeting Summarizer web app


- Created main application file `app.py` for meeting analysis and summarization using Gradio (see the quickstart sketch after this list).
- Added utility modules for text extraction, audio transcription, PDF generation, and data persistence.
- Implemented logging for better debugging and error handling.
- Included a `.gitignore` file to exclude unnecessary files and directories.
- Updated `README.md` to reflect the new features and usage instructions.
- Added `requirements.txt` for dependency management.
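
For anyone trying the commit locally, a minimal quickstart sketch (assuming the pinned dependencies from `requirements.txt` are installed, e.g. via `pip install -r requirements.txt`; running `python app.py` does the same through `main()`):

```python
# Hypothetical local smoke run; not part of the committed files.
from app import create_interface

demo = create_interface()      # builds the gr.Blocks UI defined in app.py
demo.launch(server_port=7860)  # main() additionally sets share=True and show_error=True
```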

.gitignore ADDED
@@ -0,0 +1,164 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # Meeting Summarizer specific
+ data/
+ meetings/
+ *.pdf
+ *.mp3
+ *.wav
+ *.m4a
+ *.flac
+ *.ogg
+ temp/
+ tmp/
+
+ # Hugging Face cache
+ .cache/
+ huggingface/
+
+ # Model cache
+ models/
+ *.bin
+ *.safetensors
+
+ # Logs
+ *.log
+ logs/
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+
+ # OS
+ .DS_Store
+ Thumbs.db
README.md CHANGED
@@ -1,12 +1,14 @@
  ---
  title: Meeting Summarizer
- emoji: 🔥
- colorFrom: purple
- colorTo: red
+ emoji: 🎯
+ colorFrom: blue
+ colorTo: green
  sdk: gradio
- sdk_version: 5.49.1
+ sdk_version: "5.49.1"
  app_file: app.py
  pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Meeting Summarizer
+
+ An interactive Gradio web app that lets you upload a meeting transcript or audio file and automatically generates a complete summary, a list of the main topics, and keywords.
app.py ADDED
@@ -0,0 +1,326 @@
+ """
+ Meeting Summarizer - Gradio application
+ Web app for automatic meeting analysis and summarization via GPT-4o-mini.
+ """
+
+ import os
+ import tempfile
+ import shutil
+ import gradio as gr
+ from typing import Tuple, Optional
+
+ # Import local modules
+ from utils.text_extraction import extract_text, get_supported_extensions
+ from utils.transcription import transcribe_audio, is_audio_file, get_supported_audio_extensions
+ from utils.llm_analysis import analyze_meeting, format_analysis_for_display
+ from utils.pdf_generator import generate_pdf, cleanup_temp_pdf
+ from utils.data_persistence import save_meeting_to_dataset
+
+ # Logging configuration
+ import logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Global list of temporary files (cleaned up on unload)
+ _temp_files = []
+
+
+ def process_meeting(file, api_key: str, hf_token: str = "") -> Tuple[str, str, str, Optional[str], str]:
+     """
+     Process a meeting file and return the complete analysis.
+
+     Args:
+         file: File uploaded by the user
+         api_key (str): OpenAI API key
+         hf_token (str): Hugging Face token (optional)
+
+     Returns:
+         Tuple[str, str, str, Optional[str], str]: (summary, topics, keywords, pdf_path, message)
+     """
+     global _temp_files
+
+     try:
+         # Validate input and log debug info
+         logger.info(f"DEBUG: received file type={type(file)} value={file}")
+
+         if not file:
+             return "", "", "", None, "❌ Error: No file uploaded"
+
+         if not api_key:
+             return "", "", "", None, "❌ Error: OpenAI API key required"
+
+         # Handle the Gradio file object
+         # In recent Gradio versions the file can be a string (path) or an object
+         if isinstance(file, str):
+             file_path = file
+         else:
+             # If it is a file object, extract the path
+             file_path = file.name if hasattr(file, 'name') else str(file)
+
+         logger.info(f"DEBUG: file_path={file_path} exists={os.path.exists(file_path)} isfile={os.path.isfile(file_path) if os.path.exists(file_path) else 'N/A'}")
+
+         # Verify that the file exists and is a file (not a directory)
+         if not os.path.exists(file_path):
+             return "", "", "", None, "❌ Error: File not found or invalid"
+
+         if not os.path.isfile(file_path):
+             return "", "", "", None, f"❌ Error: Path is a directory, not a file: {file_path}"
+
+         # Extract text from the file
+         text = ""
+         file_name = os.path.basename(file_path)
+
+         logger.info(f"Processing file: {file_name}")
+         logger.info(f"File path: {file_path}")
+
+         # Determine if it's an audio file
+         if is_audio_file(file_name):
+             logger.info("Audio file detected, starting transcription...")
+             text = transcribe_audio(file_path)
+             if not text:
+                 return "", "", "", None, "❌ Error: Transcription failed"
+             logger.info("Transcription completed")
+         else:
+             # Extract text from document
+             logger.info("Document file detected, extracting text...")
+             text = extract_text(file_path)
+             if not text:
+                 return "", "", "", None, "❌ Error: Text extraction failed"
+             logger.info("Text extraction completed")
+
+         # Verify that the text is not empty
+         if not text.strip():
+             return "", "", "", None, "❌ Error: No text extracted from file"
+
+         # Analyze with GPT-4o-mini
+         logger.info("Starting analysis with GPT-4o-mini...")
+         analysis = analyze_meeting(text, api_key)
+         if not analysis:
+             return "", "", "", None, "❌ Error: Analysis failed"
+
+         # Format for display
+         formatted_analysis = format_analysis_for_display(analysis)
+
+         # Generate PDF
+         logger.info("Generating PDF...")
+         pdf_path = generate_pdf(analysis)
+
+         # Debug: verify that pdf_path is valid
+         if pdf_path:
+             logger.info(f"PDF generated: {pdf_path}")
+             logger.info(f"PDF exists: {os.path.exists(pdf_path)}")
+             logger.info(f"PDF is a file: {os.path.isfile(pdf_path)}")
+             # If not a valid file, set to None
+             if not os.path.isfile(pdf_path):
+                 logger.warning(f"PDF path invalid: {pdf_path}")
+                 pdf_path = None
+
+         # Save to dataset if token provided
+         if hf_token:
+             logger.info("Saving to Hugging Face Dataset...")
+             meeting_data = {
+                 "file_name": file_name,
+                 "transcription": text,
+                 "summary": analysis.get("summary", ""),
+                 "topics": analysis.get("topics", []),
+                 "keywords": analysis.get("keywords", [])
+             }
+
+             if save_meeting_to_dataset(meeting_data, hf_token):
+                 logger.info("Meeting saved to HF Dataset")
+             else:
+                 logger.warning("Saving to HF Dataset failed")
+
+         # Track the PDF for later cleanup
+         if pdf_path:
+             _temp_files.append(pdf_path)
+
+         # Success message
+         success_msg = f"✅ Meeting analyzed successfully!\n\n📄 File: {file_name}\n📝 Characters analyzed: {len(text)}\n📊 Topics identified: {len(analysis.get('topics', []))}\n🔑 Keywords: {len(analysis.get('keywords', []))}"
+
+         if hf_token:
+             success_msg += "\n💾 Data saved to Hugging Face Dataset"
+
+         return (
+             formatted_analysis["summary"],
+             formatted_analysis["topics"],
+             formatted_analysis["keywords"],
+             pdf_path if pdf_path else None,
+             success_msg
+         )
+
+     except Exception as e:
+         logger.error(f"Error during processing: {str(e)}")
+         return "", "", "", None, f"❌ Error: {str(e)}"
+
+
+ def cleanup_temp_files():
+     """Clean up temporary files."""
+     global _temp_files
+     for file_path in _temp_files:
+         if os.path.exists(file_path):
+             try:
+                 os.remove(file_path)
+             except Exception as e:
+                 logger.warning(f"Unable to delete {file_path}: {str(e)}")
+     _temp_files.clear()
+
+
+ def create_interface():
+     """Create the Gradio interface."""
+
+     # Supported extensions
+     supported_extensions = get_supported_extensions() + get_supported_audio_extensions()
+
+     with gr.Blocks(
+         title="Meeting Summarizer",
+         theme=gr.themes.Soft(),
+         css="""
+         .gradio-container {
+             max-width: 1200px !important;
+         }
+         .success-message {
+             background-color: #d4edda;
+             border: 1px solid #c3e6cb;
+             color: #155724;
+             padding: 10px;
+             border-radius: 5px;
+         }
+         """
+     ) as app:
+
+         gr.Markdown(
+             """
+             # 🎯 Meeting Summarizer
+
+             Upload a meeting file (audio, PDF, DOCX, TXT) and automatically get:
+             - 📝 **Complete summary** of the meeting
+             - 🏷️ **Main topics** discussed
+             - 🔑 **Relevant keywords**
+             - 📄 **Downloadable PDF** with all results
+
+             ---
+             """
+         )
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 # Input file
+                 file_input = gr.File(
+                     label="📁 Upload Meeting File",
+                     file_types=supported_extensions,
+                     file_count="single"
+                 )
+
+                 # OpenAI API key
+                 api_key_input = gr.Textbox(
+                     label="🔑 OpenAI API Key",
+                     placeholder="Enter your OpenAI API key...",
+                     type="password",
+                     info="Required for analysis with GPT-4o-mini"
+                 )
+
+                 # HF token (optional)
+                 hf_token_input = gr.Textbox(
+                     label="🤗 Hugging Face Token (Optional)",
+                     placeholder="Enter your HF token to save data...",
+                     type="password",
+                     info="Optional: to save results to a Hugging Face Dataset"
+                 )
+
+                 # Analyze button
+                 analyze_btn = gr.Button(
+                     "🚀 Analyze Meeting",
+                     variant="primary",
+                     size="lg"
+                 )
+
+                 # Status message
+                 status_msg = gr.Textbox(
+                     label="📊 Status",
+                     interactive=False,
+                     visible=True
+                 )
+
+             with gr.Column(scale=2):
+                 # Output summary
+                 summary_output = gr.Markdown(
+                     label="📝 Meeting Summary",
+                     value="The summary will appear here after analysis..."
+                 )
+
+                 # Output topics
+                 topics_output = gr.Markdown(
+                     label="🏷️ Main Topics",
+                     value="The main topics will appear here..."
+                 )
+
+                 # Output keywords
+                 keywords_output = gr.Markdown(
+                     label="🔑 Keywords",
+                     value="The keywords will appear here..."
+                 )
+
+                 # Download PDF
+                 pdf_download = gr.File(
+                     label="📄 Download PDF Report",
+                     visible=True
+                 )
+
+         # Footer
+         gr.Markdown(
+             """
+             ---
+             ### ℹ️ Information
+
+             **Supported formats:**
+             - 🎵 **Audio**: MP3, WAV, M4A, FLAC, OGG
+             - 📄 **Documents**: PDF, DOCX, TXT
+
+             **Features:**
+             - 🎤 Automatic audio transcription with Whisper
+             - 🤖 Intelligent analysis with GPT-4o-mini
+             - 📊 Topic and keyword extraction
+             - 💾 Save to Hugging Face Datasets
+             - 📄 Professional PDF generation
+
+             **Notes:**
+             - Audio files are automatically transcribed
+             - Analysis is optimized for meetings
+             - Data is saved only if you provide an HF token
+             """
+         )
+
+         # Events
+         analyze_btn.click(
+             fn=process_meeting,
+             inputs=[file_input, api_key_input, hf_token_input],
+             outputs=[summary_output, topics_output, keywords_output, pdf_download, status_msg],
+             show_progress="full"
+         )
+
+         # Cleanup on unload
+         app.unload(cleanup_temp_files)
+
+     return app
+
+
+ def main():
+     """Main entry point."""
+     logger.info("Starting Meeting Summarizer...")
+
+     # Create the interface
+     app = create_interface()
+
+     # Launch the server
+     app.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=True,
+         show_error=True
+     )
+
+
+ if __name__ == "__main__":
+     main()
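
As a sanity check, the pipeline can also be exercised without the UI; a minimal sketch, where the file path and API key are placeholders:

```python
# Hypothetical headless call into app.process_meeting; values are placeholders.
from app import process_meeting

summary, topics, keywords, pdf_path, message = process_meeting(
    "meeting_notes.txt",        # any supported document or audio extension
    api_key="YOUR_OPENAI_KEY",  # required for the GPT-4o-mini analysis
    hf_token=""                 # empty string skips dataset persistence
)
print(message)  # status string such as "✅ Meeting analyzed successfully! ..."
```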
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ gradio>=4.0.0
+ openai>=1.0.0
+ transformers>=4.30.0
+ torch>=2.0.0
+ torchaudio>=2.0.0
+ pypdf2>=3.0.0
+ python-docx>=0.8.11
+ reportlab>=4.0.0
+ datasets>=2.14.0
+ huggingface-hub>=0.16.0
+ accelerate>=0.20.0
+ librosa==0.11.0
utils/__init__.py ADDED
@@ -0,0 +1 @@
+ # Meeting Summarizer Utils Package
utils/data_persistence.py ADDED
@@ -0,0 +1,206 @@
+ """
+ Module for saving meeting data to Hugging Face Datasets.
+ Manages permanent persistence of analysis results.
+ """
+
+ import json
+ import uuid
+ from datetime import datetime
+ from typing import Dict, Optional
+
+ try:
+     from datasets import Dataset, load_dataset
+     from huggingface_hub import HfApi, login
+ except ImportError:
+     Dataset = load_dataset = None
+     HfApi = None
+     login = None
+
+ # Logging configuration
+ import logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Name of the dataset on the Hugging Face Hub
+ DATASET_NAME = "meeting-summarizer-data"
+
+
+ def save_meeting_to_dataset(meeting_data: Dict, hf_token: Optional[str] = None) -> bool:
+     """
+     Save meeting data to a Hugging Face Dataset.
+
+     Args:
+         meeting_data (Dict): Meeting data to save
+         hf_token (Optional[str]): Hugging Face token (optional)
+
+     Returns:
+         bool: True if saved successfully, False otherwise
+     """
+     if not meeting_data:
+         logger.error("Meeting data not provided")
+         return False
+
+     if Dataset is None:
+         logger.error("datasets not installed. Install with: pip install datasets")
+         return False
+
+     try:
+         # Authenticate if a token is provided
+         if hf_token:
+             try:
+                 login(token=hf_token)
+                 logger.info("Hugging Face authentication completed")
+             except Exception as e:
+                 logger.warning(f"Error in HF authentication: {str(e)}")
+                 logger.info("Continuing without authentication...")
+
+         # Prepare data for saving
+         meeting_record = _prepare_meeting_record(meeting_data)
+
+         # Create or load dataset
+         dataset = _get_or_create_dataset()
+
+         # Add new record
+         if dataset is None:
+             logger.error("Unable to create or load dataset")
+             return False
+
+         # Convert dataset to list to add the record
+         records = list(dataset)
+         records.append(meeting_record)
+
+         # Create a new dataset with the added record
+         new_dataset = Dataset.from_list(records)
+
+         # Push to the Hugging Face Hub (if authenticated)
+         if hf_token:
+             try:
+                 new_dataset.push_to_hub(
+                     DATASET_NAME,
+                     private=True,
+                     token=hf_token
+                 )
+                 logger.info(f"Dataset updated on Hugging Face Hub: {DATASET_NAME}")
+             except Exception as e:
+                 logger.warning(f"Unable to push to HF Hub: {str(e)}")
+                 logger.info("Record kept in memory only; no local fallback is implemented")
+
+         logger.info("Meeting saved successfully to dataset")
+         return True
+
+     except Exception as e:
+         logger.error(f"Error while saving meeting: {str(e)}")
+         return False
+
+
+ def _prepare_meeting_record(meeting_data: Dict) -> Dict:
+     """
+     Prepare a meeting record for saving.
+
+     Args:
+         meeting_data (Dict): Meeting data
+
+     Returns:
+         Dict: Record formatted for the dataset
+     """
+     current_time = datetime.now()
+
+     return {
+         "id": str(uuid.uuid4()),
+         "file_name": meeting_data.get("file_name", "unknown"),
+         "meeting_date": current_time.strftime("%Y-%m-%d"),
+         "transcription": meeting_data.get("transcription", ""),
+         "summary": meeting_data.get("summary", ""),
+         "topics": json.dumps(meeting_data.get("topics", [])),
+         "keywords": json.dumps(meeting_data.get("keywords", [])),
+         "created_at": current_time.isoformat()
+     }
+
+
+ def _get_or_create_dataset() -> Optional[Dataset]:
+     """
+     Create or load the Hugging Face dataset.
+
+     Returns:
+         Optional[Dataset]: Dataset or None on error
+     """
+     try:
+         # Try to load the existing dataset
+         try:
+             dataset = load_dataset(DATASET_NAME, split="train")
+             logger.info(f"Existing dataset loaded: {DATASET_NAME}")
+             return dataset
+         except Exception:
+             logger.info(f"Dataset {DATASET_NAME} not found, creating new dataset...")
+
+         # Create a new empty dataset
+         empty_dataset = Dataset.from_dict({
+             "id": [],
+             "file_name": [],
+             "meeting_date": [],
+             "transcription": [],
+             "summary": [],
+             "topics": [],
+             "keywords": [],
+             "created_at": []
+         })
+
+         logger.info(f"New dataset created: {DATASET_NAME}")
+         return empty_dataset
+
+     except Exception as e:
+         logger.error(f"Error in creating/loading dataset: {str(e)}")
+         return None
+
+
+ def load_meetings_from_dataset(hf_token: Optional[str] = None) -> Optional[list]:
+     """
+     Load all meetings from the dataset.
+
+     Args:
+         hf_token (Optional[str]): Hugging Face token
+
+     Returns:
+         Optional[list]: List of meetings or None on error
+     """
+     if Dataset is None:
+         logger.error("datasets not installed")
+         return None
+
+     try:
+         # Authenticate if a token is provided
+         if hf_token:
+             try:
+                 login(token=hf_token)
+             except Exception as e:
+                 logger.warning(f"Error in HF authentication: {str(e)}")
+
+         # Load the dataset
+         dataset = load_dataset(DATASET_NAME, split="train")
+
+         # Convert to list
+         meetings = list(dataset)
+
+         logger.info(f"Loaded {len(meetings)} meetings from dataset")
+         return meetings
+
+     except Exception as e:
+         logger.error(f"Error loading meetings: {str(e)}")
+         return None
+
+
+ def get_dataset_info() -> Dict:
+     """
+     Return dataset information.
+
+     Returns:
+         Dict: Dataset information
+     """
+     return {
+         "dataset_name": DATASET_NAME,
+         "description": "Dataset for persisting analyzed meetings",
+         "fields": [
+             "id", "file_name", "meeting_date", "transcription",
+             "summary", "topics", "keywords", "created_at"
+         ]
+     }
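
For illustration, a sketch of calling the persistence helper with the record shape that `_prepare_meeting_record` expects (the token and field values are placeholders):

```python
# Hypothetical usage of utils.data_persistence; all values are illustrative.
from utils.data_persistence import save_meeting_to_dataset

record = {
    "file_name": "weekly_sync.mp3",
    "transcription": "Full transcript text...",
    "summary": "Recap of decisions and action items.",
    "topics": ["roadmap", "hiring"],
    "keywords": ["deadline", "budget"],
}
# topics/keywords are JSON-encoded internally before the push to the Hub.
ok = save_meeting_to_dataset(record, hf_token="YOUR_HF_TOKEN")
print("saved" if ok else "failed")
```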
utils/llm_analysis.py ADDED
@@ -0,0 +1,144 @@
+ """
+ Module for analyzing meeting text using GPT-4o-mini.
+ Extracts summary, topics and keywords from text.
+ """
+
+ import json
+ import logging
+ from typing import Dict, List, Optional
+
+ try:
+     from openai import OpenAI
+ except ImportError:
+     OpenAI = None
+
+ # Logging configuration
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ def analyze_meeting(text: str, api_key: str) -> Optional[Dict]:
+     """
+     Analyze meeting text using GPT-4o-mini.
+
+     Args:
+         text (str): Meeting text to analyze
+         api_key (str): OpenAI API key
+
+     Returns:
+         Optional[Dict]: Dictionary with summary, topics, keywords or None on error
+     """
+     if not text or not text.strip():
+         logger.error("Empty text provided for analysis")
+         return None
+
+     if not api_key:
+         logger.error("OpenAI API key not provided")
+         return None
+
+     if OpenAI is None:
+         logger.error("OpenAI not installed. Install with: pip install openai")
+         return None
+
+     try:
+         # Initialize the OpenAI client
+         client = OpenAI(api_key=api_key)
+
+         # Structured prompt for the analysis
+         prompt = f"""
+         Analyze the following meeting text and provide a response in JSON format with the following keys:
+
+         1. "summary": A comprehensive and detailed summary of the meeting (minimum 200 words)
+         2. "topics": A list of 5-8 main topics discussed in the meeting
+         3. "keywords": A list of 10-15 relevant keywords
+
+         Meeting text:
+         {text}
+
+         Respond ONLY with the requested JSON, without any additional text.
+         """
+
+         logger.info("Sending request to GPT-4o-mini...")
+
+         # API call
+         response = client.chat.completions.create(
+             model="gpt-4o-mini",
+             messages=[
+                 {"role": "system", "content": "You are an expert assistant in meeting analysis. Always provide responses in valid JSON format."},
+                 {"role": "user", "content": prompt}
+             ],
+             max_tokens=2000,
+             temperature=0.3
+         )
+
+         # Extract the response content
+         content = response.choices[0].message.content.strip()
+
+         # Strip any markdown fences around the JSON
+         if content.startswith("```json"):
+             content = content[7:]
+         if content.endswith("```"):
+             content = content[:-3]
+
+         # Parse JSON
+         try:
+             result = json.loads(content)
+
+             # Structure validation
+             required_keys = ["summary", "topics", "keywords"]
+             if not all(key in result for key in required_keys):
+                 logger.error("Invalid JSON structure: missing keys")
+                 return None
+
+             # Type validation
+             if not isinstance(result["summary"], str):
+                 logger.error("Summary must be a string")
+                 return None
+             if not isinstance(result["topics"], list):
+                 logger.error("Topics must be a list")
+                 return None
+             if not isinstance(result["keywords"], list):
+                 logger.error("Keywords must be a list")
+                 return None
+
+             logger.info("Analysis completed successfully")
+             return result
+
+         except json.JSONDecodeError as e:
+             logger.error(f"JSON parsing error: {str(e)}")
+             logger.error(f"Received content: {content}")
+             return None
+
+     except Exception as e:
+         logger.error(f"Error during meeting analysis: {str(e)}")
+         return None
+
+
+ def format_analysis_for_display(analysis: Dict) -> Dict[str, str]:
+     """
+     Format the analysis for display in Gradio.
+
+     Args:
+         analysis (Dict): Analysis result
+
+     Returns:
+         Dict[str, str]: Dictionary formatted for display
+     """
+     if not analysis:
+         return {
+             "summary": "Error in analysis",
+             "topics": "Error in analysis",
+             "keywords": "Error in analysis"
+         }
+
+     # Format topics as a markdown list
+     topics_md = "\n".join([f"- {topic}" for topic in analysis.get("topics", [])])
+
+     # Format keywords as a markdown list
+     keywords_md = "\n".join([f"- {keyword}" for keyword in analysis.get("keywords", [])])
+
+     return {
+         "summary": analysis.get("summary", "Summary not available"),
+         "topics": topics_md,
+         "keywords": keywords_md
+     }
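
The module's contract is the validated JSON shape above (string `summary`, list `topics`, list `keywords`); a minimal consumption sketch with a placeholder key:

```python
# Hypothetical end-to-end use of utils.llm_analysis; the API key is a placeholder.
from utils.llm_analysis import analyze_meeting, format_analysis_for_display

analysis = analyze_meeting("We agreed to ship v2 on Friday...", api_key="YOUR_OPENAI_KEY")
if analysis:
    display = format_analysis_for_display(analysis)
    print(display["topics"])  # topics rendered as a markdown bullet list
```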
utils/pdf_generator.py ADDED
@@ -0,0 +1,184 @@
+ """
+ Module for generating PDF reports with meeting analysis results.
+ Uses reportlab to create well-formatted documents.
+ """
+
+ import os
+ import tempfile
+ from datetime import datetime
+ from typing import Dict, Optional
+
+ try:
+     from reportlab.lib.pagesizes import A4
+     from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+     from reportlab.lib.units import inch
+     from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
+     from reportlab.lib import colors
+ except ImportError:
+     A4 = None
+     getSampleStyleSheet = None
+     ParagraphStyle = None
+     inch = None
+     SimpleDocTemplate = None
+     Paragraph = None
+     Spacer = None
+     PageBreak = None
+     colors = None
+
+ # Logging configuration
+ import logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ def generate_pdf(meeting_data: Dict) -> Optional[str]:
+     """
+     Generate a PDF with meeting analysis results.
+
+     Args:
+         meeting_data (Dict): Meeting data with summary, topics, keywords
+
+     Returns:
+         Optional[str]: Path to the generated PDF file or None on error
+     """
+     if not meeting_data:
+         logger.error("Meeting data not provided")
+         return None
+
+     if SimpleDocTemplate is None:
+         logger.error("reportlab not installed. Install with: pip install reportlab")
+         return None
+
+     try:
+         # Create a temporary file
+         temp_dir = tempfile.gettempdir()
+         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+         pdf_filename = f"meeting_summary_{timestamp}.pdf"
+         pdf_path = os.path.join(temp_dir, pdf_filename)
+
+         # Create the PDF document
+         doc = SimpleDocTemplate(
+             pdf_path,
+             pagesize=A4,
+             rightMargin=72,
+             leftMargin=72,
+             topMargin=72,
+             bottomMargin=18
+         )
+
+         # Custom styles
+         styles = getSampleStyleSheet()
+
+         # Title style
+         title_style = ParagraphStyle(
+             'CustomTitle',
+             parent=styles['Heading1'],
+             fontSize=18,
+             spaceAfter=30,
+             alignment=1,  # Centered
+             textColor=colors.darkblue
+         )
+
+         # Section style
+         section_style = ParagraphStyle(
+             'CustomSection',
+             parent=styles['Heading2'],
+             fontSize=14,
+             spaceAfter=12,
+             spaceBefore=20,
+             textColor=colors.darkblue
+         )
+
+         # Normal text style
+         normal_style = ParagraphStyle(
+             'CustomNormal',
+             parent=styles['Normal'],
+             fontSize=11,
+             spaceAfter=6,
+             leading=14
+         )
+
+         # List style
+         list_style = ParagraphStyle(
+             'CustomList',
+             parent=styles['Normal'],
+             fontSize=11,
+             spaceAfter=3,
+             leftIndent=20,
+             bulletIndent=10
+         )
+
+         # Build the content
+         story = []
+
+         # Title
+         story.append(Paragraph("Meeting Summary", title_style))
+         story.append(Spacer(1, 12))
+
+         # Date and info
+         current_date = datetime.now().strftime("%m/%d/%Y %H:%M")
+         story.append(Paragraph(f"<b>Analysis date:</b> {current_date}", normal_style))
+         story.append(Spacer(1, 20))
+
+         # Summary
+         story.append(Paragraph("Summary", section_style))
+         summary_text = meeting_data.get("summary", "Summary not available")
+         story.append(Paragraph(summary_text, normal_style))
+         story.append(Spacer(1, 20))
+
+         # Main topics
+         story.append(Paragraph("Main Topics", section_style))
+         topics = meeting_data.get("topics", [])
+         if topics:
+             for topic in topics:
+                 story.append(Paragraph(f"• {topic}", list_style))
+         else:
+             story.append(Paragraph("Topics not available", normal_style))
+         story.append(Spacer(1, 20))
+
+         # Keywords
+         story.append(Paragraph("Keywords", section_style))
+         keywords = meeting_data.get("keywords", [])
+         if keywords:
+             # Group keywords in rows of up to 4
+             keyword_lines = []
+             for i in range(0, len(keywords), 4):
+                 line_keywords = keywords[i:i+4]
+                 keyword_lines.append(" • ".join(line_keywords))
+
+             for line in keyword_lines:
+                 story.append(Paragraph(f"• {line}", list_style))
+         else:
+             story.append(Paragraph("Keywords not available", normal_style))
+
+         # Footer
+         story.append(Spacer(1, 30))
+         story.append(Paragraph(
+             f"<i>Automatically generated on {current_date} by Meeting Summarizer</i>",
+             ParagraphStyle('Footer', parent=styles['Normal'], fontSize=9, alignment=1)
+         ))
+
+         # Generate the PDF
+         doc.build(story)
+
+         logger.info(f"PDF generated successfully: {pdf_path}")
+         return pdf_path
+
+     except Exception as e:
+         logger.error(f"Error during PDF generation: {str(e)}")
+         return None
+
+
+ def cleanup_temp_pdf(pdf_path: str) -> None:
+     """
+     Clean up a temporary PDF file.
+
+     Args:
+         pdf_path (str): Path to the PDF file to delete
+     """
+     try:
+         if os.path.exists(pdf_path):
+             os.remove(pdf_path)
+             logger.info(f"Temporary PDF file deleted: {pdf_path}")
+     except Exception as e:
+         logger.warning(f"Unable to delete temporary PDF file {pdf_path}: {str(e)}")
utils/text_extraction.py ADDED
@@ -0,0 +1,105 @@
+ """
+ Module for extracting text from different file formats.
+ Supports: TXT, PDF, DOCX
+ """
+
+ import os
+ import logging
+ from typing import Optional
+
+ try:
+     import PyPDF2
+ except ImportError:
+     PyPDF2 = None
+
+ try:
+     from docx import Document
+ except ImportError:
+     Document = None
+
+ # Logging configuration
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ def extract_text(file_path: str) -> Optional[str]:
+     """
+     Extract text from a supported file.
+
+     Args:
+         file_path (str): Path to the file to process
+
+     Returns:
+         Optional[str]: Extracted text or None on error
+     """
+     if not os.path.exists(file_path):
+         logger.error(f"File not found: {file_path}")
+         return None
+
+     file_extension = os.path.splitext(file_path)[1].lower()
+
+     try:
+         if file_extension == '.txt':
+             return _extract_from_txt(file_path)
+         elif file_extension == '.pdf':
+             return _extract_from_pdf(file_path)
+         elif file_extension == '.docx':
+             return _extract_from_docx(file_path)
+         else:
+             logger.error(f"Unsupported file format: {file_extension}")
+             return None
+
+     except Exception as e:
+         logger.error(f"Error extracting text from {file_path}: {str(e)}")
+         return None
+
+
+ def _extract_from_txt(file_path: str) -> str:
+     """Extract text from a TXT file."""
+     encodings = ['utf-8', 'latin-1', 'cp1252']
+
+     for encoding in encodings:
+         try:
+             with open(file_path, 'r', encoding=encoding) as file:
+                 return file.read()
+         except UnicodeDecodeError:
+             continue
+
+     # If all encodings fail, fall back to replacement characters
+     with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
+         return file.read()
+
+
+ def _extract_from_pdf(file_path: str) -> str:
+     """Extract text from a PDF file."""
+     if PyPDF2 is None:
+         raise ImportError("PyPDF2 not installed. Install with: pip install pypdf2")
+
+     text = ""
+     with open(file_path, 'rb') as file:
+         pdf_reader = PyPDF2.PdfReader(file)
+
+         for page_num in range(len(pdf_reader.pages)):
+             page = pdf_reader.pages[page_num]
+             text += page.extract_text() + "\n"
+
+     return text.strip()
+
+
+ def _extract_from_docx(file_path: str) -> str:
+     """Extract text from a DOCX file."""
+     if Document is None:
+         raise ImportError("python-docx not installed. Install with: pip install python-docx")
+
+     doc = Document(file_path)
+     text = ""
+
+     for paragraph in doc.paragraphs:
+         text += paragraph.text + "\n"
+
+     return text.strip()
+
+
+ def get_supported_extensions() -> list:
+     """Return the supported file extensions."""
+     return ['.txt', '.pdf', '.docx']
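
Usage follows directly from the dispatch above; a minimal sketch with a placeholder path:

```python
# Hypothetical usage of utils.text_extraction.
from utils.text_extraction import extract_text, get_supported_extensions

print(get_supported_extensions())    # ['.txt', '.pdf', '.docx']
text = extract_text("minutes.docx")  # returns None on unsupported/missing files
if text:
    print(text[:200])  # first 200 characters of the extracted text
```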
utils/transcription.py ADDED
@@ -0,0 +1,125 @@
+ """
+ Module for transcribing audio files using Whisper.
+ Optimized for CPU with the whisper-tiny model.
+ """
+
+ import os
+ import logging
+ from typing import Optional
+
+ try:
+     import torch
+     from transformers import WhisperProcessor, WhisperForConditionalGeneration
+     import librosa
+ except ImportError as e:
+     print(f"Import error: {e}")
+     torch = None
+     WhisperProcessor = None
+     WhisperForConditionalGeneration = None
+     librosa = None
+
+ # Logging configuration
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Global variables for the model (loaded only once)
+ _model = None
+ _processor = None
+
+
+ def load_whisper_model():
+     """Load the Whisper tiny model, optimized for CPU."""
+     global _model, _processor
+
+     if _model is None or _processor is None:
+         try:
+             logger.info("Loading Whisper tiny model...")
+
+             # Load processor and model
+             _processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
+             _model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
+
+             # Set eval mode and pick the device (GPU if available, otherwise CPU)
+             _model.eval()
+             if torch.cuda.is_available():
+                 _model = _model.to("cuda")
+             else:
+                 _model = _model.to("cpu")
+
+             logger.info("Whisper model loaded successfully")
+
+         except Exception as e:
+             logger.error(f"Error loading Whisper model: {str(e)}")
+             raise
+
+     return _model, _processor
+
+
+ def transcribe_audio(file_path: str, language: str = "en") -> Optional[str]:
+     """
+     Transcribe an audio file using Whisper.
+
+     Args:
+         file_path (str): Path to the audio file
+         language (str): Language of the audio content (default: "en" for English)
+
+     Returns:
+         Optional[str]: Text transcription or None on error
+     """
+     if not os.path.exists(file_path):
+         logger.error(f"Audio file not found: {file_path}")
+         return None
+
+     if librosa is None:
+         logger.error("librosa not installed. Install with: pip install librosa")
+         return None
+
+     try:
+         # Load the model
+         model, processor = load_whisper_model()
+
+         # Load and preprocess the audio (resampled to 16 kHz for Whisper)
+         logger.info(f"Loading audio file: {file_path}")
+         audio_array, sample_rate = librosa.load(file_path, sr=16000)
+
+         # Preprocess the audio
+         inputs = processor(audio_array, sampling_rate=sample_rate, return_tensors="pt")
+
+         # Move tensors to the model's device
+         device = next(model.parameters()).device
+         inputs = {k: v.to(device) for k, v in inputs.items()}
+
+         # Generate the transcription
+         logger.info("Generating transcription...")
+         with torch.no_grad():
+             predicted_ids = model.generate(
+                 inputs["input_features"],
+                 max_length=448,
+                 num_beams=1,
+                 do_sample=False,
+                 language=language
+             )
+
+         # Decode the result
+         transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+
+         logger.info("Transcription completed successfully")
+         return transcription.strip()
+
+     except Exception as e:
+         logger.error(f"Error during transcription of {file_path}: {str(e)}")
+         return None
+
+
+ def get_supported_audio_extensions() -> list:
+     """Return the supported audio extensions."""
+     return ['.mp3', '.wav', '.m4a', '.flac', '.ogg']
+
+
+ def is_audio_file(file_path: str) -> bool:
+     """Check whether a file is a supported audio file."""
+     if not file_path:
+         return False
+
+     file_extension = os.path.splitext(file_path)[1].lower()
+     return file_extension in get_supported_audio_extensions()
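
And a matching sketch for the transcription module; note the first call downloads openai/whisper-tiny from the Hugging Face Hub and caches it (the path is a placeholder):

```python
# Hypothetical usage of utils.transcription; model weights are fetched and cached.
from utils.transcription import is_audio_file, transcribe_audio

path = "standup.wav"
if is_audio_file(path):
    text = transcribe_audio(path, language="en")
    print(text or "transcription failed")
```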