GitHub Actions committed on
Commit
92ddce4
·
0 Parent(s):

Clean sync from GitHub - no large files in history

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
.github/workflows/main.yml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# CI/CD: mirror this GitHub repository to the Hugging Face Space on every
# push to main (or manually via workflow_dispatch). Requires an HF_TOKEN
# repository secret with write access to the Space.
name: Sync to Hugging Face Space

on:
  push:
    branches:
      - main
  workflow_dispatch:

jobs:
  sync-to-space:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 1 # Shallow clone to avoid large files in history
          lfs: false # Don't fetch LFS files since we don't use them

      - name: Push to Hugging Face Space
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          # Fail loudly and show each command
          set -euxo pipefail
          # Configure git
          git config --global user.email "actions@github.com"
          git config --global user.name "GitHub Actions"
          git config --global credential.helper ""
          export GIT_TERMINAL_PROMPT=0
          echo "Current branch:"
          git branch --show-current || true
          echo "Git remotes:"
          git remote -v
          # Add/replace remote with token auth (note 'user' here)
          git remote remove hf 2>/dev/null || true
          git remote add hf "https://user:${HF_TOKEN}@huggingface.co/spaces/samir72/AudioChatTranscriber"
          echo "Testing authentication with git ls-remote..."
          git ls-remote hf
          echo "Creating fresh orphan branch without history..."
          # Create a new branch with only current state (no history with large files)
          # NOTE(review): `git add -A` also stages untracked clutter such as
          # .DS_Store and __pycache__ unless .gitignore covers them — confirm.
          git checkout --orphan temp-clean-branch
          git add -A
          git commit -m "Clean sync from GitHub - no large files in history"
          echo "Force pushing clean branch to HF Space..."
          git push --force hf temp-clean-branch:main
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
bin
obj
*.sln
.env*
venv/

# Python bytecode/caches — a __pycache__ directory with .pyc files is
# currently committed to this repo; ignore them going forward.
__pycache__/
*.pyc

# macOS Finder metadata — a .DS_Store is currently committed; ignore it.
.DS_Store
FoundationCode.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Add references
2
+ from azure.identity import DefaultAzureCredential
3
+ from azure.ai.projects import AIProjectClient
4
+ import gradio as gr
5
+ from dotenv import load_dotenv
6
+ import requests
7
+ import os
8
+ import tempfile
9
+ import base64
10
+
11
# Summarize base64-encoded audio with an Azure AI Foundry chat model.
def summarize_audio(audio_data, sysprompt, userprompt):
    """Send base64 audio plus a user question to the deployed chat model.

    Args:
        audio_data: Base64-encoded audio payload (declared as mp3 format).
        sysprompt: System prompt; a default analyst prompt is used when falsy.
        userprompt: Question about the audio; when falsy no request is made.

    Returns:
        The model's reply text, or a human-readable status/error string.
        (The original returned a possibly-unbound `response` after an
        exception or an empty prompt, raising NameError.)
    """
    try:
        # Get configuration settings
        load_dotenv()
        project_endpoint = os.getenv("AC_PROJECT_ENDPOINT")
        model_deployment = os.getenv("AC_MODEL_DEPLOYMENT")

        # Initialize the project client
        project_client = AIProjectClient(
            credential=DefaultAzureCredential(
                exclude_environment_credential=True,
                exclude_managed_identity_credential=True
            ),
            endpoint=project_endpoint,
        )

        # Get a chat client
        openai_client = project_client.get_openai_client(api_version="2024-10-21")

        # Initialize prompts
        if sysprompt:
            system_message = sysprompt
        else:
            system_message = "You are an AI assistant with a charter to clearly analyse the customer enquiry."

        # The original looped "until quit", but a UI call asks exactly one
        # question, so the loop is unnecessary.
        prompt = userprompt if userprompt else ""
        if not prompt:
            return "Please enter a question."

        print("Getting a response ...\n")

        # Get a response to audio input
        response = openai_client.chat.completions.create(
            model=model_deployment,
            messages=[
                {"role": "system", "content": system_message},
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt
                        },
                        {
                            "type": "input_audio",
                            "input_audio": {
                                "data": audio_data,
                                "format": "mp3"
                            }
                        }
                    ]
                }
            ]
        )
        print(response.choices[0].message.content)
        return response.choices[0].message.content
    except Exception as ex:
        # Surface the failure to the UI instead of crashing with NameError
        # on the previously unbound `response`.
        print(ex)
        return f"Failed to summarize audio: {ex}"
89
+
90
def encode_audio(audio_file, action):
    """Base64-encode audio content.

    Args:
        audio_file: A filesystem path (action="Read") or raw bytes
            (action="Download").
        action: "Read" to load from disk, "Download" for in-memory bytes.

    Returns:
        Base64-encoded UTF-8 string of the audio content.

    Raises:
        ValueError: If encoding fails or *action* is not recognized
            (previously an unknown action silently returned None).
    """
    try:
        if action == "Read":
            # Use a distinct handle name — the original shadowed the
            # `audio_file` parameter with the open file object.
            with open(audio_file, 'rb') as fh:
                return base64.b64encode(fh.read()).decode('utf-8')
        elif action == "Download":
            return base64.b64encode(audio_file).decode('utf-8')
        raise ValueError(f"Unknown action: {action!r} (expected 'Read' or 'Download')")
    except ValueError:
        raise
    except Exception as e:
        raise ValueError(f"Failed to encode audio file: {str(e)}")
103
+
104
def download_wav_from_url(url):
    """Download the audio file at *url* and return its raw bytes.

    Returns None when *url* is falsy.

    Raises:
        ValueError: On any network/HTTP failure.
    """
    if not url:
        return None
    try:
        # Bound the request — without a timeout a stalled server would
        # hang the Gradio handler indefinitely.
        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()
        return response.content
    except Exception as e:
        raise ValueError(f"Failed to download WAV from URL: {str(e)}")
113
+
114
def process_audio(upload_audio, record_audio, url, sysprompt, userprompt):
    """Resolve one audio source, base64-encode it, and return the summary.

    Exactly one source is used, preferred in order: upload, recording, URL.
    Upload/recording provide file paths; the URL branch yields raw bytes.

    Returns:
        The summary string, or a prompt asking the user to supply audio.
    """
    audio_data = None

    if upload_audio:
        audio_data = encode_audio(upload_audio, "Read")
    elif record_audio:
        audio_data = encode_audio(record_audio, "Read")
    elif url:
        content = download_wav_from_url(url)
        if content:
            audio_data = encode_audio(content, "Download")

    if not audio_data:
        return "Please provide an audio file via upload, recording, or URL."

    # The original appended the base64 *string* to a temp-file list and then
    # called os.path.exists()/os.remove() on it in a finally block; no temp
    # files are ever created here, so that bogus cleanup was removed.
    return summarize_audio(audio_data, sysprompt, userprompt)
141
+
142
# Gradio UI: three alternative audio sources (upload, microphone, URL) plus
# prompt overrides; process_audio prefers upload > recording > URL.
with gr.Blocks(title="Audio Summarizer UI") as demo:
    gr.Markdown("# Audio File Summarizer")
    gr.Markdown("Upload a WAV file, record audio, or provide a URL to a WAV file for summarization.")

    with gr.Row():
        with gr.Column():
            # type="filepath" hands process_audio a path on disk, not bytes.
            upload_audio = gr.Audio(sources="upload", type="filepath", label="Upload WAV File")
        with gr.Column():
            record_audio = gr.Audio(sources="microphone", type="filepath", label="Record Audio")
        with gr.Column():
            url_input = gr.Textbox(label="Enter URL to WAV File", placeholder="https://example.com/audio.wav")
        with gr.Column():
            userprompt_input = gr.Textbox(label="Enter User Prompt", placeholder="Ask a question about the audio",value="Summarize the audio content")
        with gr.Column():
            sysprompt_input = gr.Textbox(label="Enter System Prompt",value="You are an AI assistant with a listening charter to clearly analyse the customer enquiry.")

    submit_btn = gr.Button("Summarize")
    output = gr.Textbox(label="Summary", lines=10)

    # Input order must match process_audio's signature:
    # (upload, record, url, sysprompt, userprompt).
    submit_btn.click(
        fn=process_audio,
        inputs=[upload_audio, record_audio, url_input,sysprompt_input,userprompt_input],
        outputs=output
    )

# NOTE(review): launches at import time — expected for HF Spaces, but an
# `if __name__ == "__main__":` guard would make the module importable.
demo.launch()
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Sayed A Rizvi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: AudioSummarizer
3
+ emoji: 📚
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 5.46.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ # AudioSummarizer
14
+
15
+ ## What’s New (May 3, 2026)
16
+ - **Gradio upgraded to 5.46.1** to resolve a pip dependency conflict between `gradio==5.45.0` and `gradio==5.46.1`.
17
+ - **GitHub Actions CI/CD workflow added** — every push to `main` automatically syncs the repo to the Hugging Face Space (`samir72/AudioChatTranscriber`). Requires an `HF_TOKEN` secret configured in the GitHub repository settings.
18
+ - **Model switched to gpt-4o-mini** — replaced Phi-4-multimodal-instruct with `gpt-4o-mini` via Azure OpenAI to resolve `DeploymentNotFound` errors. Set `AC_MODEL_DEPLOYMENT=gpt-4o-mini` in your environment or HF Space secrets.
19
+ - **Audio pipeline updated for gpt-4o-mini compatibility** — `gpt-4o-mini` does not support native audio content blocks. All audio inputs (upload, microphone, direct MP3 URL) are now transcribed locally via **faster-whisper** before being sent to the model as text, eliminating the `400 invalid_request_error`.
20
+
21
+ ---
22
+
23
+ ## What’s New (Sep 26–28, 2025)
24
+ - **YouTube cookie refresh & expiry handling** added to avoid sign-in/download failures.
25
+ - **DNS lookup improvements**: automatically skip DNS failures on Hugging Face Spaces to reduce false negatives.
26
+ - **Azure Container App (ACA) integration**: bypasses YouTube blocking by offloading audio download to Azure, storing audio in Blob Storage, and feeding it into the HF pipeline.
27
+ - **Docker / ACA enhancements**: uses Microsoft slim base image in ACR for faster builds, with trade-off that the base must be regularly refreshed.
28
+ - **Repo restructuring**: renamed the app entry folder to `extract/` to resolve a Hugging Face build conflict.
29
+
30
+ ---
31
+
32
+ ## Overview
33
+ AudioSummarizer is a web app (deployed on Hugging Face Spaces) that summarizes audio from multiple sources — file upload, microphone, or URL (YouTube / direct MP3) — using **gpt-4o-mini** via Azure OpenAI for structured summarization. The app uses **faster‑whisper** for transcription and **yt-dlp** + **ffmpeg** for audio extraction, with a clean **Gradio** UI. Prompts are loaded from `metadata.json` to ensure replies include **Summary**, **Key Details**, and **Insights**.
34
+
35
+ Because Hugging Face often cannot directly fetch YouTube audio (due to network restrictions or blocking), we now route YouTube downloads through an **Azure Container App** which:
36
+
37
+ 1. Fetches the YouTube audio independently.
38
+ 2. Stores the processed 16 kHz mono WAV file in **Azure Blob Storage**.
39
+ 3. Serves that file into the usual transcription/summarization pipeline in the HF app.
40
+
41
+ Thus, the HF interface remains unchanged to users, but YouTube support is restored reliably via Azure.
42
+
43
+ ---
44
+
45
+ ## Features
46
+ - Upload a local MP3 file, record via microphone, or enter a YouTube / MP3 URL.
47
+ - **Azure Container App support** so YouTube content is reliably processed even if Hugging Face cannot fetch it.
48
+ - Prompts fully customizable: you may define system and user prompts stored in `metadata.json`.
49
+ - Transcription using **faster-whisper**, summarization through **gpt-4o-mini** (Azure OpenAI).
50
+ - Clean and minimal **Gradio** UI for intuitive interaction.
51
+ - Configuration via environment variables (`.env`) for Azure endpoint, deployment name, API key, etc.
52
+ - YouTube audio extraction to **16 kHz mono WAV** (via yt-dlp + ffmpeg).
53
+ - DNS‑based URL validation, with automatic skip of DNS errors in HF Spaces to reduce false rejections.
54
+
55
+ ---
56
+
57
+ ## Architecture / Data Flow
58
+
59
+ ```
60
+ User Input (YouTube) ──▶ Hugging Face UI
61
+
62
+ └── If URL is YouTube:
63
+ ─▶ forwarded to Azure Container App
64
+ ├── ACA downloads YouTube audio (yt-dlp)
65
+ └── Converts/stores WAV in Azure Blob Storage
66
+ ─▶ HF app fetches WAV from Blob Storage
67
+ ├── Transcribe via faster-whisper
68
+ └── Summarize via Azure gpt-4o-mini
69
+
70
+
71
+ ┌───────────────┐ file/mic/url ┌───────────────────────────┐
72
+ │ Gradio UI │─────────────▶│ process_audio(...) │
73
+ └──────┬────────┘ └──────────┬─────────────────┘
74
+ │ validates/reads │
75
+ ▼ ▼
76
+ ┌───────────────────────────┐ ┌─────────────────────────────┐
77
+ │ summarize_input(audio,...)│──▶│ Azure gpt-4o-mini │
78
+ └───────────────────────────┘ │ Chat Completions (text+audio)│
79
+ └──────────────────────────────┘
80
+
81
+ YouTube Path (via ACA):
82
+ ┌───────────────┐ YouTube URL ┌──────────────────────────────┐
83
+ │ Gradio UI │────────────▶ │ Azure Container App (yt-dlp) │
84
+ └───────────────┘ └──────────┬───────────────────┘
85
+ │ uploads audio
86
+
87
+ ┌──────────────────────────────┐
88
+ │ Azure Blob Storage (WAV 16k) │
89
+ └──────────┬───────────────────┘
90
+
91
+
92
+ ┌──────────────────────────────┐
93
+ │ faster-whisper transcription │
94
+ └──────────┬───────────────────┘
95
+ │ text
96
+
97
+ ┌──────────────────────────────┐
98
+ │ Azure gpt-4o-mini │
99
+ │ summarization │
100
+ └──────────────────────────────┘
101
+
102
+ ```
103
+
104
+ For non-YouTube inputs (local upload, mic, direct MP3 URL), the flow remains internal to the HF space: download/convert → transcription → summarization.
105
+
106
+ ---
107
+
108
+ ## CI/CD — GitHub Actions
109
+
110
+ A workflow at `.github/workflows/main.yml` runs on every push to `main` (and can be triggered manually via `workflow_dispatch`).
111
+
112
+ **What it does:**
113
+ 1. Checks out the repo with a shallow clone (no LFS, no full history).
114
+ 2. Creates a clean orphan branch — only the current file state, no large-file history.
115
+ 3. Force-pushes that branch to the `main` branch of the Hugging Face Space `samir72/AudioChatTranscriber`.
116
+
117
+ **Setup requirement:** Add an `HF_TOKEN` secret in **GitHub → Settings → Secrets and variables → Actions** with a Hugging Face token that has write access to the Space.
118
+
119
+ ---
120
+
121
+ ## Docker & Azure Container Apps
122
+
123
+ ### Optimization: Microsoft Slim Base in ACR
124
+ The Docker image now uses a **Microsoft slim base image** hosted in **Azure Container Registry (ACR)** to speed up builds (less reliance on external pulls).
125
+ - ✅ **Advantage**: faster, more predictable builds in Azure / CI.
126
+ - ⚠️ **Caveat**: you must **refresh the slim base in ACR routinely** to catch upstream security patches, updates, or bug fixes.
127
+
128
+ **Best Practice Recommendation:**
129
+ Set up a scheduled job (e.g. via ACR Task or Azure DevOps pipeline) to pull the latest Microsoft slim base and update your ACR copy on a regular cadence (e.g. weekly) so your deployed containers remain current.
130
+
131
+ ### Build & Run Example
132
+ ```bash
133
+ # Build locally
134
+ docker build -t audiosummarizer:latest .
135
+
136
+ # Run container
137
+ docker run --rm -p 7860:7860 -e AC_OPENAI_ENDPOINT=... -e AC_MODEL_DEPLOYMENT=... -e AC_OPENAI_API_KEY=... -e AC_OPENAI_API_VERSION=... audiosummarizer:latest
138
+ ```
139
+
140
+ For ACA deployment:
141
+ 1. Push the Docker image to your ACR.
142
+ 2. Deploy the image via **Azure Container Apps** with necessary environment variables.
143
+ 3. The ACA will serve as the YouTube‐to‑Blob “fetcher” component, supporting the main HF app.
144
+
145
+ ---
146
+
147
+ ## Prerequisites
148
+ - Python **3.10+**
149
+ - Azure subscription with deployment of **gpt-4o-mini**
150
+ - `ffmpeg` installed and in `$PATH`
151
+ - A valid `metadata.json` containing default prompts
152
+ - For HF spaces: `packages.txt` including `ffmpeg`
153
+
154
+ ---
155
+
156
+ ## Python Dependencies
157
+ Add to `requirements.txt`:
158
+ ```
159
+ azure-identity>=1.17.1
160
+ openai>=1.0.0
161
+ gradio>=4.44.0
162
+ python-dotenv>=1.0.1
163
+ requests>=2.32.3
164
+ yt-dlp>=2024.8.6
165
+ faster-whisper>=0.10.0
166
+ beautifulsoup4>=4.12.2 # optional, for fallback scraping
167
+ ```
168
+
169
+ Install as usual:
170
+ ```bash
171
+ python -m venv .venv
172
+ source .venv/bin/activate # on Windows: .venv\Scripts\activate
173
+ pip install -r requirements.txt
174
+ ```
175
+
176
+ ---
177
+
178
+ ## Installation
179
+ ```bash
180
+ git clone https://github.com/samir72/AudioSummarizer.git
181
+ cd AudioSummarizer
182
+ ```
183
+ Install dependencies and make sure `ffmpeg` is available (or included via `packages.txt` in HF deployment).
184
+
185
+ ---
186
+
187
+ ## Configuration
188
+ Create a `.env` file at the project root:
189
+ ```env
190
+ AC_OPENAI_ENDPOINT=https://<your-azure-resource>.openai.azure.com/
191
+ AC_MODEL_DEPLOYMENT=gpt-4o-mini
192
+ AC_OPENAI_API_KEY=<your azure openai api key>
193
+ AC_OPENAI_API_VERSION=<api version e.g. 2024-10-01>
194
+
195
+ GRADIO_SERVER_NAME=127.0.0.1
196
+ GRADIO_SERVER_PORT=7860
197
+ ```
198
+
199
+ If you’re running the Azure Container App, ensure it is configured with:
200
+ - Proper role / access to write to Azure Blob Storage
201
+ - Environment variables for any keys or connection strings it needs
202
+ - Networking/firewall settings so the HF app can fetch from the blob store
203
+
204
+ ---
205
+
206
+ ## Usage
207
+ Run the app:
208
+ ```bash
209
+ python app.py
210
+ ```
211
+ Then open your browser to [http://127.0.0.1:7860](http://127.0.0.1:7860) or use your HF Space URL.
212
+
213
+ ### Input options
214
+ - Upload MP3 file
215
+ - Record via microphone
216
+ - Enter a YouTube / direct MP3 URL
217
+ - Modify system/user prompts (via `metadata.json`)
218
+ - Click **Summarize** → get structured output (Summary, Key Details, Insights)
219
+
220
+ ---
221
+
222
+ ## Contributing
223
+ We welcome your improvements—especially around cloud integration, performance, and reliability.
224
+
225
+ **Suggested contribution areas:**
226
+ - Better error handling for cookie expiry, fallback strategies
227
+ - Enhancements to the Azure Container App + Blob Storage pipeline
228
+ - Caching / sync between ACA and the HF app
229
+ - Automation of **ACR slim base refresh**
230
+
231
+
232
+ **How to contribute:**
233
+ 1. Fork the repository
234
+ 2. Create a feature branch (e.g. `git checkout -b feat/xyz`)
235
+ 3. Commit changes with meaningful messages
236
+ 4. Push and open a Pull Request
237
+
238
+ Please reference this `README.md` when describing how the YouTube → ACA → Blob → HF flow works.
239
+
240
+ ---
241
+
242
+ ## License
243
+ This project is licensed under the **MIT License** — see [LICENSE](./LICENSE) for details.
244
+
245
+ ---
246
+
247
+ ## Acknowledgments
248
+ - Built with **Gradio** for UI
249
+ - Application deployed on **Hugging Face Spaces**
250
+ - ACA deployed on **Azure**
251
+ - Application layer on ACA served by **FastAPI**
252
+ - Intelligence by **Azure gpt-4o-mini**
253
+ - YouTube audio extraction with **yt-dlp**
254
+ - Transcription enabled by **faster-whisper**
255
+
256
+ ---
257
+
258
+
259
+ ## Contact
260
+ For questions or feedback, reach out to **Sayed Amir Rizvi**
261
+ Email: syedamirhusain@gmail.com
Youtubetranscription_summarizer.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, tempfile, subprocess, json, re, time, shutil
2
+ from pathlib import Path
3
+ from typing import Optional, Callable, Any
4
+ import yt_dlp
5
+ from faster_whisper import WhisperModel
6
+ import socket
7
+
8
+
9
def main(url: str):
    """End-to-end pipeline: resolve the YouTube URL, fetch the audio as a
    16 kHz mono WAV, and return the faster-whisper transcript."""
    # ffmpeg is a hard prerequisite for both download and conversion.
    ensure_ffmpeg()
    # Reduce the URL to a bare video ID (yt-dlp accepts either form).
    video_ref = get_video_id(url)
    # Download audio and convert it to WAV.
    wav_path = download_youtube_audio_wav16k_api(video_ref)
    # Transcribe the WAV file; summarization happens downstream.
    return transcribe_faster_whisper(wav_path, model_name="base.en")
21
+
22
def nslookup(domain):
    """Check whether *domain* resolves via DNS.

    Returns:
        True when the lookup succeeds, and also True on DNS resolution
        failure (YouTube lookups are known to fail inside Hugging Face
        Spaces, so a gaierror is treated as "assume reachable").
        False only on unexpected errors.
    """
    try:
        # Perform DNS lookup for the domain; the address list itself is
        # not needed, only whether resolution succeeds.
        socket.getaddrinfo(domain, None)
        print(f"DNS lookup successful for {domain}:")
        return True
    except socket.gaierror as e:
        print(f"DNS lookup failed for {domain}: {e}")
        return True  # Assume true as youtube DNS will fail on huggingface
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False
38
+
39
def extract_domain(url):
    """Return the host portion of an http(s) URL, or None when absent.

    Matches http:// or https:// followed by the domain characters
    (letters, digits, dots, hyphens), e.g. audio-samples.github.io.
    """
    found = re.search(r'https?://([a-zA-Z0-9.-]+)', url)
    return found.group(1) if found else None
48
+
49
def get_video_id(url: str) -> str:
    """Extract the video ID from common YouTube URL formats.

    Handles watch (?v=), /shorts/, /live/ and /embed/ URLs. When no ID is
    found, falls back to a deterministic digest of the whole URL — the
    original used builtin hash(), which is randomized per process
    (PYTHONHASHSEED), making the fallback ID non-reproducible across runs.
    """
    m = re.search(r"(?:v=|/shorts/|/live/|/embed/)([A-Za-z0-9_-]{6,})", url)
    if m:
        return m.group(1)
    import hashlib  # local import: only needed on the rare fallback path
    return hashlib.sha1(url.encode("utf-8")).hexdigest()
53
+
54
def ensure_ffmpeg():
    """
    Verify that ffmpeg is available in PATH.

    Raises:
        RuntimeError: with deployment guidance if ffmpeg is missing, or if
            it is present but cannot be executed.

    Prints the ffmpeg location and the first line of its version output.
    """
    ffmpeg_path = shutil.which("ffmpeg")
    if ffmpeg_path is None:
        raise RuntimeError(
            "FFmpeg not found in PATH.\n\n"
            "👉 For Hugging Face Spaces:\n"
            " • If using Gradio/Streamlit template → add a `packages.txt` file at repo root with a line: ffmpeg\n"
            " • If using Docker template → add `apt-get install -y ffmpeg` in your Dockerfile\n\n"
            "Without ffmpeg, yt-dlp cannot extract/convert audio."
        )

    try:
        result = subprocess.run(
            ["ffmpeg", "-version"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            check=False,
        )
        print("✅ ffmpeg found at:", ffmpeg_path)
        # Guard against empty output — splitlines()[0] raised IndexError
        # when ffmpeg printed nothing.
        version_lines = result.stdout.splitlines()
        print(version_lines[0] if version_lines else "(no version output)")
    except Exception as e:
        raise RuntimeError(f"ffmpeg was found at {ffmpeg_path} but could not run: {e}")
82
+
83
+
84
+ class YTDLPError(RuntimeError):
85
+ pass
86
+
87
+ def _require(bin_name: str):
88
+ if shutil.which(bin_name) is None:
89
+ raise YTDLPError(f"Required executable '{bin_name}' not found in PATH.")
90
+
91
def download_youtube_audio_wav16k_api(
    youtube_url: str,
    out_dir: Optional[str] = None,
    target_sr: int = 16000,
    target_channels: int = 1,
    quiet: bool = True,
    keep_intermediate: bool = False,
    progress_hook: Optional[Callable[[dict[str, Any]], None]] = None,
) -> str:
    """
    Download YouTube audio via yt_dlp's Python API, extract to WAV,
    and post-process with ffmpeg to 16 kHz mono.

    Args
    ----
    youtube_url : str
    out_dir : Optional[str] Directory for outputs (temp dir if None).
    target_sr : int Sample rate for final WAV (default 16000).
    target_channels : int Channels for final WAV (default 1 = mono).
    quiet : bool Suppress yt-dlp logs if True.
    keep_intermediate : bool Keep the pre-downsampled WAV if True.
    progress_hook : callable Optional yt-dlp progress hook.

    Returns
    -------
    str: path to the final WAV on success. NOTE: on download/resample
    failure this returns a human-readable *error-message string* instead
    (the YTDLPError raises are commented out), so callers must check the
    result rather than rely on exceptions.

    Raises
    ------
    ValueError if youtube_url is empty or not a string;
    YTDLPError if ffmpeg is missing from PATH.
    """
    if not youtube_url or not isinstance(youtube_url, str):
        raise ValueError("youtube_url must be a non-empty string.")

    _require("ffmpeg")  # we call ffmpeg ourselves
    # yt-dlp bundles ffmpeg via postprocessors, but we still run ffmpeg explicitly

    work_dir = Path(out_dir or tempfile.mkdtemp(prefix="ytwav_")).resolve()
    work_dir.mkdir(parents=True, exist_ok=True)

    # First stage: let yt-dlp extract WAV (whatever SR/channels).
    # %(title).100B limits the title to 100 bytes to keep filenames safe.
    out_template = str(work_dir / "%(title).100B [%(id)s].%(ext)s")
    hooks = [progress_hook] if progress_hook else []

    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": out_template,
        "noplaylist": True,
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                "preferredquality": "0",
            }
        ],
        "quiet": quiet,
        "no_warnings": quiet,
        "progress_hooks": hooks,
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.extract_info(youtube_url, download=True)
    except Exception as e:
        #raise YTDLPError(f"yt-dlp API failed: {e}") from e
        return f"yt-dlp API failed: {e}"

    # Locate the produced WAV (pre-downsampled). The newest .wav wins in
    # case out_dir already contained WAV files.
    pre_wavs = list(work_dir.glob("*.wav"))
    if not pre_wavs:
        #raise YTDLPError("yt-dlp completed but no WAV was found.")
        return "yt-dlp completed but no WAV was found."
    pre_wav = max(pre_wavs, key=lambda p: p.stat().st_mtime)

    # Second stage: force 16 kHz mono via ffmpeg
    final_wav = pre_wav.with_name(pre_wav.stem + f".{target_sr}Hz.{target_channels}ch.wav")
    try:
        subprocess.run(
            [
                "ffmpeg", "-y",
                "-i", str(pre_wav),
                "-ac", str(target_channels),
                "-ar", str(target_sr),
                str(final_wav),
            ],
            check=True,
            stdout=subprocess.PIPE if quiet else None,
            stderr=subprocess.PIPE if quiet else None,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        #raise YTDLPError(f"ffmpeg failed to resample: {e.stderr or e.stdout}") from e
        return f"ffmpeg failed to resample: {e.stderr or e.stdout}"

    # Clean up intermediates if desired
    if not keep_intermediate:
        try:
            if pre_wav.exists() and pre_wav != final_wav:
                pre_wav.unlink()
        except Exception:
            # Best-effort cleanup; a leftover intermediate is harmless.
            pass

    return str(final_wav)
190
+
191
+
192
def transcribe_faster_whisper(wav_path: str, model_name="base.en"):
    """Transcribe *wav_path* with faster-whisper.

    Returns {"segments": [{"start", "end", "text"}, ...]} on success,
    or an error-message string on any failure.
    """
    try:
        model = WhisperModel(model_name)
        segments, info = model.transcribe(wav_path, beam_size=1, vad_filter=True)
        collected = [
            {"start": seg.start, "end": seg.end, "text": seg.text}
            for seg in segments
        ]
        # Language detection (info.language) is intentionally not returned.
        return {"segments": collected}
    except Exception as e:
        return f"Faster-Whisper transcription failed: {e}"
203
+
204
def summarize_with_phi(transcript_segments, sysprompt, userprompt, phi_client):
    """Map-reduce summarization: summarize ~10-minute chunks, then merge.

    transcript_segments: dicts with "start"/"end" (seconds) and "text".
    phi_client: any object exposing summarize(sysprompt, prompt) -> str.
    """
    CHUNK_SEC = 600  # ~10min per chunk as a starting point

    # Map phase: bucket segments into chunks of roughly CHUNK_SEC seconds.
    chunks = []
    current, elapsed = [], 0.0
    for seg in transcript_segments:
        current.append(seg)
        elapsed += seg["end"] - seg["start"]
        if elapsed >= CHUNK_SEC:
            chunks.append(current)
            current, elapsed = [], 0.0
    if current:
        chunks.append(current)

    # Summarize each chunk independently with [MM:SS]-stamped lines.
    partials = []
    for idx, chunk in enumerate(chunks, 1):
        stamped = [
            f"[{int(s['start']//60):02d}:{int(s['start']%60):02d}] {s['text']}"
            for s in chunk
        ]
        text = "\n".join(stamped)
        prompt = f"{userprompt}\n\nTRANSCRIPT CHUNK {idx}:\n{text}\n\nReturn: bullet summary + key timestamps."
        partials.append(phi_client.summarize(sysprompt, prompt))

    # Reduce phase: merge the per-chunk summaries into a single answer.
    merged_prompt = f"Merge the {len(partials)} chunk summaries into one concise summary + top 5 timestamps."
    return phi_client.summarize(sysprompt, merged_prompt + "\n\n" + "\n\n".join(partials))
222
+
223
if __name__ == "__main__":
    # Local testing entry point. The original called main(url=None), which
    # crashed inside get_video_id (re.search on None raises TypeError);
    # require a URL argument instead.
    import sys
    if len(sys.argv) > 1:
        print(main(sys.argv[1]))
    else:
        print("Usage: python Youtubetranscription_summarizer.py <youtube-url>")
__pycache__/Youtubetranscription_summarizer.cpython-313.pyc ADDED
Binary file (10.4 kB). View file
 
__pycache__/app.cpython-313.pyc ADDED
Binary file (14.1 kB). View file
 
app.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import base64
3
+ import tempfile
4
+ import requests
5
+ from datetime import datetime
6
+ import gradio as gr
7
+ from dotenv import load_dotenv
8
+ from openai import AzureOpenAI # official OpenAI SDK, works with Azure endpoints
9
+ import json
10
+ import subprocess
11
+ import Youtubetranscription_summarizer
12
+ from extract.app.Youtubeextraction import extract # Youtube download helper functions
13
+ #from pydantic import BaseModel, AnyUrl # Pydantic models for request validation in yiutube extraction
14
+ #from fastapi import FastAPI, HTTPException # FastAPI for building the API
15
+ #app = FastAPI() ## Initialize FastAPI app for testing in local
16
+ #from extractor.app.storage import upload_and_sign # Youtube storage helper functions
17
+ import re
18
+
19
+ # --- LLM call (Azure OpenAI with API key) -----------------------------------
20
+
21
def summarize_input(audio_b64: str = None, text_input: str = None, sys_prompt: str = None, user_prompt: str = None, Starttime: datetime = None) -> str:
    """
    Calls Azure OpenAI Chat Completions with audio input (base64 mp3) or text input, or both.

    Args:
        audio_b64: Base64-encoded mp3 payload, or None.
        text_input: str, list, or dict carrying transcript/extra context, or None.
        sys_prompt: System prompt override (falls back to a built-in default).
        user_prompt: User prompt override (falls back to a built-in default).
        Starttime: Call start time used for duration logging. Accepts a datetime,
            a 1-tuple containing a datetime (legacy callers pass a tuple created
            by a trailing comma), or None.

    Returns:
        The model's reply text, or an error-message string on failure.
    """
    load_dotenv()

    endpoint = os.getenv("AC_OPENAI_ENDPOINT")
    api_key = os.getenv("AC_OPENAI_API_KEY")
    deployment = os.getenv("AC_MODEL_DEPLOYMENT")
    api_version = os.getenv("AC_OPENAI_API_VERSION")

    if not endpoint or not api_key or not deployment:
        return "Server misconfiguration: required env vars missing."

    # Bug fix: normalize Starttime. Previously `Starttime[0]` assumed a tuple,
    # crashing for a plain datetime or the documented default of None.
    if isinstance(Starttime, tuple):
        Starttime = Starttime[0] if Starttime else None
    if Starttime is None:
        Starttime = datetime.now()

    # json_text is only populated when text_input is a list/dict; used for logging.
    json_text = ""
    try:
        client = AzureOpenAI(
            api_key=api_key,
            api_version=api_version,
            azure_endpoint=endpoint,
        )

        system_message = sys_prompt.strip() if sys_prompt else (
            "You are an AI assistant with a charter to clearly analyze the customer enquiry."
        )
        user_text = user_prompt.strip() if user_prompt else (
            "Summarize the provided content." if audio_b64 or text_input else "No input provided."
        )

        content = [{"type": "text", "text": user_text}]

        if audio_b64:
            content.append({
                "type": "input_audio",
                "input_audio": {"data": audio_b64, "format": "mp3"},
            })
        if text_input is not None:
            if isinstance(text_input, str):
                try:
                    # A string may itself be serialized JSON; re-serialize
                    # lists/dicts so the model sees canonical JSON text.
                    parsed = json.loads(text_input)
                    if isinstance(parsed, (list, dict)):
                        content.append({"type": "text", "text": json.dumps(parsed)})
                    else:
                        content.append({"type": "text", "text": text_input})
                except json.JSONDecodeError:
                    # Not JSON: pass the raw string through unchanged.
                    content.append({"type": "text", "text": text_input})
            elif isinstance(text_input, (list, dict)):
                try:
                    json_text = json.dumps(text_input)
                    content.append({"type": "text", "text": json_text})
                except (TypeError, ValueError):
                    return "Error: text_input (list or dict) could not be converted to JSON."
            else:
                return f"Error: text_input must be a string, list, or dict, got {type(text_input)}."

        response = client.chat.completions.create(
            model=deployment,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": content},
            ],
        )
        Enddate = datetime.now()
        Callduration = Enddate - Starttime
        print(f"AudioChatSummarizer API call with a duration of {Callduration}: prompt_length={len(user_prompt or '')}, "
              f"audio_size={len(audio_b64 or '')}, text_input_size={len(json_text or '')}")
        return response.choices[0].message.content

    except Exception as ex:
        # Bug fix: previously `return print(...)` returned None to the UI;
        # now the error text itself is returned (and still logged).
        msg = f"Error from Azure OpenAI: {ex}"
        print(msg)
        return msg
+ return print(f"Error from Azure OpenAI: {ex}")
99
+
100
+ #----Retrieve meta data from metadata.json file------------------------------
101
def retrieve_file_path(file_name):
    """Resolve *file_name* relative to this module's directory.

    Returns the absolute path if it exists and is a regular file; otherwise
    prints a diagnostic and returns None.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(base_dir, file_name)
    if os.path.isfile(file_path):
        return file_path
    if not os.path.exists(file_path):
        print(f"'{file_path}' does not exist.")
    else:
        # Bug fix: this case (path exists but is not a regular file, e.g. a
        # directory) previously fell through and returned None silently.
        print(f"'{file_path}' exists but is not a regular file.")
    return None
110
+
111
def retrieve_json_record(file_path, record_id):
    """Return the first record whose metadata.id equals *record_id*, else None.

    The JSON file may contain either a single record object or a list of them.
    """
    with open(file_path, 'r') as fh:
        payload = json.load(fh)
    # Normalize both accepted file shapes to a list so one loop handles them.
    if isinstance(payload, list):
        candidates = payload
    elif isinstance(payload, dict):
        candidates = [payload]
    else:
        candidates = []
    for record in candidates:
        if record.get('metadata', {}).get('id') == record_id:
            return record
    return None
122
+ # --- I/O helpers ------------------------------------------------------------
123
+
124
def encode_audio_from_path(path: str) -> str:
    """Read the file at *path* and return its contents base64-encoded as str."""
    with open(path, "rb") as audio_file:
        payload = audio_file.read()
    return base64.b64encode(payload).decode("utf-8")
127
+
128
+
129
def download_to_temp_mp3(url: str) -> str:
    """Stream *url* into a temporary .mp3 file and return the file's path.

    The caller is responsible for deleting the file (delete=False).
    """
    # Bug fix: with stream=True the response must be closed explicitly or the
    # connection stays checked out; the context manager releases it even if
    # iteration raises.
    with requests.get(url, stream=True, timeout=30) as r:
        r.raise_for_status()
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    tmp.write(chunk)
            return tmp.name
137
+
138
+ # function to read files
139
# function to read files
def file_read(filepath):
    """Read *filepath* as bytes.

    Returns the file contents, or b"" if the file cannot be read
    (the failure is logged, not raised).
    """
    file_data = b""  # bytes fallback, consistent with the success type
    try:
        with open(filepath, "rb") as f:
            file_data = f.read()
        # Bug fix: both log lines referenced the undefined name `file_path`
        # (a NameError outside app.py's module globals); use the parameter.
        print(f"Successfully validated {filepath} and read {len(file_data)} bytes.")
    except Exception as e:
        print(f"Could not read {filepath}: {e}")
    return file_data
151
+
152
+ ###Download youtube video and extract audio using yt-dlp and ffmpeg
153
+ #### Fixing code to resolve 404 error
154
+
155
def fetch_audio_from_youtube(youtube_url: str) -> str:
    """
    Calls the extractor service and returns the signed audio URL.
    - Tries POST /extract with youtube_url as a query param (current server shape).
    - Falls back to sending youtube_url in the JSON body if needed.
    - Accepts either JSON {"audio_url": "..."} or a plain string URL.

    Returns an error-message string on failure (callers handle str errors downstream).
    """
    # FastAPI endpoint for youtube extraction: "https://<your-app-fqdn>/extract"
    EXTRACT_API = os.getenv("AZURE_CONTAINER_APP_FQDN")
    print(f"Extract_API value: {EXTRACT_API}")
    # Bug fix: a missing env var previously crashed with AttributeError on
    # None.rstrip(); return a readable configuration error instead.
    if not EXTRACT_API:
        return "Configuration error: AZURE_CONTAINER_APP_FQDN is not set."
    base = EXTRACT_API.rstrip("/")
    endpoint = base if base.endswith("/extract") else f"{base}/extract"

    payload = {"format": "wav", "sample_rate": 16000, "mono": True}
    timeout = 90

    try:
        # 1) Preferred: youtube_url as QUERY PARAM (matches the current API).
        r = requests.post(endpoint, params={"youtube_url": youtube_url},
                          json=payload, timeout=timeout)
        if r.status_code in (404, 422):
            # 2) Fallback: youtube_url in JSON body (if the API switches later).
            body = {"youtube_url": youtube_url, **payload}
            r = requests.post(endpoint, json=body, timeout=timeout)

        if r.status_code >= 400:
            # Log details before raising so failures are diagnosable.
            print("STATUS:", r.status_code)
            print("HEADERS:", r.headers)
            print("BODY:", r.text[:2000])
            r.raise_for_status()

        # Response parsing: support dict or plain string.
        ctype = r.headers.get("Content-Type", "")
        if "application/json" in ctype:
            data = r.json()
            # If the server validates response_model to dict
            if isinstance(data, dict) and "audio_url" in data:
                return data["audio_url"]
            # If the server returns a plain string in JSON (rare)
            if isinstance(data, str):
                return data
            raise ValueError(f"Unexpected JSON shape: {data}")
        else:
            # Plain text URL response_model=str
            text = r.text.strip()
            if text.startswith("http"):
                return text
            raise ValueError(f"Unexpected text response: {text[:200]}")

    except Exception as e:
        msg = (f"{datetime.now()}: Error retrieving youtube wave file from Azure instance. "
               f"url={youtube_url} endpoint={endpoint} err={e}")
        print(msg)
        return msg
209
+
210
+
211
def process_audio(upload_path, record_path, url, sys_prompt, user_prompt):
    """Gradio handler: pick the input source, transcribe if needed, summarize.

    Priority: uploaded file > recording > URL. YouTube URLs go through the
    extractor service + faster-whisper; other URLs are downloaded as mp3 and
    transcribed locally. Returns the summary text or an error-message string.
    """
    tmp_to_cleanup = []
    text_input = None
    domaincheck = None
    audio_wav = None
    # Bug fix: audio_path is now bound before the try so the except handler
    # can always reference it when logging.
    audio_path = None

    try:
        # Capture start time for logging.
        # NOTE: summarize_input historically receives a 1-tuple here and
        # unpacks it with Starttime[0]; keep passing a tuple for compatibility.
        Starttime = (datetime.now(),)
        print(f"AudioChatSummarizer API call starts at {datetime.now()}")
        if upload_path:
            audio_path = upload_path
        elif record_path:
            audio_path = record_path
        elif url and url.strip():
            # Check DNS resolution of the URL's domain before any download.
            domain = Youtubetranscription_summarizer.extract_domain(url)
            if domain:
                domaincheck = Youtubetranscription_summarizer.nslookup(domain)
            else:
                return "Invalid URL format."

            if domaincheck:
                # Route YouTube links through the extractor service.
                if re.search(r"Youtube", url, re.IGNORECASE):
                    audio_wav = fetch_audio_from_youtube(url.strip())  # server API call
                    text_input = Youtubetranscription_summarizer.transcribe_faster_whisper(audio_wav, model_name="base.en")
                    # Bug fix: text_input is a transcription dict, not a temp
                    # file path, so it is no longer added to tmp_to_cleanup.
                else:
                    audio_path = download_to_temp_mp3(url.strip())
                    tmp_to_cleanup.append(audio_path)
            else:
                return f"DNS lookup failed for {domain}"
        if not audio_path and text_input is None:
            return "Please provide content via upload, recording, or URL."
        # Transcribe audio to text via faster-whisper before sending to gpt-4o-mini
        # (gpt-4o-mini only accepts text/image_url content blocks, not audio).
        if audio_path:
            text_input = Youtubetranscription_summarizer.transcribe_faster_whisper(audio_path, model_name="base.en")
        return summarize_input(None, text_input, sys_prompt, user_prompt, Starttime)

    except Exception as e:
        # Bug fix: previously `return print(...)` returned None to the UI;
        # now the error text itself is returned (and still logged).
        msg = (f"Error processing audio at {datetime.now()}: "
               f"prompt_length={len(user_prompt or '')}, audio_path={audio_path}: {e}")
        print(msg)
        return msg

    finally:
        for p in tmp_to_cleanup:
            try:
                if os.path.exists(p):
                    os.remove(p)
            except Exception:
                pass
275
+
276
+
277
+ # --- UI ---------------------------------------------------------------------
278
+
279
# Build the Gradio UI at import time; `demo` is the app object Spaces launches.
with gr.Blocks(title="Audio Summarizer") as demo:
    gr.Markdown("# Audio File Summarizer (Azure OpenAI)")
    gr.Markdown("Upload an mp3(**YouTube is the new feature add**), record audio, or paste a URL, use the default user prompt and system prompt and click 'Summarize'.")
    gr.Markdown("Users are encouraged to modify the user and system prompts to suit their needs.")
    gr.Markdown("**Responsible Use**: This project is for educational and research purposes only. It does not intend to violate copyright, YouTube’s Terms of Service, or data rights. Users are responsible for ensuring compliance with applicable laws and platform policies when processing audio or video content. AudioSummarizer is designed as a learning tool to explore AI summarization workflows, not as a commercial service.")
    # Three alternative input sources; process_audio() prefers them in this order.
    with gr.Row():
        with gr.Column():
            upload_audio = gr.Audio(sources=["upload"], type="filepath", label="Upload mp3")
        with gr.Column():
            record_audio = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
        with gr.Column():
            url_input = gr.Textbox(label="YouTube or standard mp3 URL", placeholder="https://example.com/audio.mp3")

    ### Get system and user prompts from metadata.json file
    file_name = 'metadata.json'
    record_id = '1'
    file_path = retrieve_file_path(file_name)

    # NOTE(review): if metadata.json is missing, file_path is None and
    # retrieve_json_record will raise on open(None); the subscripts below also
    # raise if jsonrecord is None — confirm metadata.json always ships with the app.
    jsonrecord = retrieve_json_record(file_path, record_id)
    if jsonrecord:
        print(json.dumps(jsonrecord, indent=2))
    else:
        print("Record not found.")

    sysprompt_default = jsonrecord['metadata']['content']['system_prompt']['content']
    userprompt_default = jsonrecord['metadata']['content']['user_prompt']['content']

    # Editable prompt boxes, pre-filled from metadata.json.
    with gr.Row():
        userprompt_input = gr.Textbox(
            label="User Prompt",
            value=userprompt_default,
            placeholder="e.g., Extract key points and action items",
        )
        sysprompt_input = gr.Textbox(
            label="System Prompt",
            value=sysprompt_default,
        )

    submit_btn = gr.Button("Summarize")
    output = gr.Textbox(label="Summary", lines=12)

    # Log input changes server-side (the `if component:` guards are always
    # truthy for constructed components; kept for parity with the original).
    if upload_audio:
        upload_audio.change(
            fn=lambda x: print(f"Upload audio selected: {x}"),
            inputs=[upload_audio],
            outputs=[],
        )
    if record_audio:
        record_audio.change(
            fn=lambda x: print(f"Record audio selected: {x}"),
            inputs=[record_audio],
            outputs=[],
        )
    if url_input:
        url_input.change(
            fn=lambda x: print(f"URL input changed: {x}"),
            inputs=[url_input],
            outputs=[],
        )
    # Main action: route all inputs through process_audio and show the summary.
    submit_btn.click(
        fn=process_audio,
        inputs=[upload_audio, record_audio, url_input, sysprompt_input, userprompt_input],
        outputs=output,
    )
349
+
350
+
351
if __name__ == "__main__":
    # Start the Gradio server when run directly as a script.
    demo.launch()
app_v1.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import base64
3
+ import tempfile
4
+ import requests
5
+ from datetime import datetime
6
+ import gradio as gr
7
+ from dotenv import load_dotenv
8
+ from openai import AzureOpenAI # official OpenAI SDK, works with Azure endpoints
9
+ import json
10
+ import subprocess # to execute youtube-dl version
11
+ import Youtubetranscription_summarizer
12
+
13
+ # --- LLM call (Azure OpenAI with API key) -----------------------------------
14
+
15
def summarize_audio_b64(audio_b64: str, sys_prompt: str, user_prompt: str) -> str:
    """
    Calls Azure OpenAI Chat Completions with audio input (base64, sent as WAV).

    Returns the model's reply text, or an error-message string on failure.
    """
    load_dotenv()

    endpoint = os.getenv("AC_OPENAI_ENDPOINT")
    api_key = os.getenv("AC_OPENAI_API_KEY")
    deployment = os.getenv("AC_MODEL_DEPLOYMENT")
    api_version = os.getenv("AC_OPENAI_API_VERSION")

    if not endpoint or not api_key or not deployment:
        return "Server misconfiguration: required env vars missing."

    try:
        client = AzureOpenAI(
            api_key=api_key,
            api_version=api_version,
            azure_endpoint=endpoint,
        )

        system_message = sys_prompt.strip() if sys_prompt else (
            "You are an AI assistant with a charter to clearly analyze the customer enquiry."
        )
        user_text = user_prompt.strip() if user_prompt else "Summarize the audio content."

        response = client.chat.completions.create(
            model=deployment,
            messages=[
                {"role": "system", "content": system_message},
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": user_text},
                        {
                            "type": "input_audio",
                            # The pipeline produces WAV (yt-dlp/ffmpeg), hence format "wav".
                            "input_audio": {"data": audio_b64, "format": "wav"},
                        },
                    ],
                },
            ],
        )
        # Guard len() against None prompts (consistent with app.py's logging).
        print(f"Azure API call at {datetime.now()}: prompt_length={len(user_prompt or '')}, audio_size={len(audio_b64 or '')}")
        return response.choices[0].message.content

    except Exception as ex:
        # Bug fix: previously `return print(...)` returned None to the UI;
        # now the error text itself is returned (and still logged).
        msg = f"Error from Azure OpenAI: {ex}"
        print(msg)
        return msg
65
+
66
+ #----Retrieve meta data from metadata.json file------------------------------
67
def retrieve_file_path(file_name):
    """Resolve *file_name* relative to this module's directory.

    Returns the absolute path if it exists and is a regular file; otherwise
    prints a diagnostic and returns None.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(base_dir, file_name)
    if os.path.isfile(file_path):
        return file_path
    if not os.path.exists(file_path):
        print(f"'{file_path}' does not exist.")
    else:
        # Bug fix: this case (path exists but is not a regular file, e.g. a
        # directory) previously fell through and returned None silently.
        print(f"'{file_path}' exists but is not a regular file.")
    return None
76
+
77
def retrieve_json_record(file_path, record_id):
    """Return the first record whose metadata.id equals *record_id*, else None.

    The JSON file may contain either a single record object or a list of them.
    """
    with open(file_path, 'r') as fh:
        payload = json.load(fh)
    # Normalize both accepted file shapes to a list so one loop handles them.
    if isinstance(payload, list):
        candidates = payload
    elif isinstance(payload, dict):
        candidates = [payload]
    else:
        candidates = []
    for record in candidates:
        if record.get('metadata', {}).get('id') == record_id:
            return record
    return None
88
+ # --- I/O helpers ------------------------------------------------------------
89
+
90
def encode_audio_from_path(path: str) -> str:
    """Read the file at *path* and return its contents base64-encoded as str."""
    with open(path, "rb") as audio_file:
        payload = audio_file.read()
    return base64.b64encode(payload).decode("utf-8")
93
+
94
+
95
def download_to_temp_mp3(url: str) -> str:
    """Stream *url* into a temporary .mp3 file and return the file's path.

    The caller is responsible for deleting the file (delete=False).
    """
    # Bug fix: with stream=True the response must be closed explicitly or the
    # connection stays checked out; the context manager releases it even if
    # iteration raises.
    with requests.get(url, stream=True, timeout=30) as r:
        r.raise_for_status()
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    tmp.write(chunk)
            return tmp.name
103
+
104
+
105
def process_audio(upload_path, record_path, url, sys_prompt, user_prompt):
    """Gradio handler: resolve the audio source, base64-encode it, summarize.

    Priority: uploaded file > recording > URL. Returns the summary text or an
    error-message string.
    """
    tmp_to_cleanup = []
    # Bug fix: audio_path is now bound before the try so the except handler
    # can always reference it when logging.
    audio_path = None
    try:
        if upload_path:
            audio_path = upload_path
        elif record_path:
            audio_path = record_path
        elif url and url.strip():
            # URL input is delegated to the YouTube transcription helper.
            audio_path = Youtubetranscription_summarizer.main(url.strip())
            tmp_to_cleanup.append(audio_path)

        if not audio_path:
            return "Please provide an audio file via upload, recording, or URL."

        audio_b64 = encode_audio_from_path(audio_path)
        return summarize_audio_b64(audio_b64, sys_prompt, user_prompt)

    except Exception as e:
        # Bug fix: previously `return print(...)` returned None to the UI;
        # now the error text itself is returned (and still logged).
        msg = (f"Error processing audio at {datetime.now()}: "
               f"prompt_length={len(user_prompt or '')}, audio_path={audio_path}: {e}")
        print(msg)
        return msg

    finally:
        for p in tmp_to_cleanup:
            try:
                if os.path.exists(p):
                    os.remove(p)
            except Exception:
                pass
135
+
136
+
137
+ # --- UI ---------------------------------------------------------------------
138
+
139
# Build the Gradio UI at import time; `demo` is the app object Spaces launches.
with gr.Blocks(title="Audio Summarizer") as demo:
    gr.Markdown("# Audio File Summarizer (Azure OpenAI)")
    gr.Markdown("Upload a mp3, record audio, or paste a URL. The app sends base64 audio to Azure OpenAI.")

    # Three alternative input sources; process_audio() prefers them in this order.
    with gr.Row():
        with gr.Column():
            upload_audio = gr.Audio(sources=["upload"], type="filepath", label="Upload mp3")
        with gr.Column():
            record_audio = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
        with gr.Column():
            url_input = gr.Textbox(label="mp3 URL", placeholder="https://example.com/audio.mp3")

    ### Get system and user prompts from metadata.json file
    file_name = 'metadata.json'
    record_id = '1'
    file_path = retrieve_file_path(file_name)

    # NOTE(review): if metadata.json is missing, file_path is None and
    # retrieve_json_record will raise on open(None); the subscripts below also
    # raise if jsonrecord is None — confirm metadata.json always ships with the app.
    jsonrecord = retrieve_json_record(file_path, record_id)
    if jsonrecord:
        print(json.dumps(jsonrecord, indent=2))
    else:
        print("Record not found.")

    sysprompt_default = jsonrecord['metadata']['content']['system_prompt']['content']
    userprompt_default = jsonrecord['metadata']['content']['user_prompt']['content']

    # Editable prompt boxes, pre-filled from metadata.json.
    with gr.Row():
        userprompt_input = gr.Textbox(
            label="User Prompt",
            value=userprompt_default,
            placeholder="e.g., Extract key points and action items",
        )
        sysprompt_input = gr.Textbox(
            label="System Prompt",
            value=sysprompt_default,
        )

    submit_btn = gr.Button("Summarize")
    output = gr.Textbox(label="Summary", lines=12)

    # Log input changes server-side (the `if component:` guards are always
    # truthy for constructed components; kept for parity with the original).
    if upload_audio:
        upload_audio.change(
            fn=lambda x: print(f"Upload audio selected: {x}"),
            inputs=[upload_audio],
            outputs=[],
        )
    if record_audio:
        record_audio.change(
            fn=lambda x: print(f"Record audio selected: {x}"),
            inputs=[record_audio],
            outputs=[],
        )
    if url_input:
        url_input.change(
            fn=lambda x: print(f"URL input changed: {x}"),
            inputs=[url_input],
            outputs=[],
        )
    # Main action: route all inputs through process_audio and show the summary.
    submit_btn.click(
        fn=process_audio,
        inputs=[upload_audio, record_audio, url_input, sysprompt_input, userprompt_input],
        outputs=output,
    )

if __name__ == "__main__":
    # Start the Gradio server when run directly as a script.
    demo.launch()
extract/.DS_Store ADDED
Binary file (6.15 kB). View file
 
extract/Dockerfile ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ---------- Base ----------
2
+ #FROM python:3.11-slim
3
+ FROM cab337fa40e5acr.azurecr.io/python:3.11-slim
4
+
5
+ # ---------- System deps ----------
6
+ RUN apt-get update && apt-get install -y --no-install-recommends \
7
+ ffmpeg ca-certificates curl \
8
+ && rm -rf /var/lib/apt/lists/*
9
+
10
+ # ---------- Workdir ----------
11
+ WORKDIR /workspace
12
+
13
+ # ---------- Python deps ----------
14
+ # requirements.txt is at AUDIOSUMMARIZER/extract/requirements.txt
15
+
16
+ COPY app/requirements.txt .
17
+ RUN pip install --no-cache-dir --upgrade pip \
18
+ && pip install --no-cache-dir -r requirements.txt
19
+
20
+ # ---------- App code ----------
21
+ # Copy EVERYTHING under AUDIOSUMMARIZER/extract (includes subfolders: app/ and utils/)
22
+ COPY . /workspace/extract
23
+
24
+ # Make /workspace importable so "extract.app.Youtubeextraction" & "app.utils..." work
25
+ ENV PYTHONPATH=/workspace
26
+
27
+ # Runtime env (override at deploy)
28
+ ENV HOST=0.0.0.0
29
+ ENV PORT=8080
30
+ ENV AZURE_STORAGE_ACCOUNT=__SET_AT_DEPLOY__
31
+ ENV AZURE_BLOB_CONTAINER=__SET_AT_DEPLOY__
32
+ ENV COOKIES_ACCOUNT=__SET_AT_DEPLOY__
33
+ ENV COOKIES_CONTAINER=__SET_AT_DEPLOY__
34
+ ENV COOKIES_BLOB=__SET_AT_DEPLOY__
35
+ ENV COOKIES_PATH=__SET_AT_DEPLOY__
36
+ ENV COOKIES_REFRESH_SEC=__SET_AT_DEPLOY__
37
+
38
+ EXPOSE 8080
39
+
40
+ # Your ASGI app is defined in extract/app/Youtubeextraction.py as `app = FastAPI()`
41
+ CMD ["uvicorn", "extract.app.Youtubeextraction:app", "--host", "0.0.0.0", "--port", "8080"]
extract/__init__.py ADDED
File without changes
extract/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (153 Bytes). View file
 
extract/app/.DS_Store ADDED
Binary file (6.15 kB). View file
 
extract/app/Youtubeextraction.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, tempfile, subprocess, re, json, shutil, time
2
+ from fastapi import FastAPI, HTTPException
3
+ from pathlib import Path
4
+ from typing import Optional, Callable, Any
5
+ import yt_dlp
6
+ # from utils.storage import upload_and_sign # To remove circular import issue
7
+ from extract.utils.storage import upload_and_sign # To remove circular import issue
8
+ from extract.utils.retrieve_filepath import retrieve_file_path # To get the file path of cookies.txt
9
+ from extract.utils.cookies_refresher import start_cookies_refresher # To refresh cookies.txt periodically
10
+
11
+ app = FastAPI()
12
+
13
def ensure_ffmpeg():
    """
    Verify that ffmpeg is available in PATH.
    Raises RuntimeError with helpful guidance if missing.
    Prints ffmpeg version to logs if found.
    """
    ffmpeg_path = shutil.which("ffmpeg")
    if ffmpeg_path is None:
        raise RuntimeError(
            "FFmpeg not found in PATH.\n\n"
            "👉 For Hugging Face Spaces:\n"
            "  • If using Gradio/Streamlit template → add a `packages.txt` file at repo root with a line: ffmpeg\n"
            "  • If using Docker template → add `apt-get install -y ffmpeg` in your Dockerfile\n\n"
            "Without ffmpeg, yt-dlp cannot extract/convert audio."
        )

    try:
        result = subprocess.run(
            ["ffmpeg", "-version"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # merge stderr into stdout for one stream
            text=True,
            check=False,  # version probe is best-effort; no exception on nonzero exit
        )
        print("✅ ffmpeg found at:", ffmpeg_path)
        print(result.stdout.splitlines()[0])  # show first line of version info
    except Exception as e:
        # The binary exists on PATH but could not be executed (permissions, arch, ...).
        raise RuntimeError(f"ffmpeg was found at {ffmpeg_path} but could not run: {e}")
41
+
42
class YTDLPError(RuntimeError):
    """Raised when yt-dlp or its required tooling fails."""
44
+
45
def _require(bin_name: str):
    """Raise YTDLPError unless *bin_name* is an executable found on PATH."""
    resolved = shutil.which(bin_name)
    if resolved is None:
        raise YTDLPError(f"Required executable '{bin_name}' not found in PATH.")
48
+
49
+
50
@app.get("/health")
def health():
    """Liveness endpoint: returns {"ok": True} unconditionally."""
    return {"ok": True}
53
+
54
@app.post("/extract")
def extract(
    youtube_url: str,
    out_dir: Optional[str] = None,
    target_sr: int = 16000,
    target_channels: int = 1,
    quiet: bool = True,
    keep_intermediate: bool = False,
    progress_hook: Optional[Callable[[dict[str, Any]], None]] = None,
) -> str:
    """
    Download YouTube audio via yt_dlp's Python API, extract to WAV,
    post-process with ffmpeg to 16 kHz mono, then upload the result and
    return a short-lived signed URL to it (see upload_and_sign below).

    Args
    ----
    youtube_url : str
    out_dir : Optional[str]   Directory for outputs (temp dir if None).
    target_sr : int           Sample rate for final WAV (default 16000).
    target_channels : int     Channels for final WAV (default 1 = mono).
    quiet : bool              Suppress yt-dlp logs if True.
    keep_intermediate : bool  Keep the pre-downsampled WAV if True.
    progress_hook : callable  Optional yt-dlp progress hook.

    Raises
    ------
    ValueError for a missing/invalid youtube_url; YTDLPError if ffmpeg is absent.

    NOTE(review): on downstream failures this handler returns plain error
    message strings while success returns a URL — callers cannot reliably
    distinguish the two; consider raising HTTPException instead.
    """
    if not youtube_url or not isinstance(youtube_url, str):
        raise ValueError("youtube_url must be a non-empty string.")

    _require("ffmpeg")  # we call ffmpeg ourselves
    # yt-dlp bundles ffmpeg via postprocessors, but we still run ffmpeg explicitly

    work_dir = Path(out_dir or tempfile.mkdtemp(prefix="ytwav_")).resolve()
    work_dir.mkdir(parents=True, exist_ok=True)

    # First stage: let yt-dlp extract WAV (whatever SR/channels)
    out_template = str(work_dir / "%(title).100B [%(id)s].%(ext)s")
    hooks = [progress_hook] if progress_hook else []
    # Kick off the background cookies refresher, then read the local cookie path
    # it maintains (required for authenticated YouTube downloads).
    start_cookies_refresher()
    cookies_path = os.getenv("COOKIES_PATH")
    print(f"cookies_path value: {cookies_path}")
    if not cookies_path:
        cookies_path = None  # NOTE(review): dead assignment — the next line returns
        print("Cookie file NOT found in container!")
        return f"User authentication cookie file NOT found in container! Please try again later."

    ydl_opts = {
        "cookiefile": cookies_path,
        "format": "bestaudio/best",
        "outtmpl": out_template,
        "noplaylist": True,
        "postprocessors": [
            {
                # yt-dlp runs ffmpeg to decode the downloaded stream to WAV.
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                "preferredquality": "0",
            }
        ],
        "quiet": quiet,
        "verbose": not quiet,
        "no_warnings": quiet,
        "progress_hooks": hooks,
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.extract_info(youtube_url, download=True)
    except Exception as e:
        # Errors are surfaced as message strings (see NOTE in the docstring).
        return f"yt-dlp API failed: {e}"

    # Locate the produced WAV (pre-downsampled); pick the newest if several exist.
    pre_wavs = list(work_dir.glob("*.wav"))
    if not pre_wavs:
        return "yt-dlp completed but no WAV was found."
    pre_wav = max(pre_wavs, key=lambda p: p.stat().st_mtime)

    # Second stage: force 16 kHz mono via ffmpeg
    final_wav = pre_wav.with_name(pre_wav.stem + f".{target_sr}Hz.{target_channels}ch.wav")
    try:
        subprocess.run(
            [
                "ffmpeg", "-y",
                "-i", str(pre_wav),
                "-ac", str(target_channels),
                "-ar", str(target_sr),
                str(final_wav),
            ],
            check=True,
            stdout=subprocess.PIPE if quiet else None,
            stderr=subprocess.PIPE if quiet else None,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        return f"ffmpeg failed to resample: {e.stderr or e.stdout}"

    # 3) upload + sign (short-lived URL, 45 minutes)
    signed = upload_and_sign(final_wav, ttl_minutes=45)

    # Clean up intermediates if desired
    if not keep_intermediate:
        try:
            if pre_wav.exists() and pre_wav != final_wav:
                pre_wav.unlink()
        except Exception:
            pass

    return signed
extract/app/__init__.py ADDED
File without changes
extract/app/__pycache__/Youtubeextraction.cpython-313.pyc ADDED
Binary file (7.25 kB). View file
 
extract/app/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (157 Bytes). View file
 
extract/app/requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ dotenv==0.9.9
2
+ requests==2.32.5
3
+ azure-identity==1.25.0
4
+ yt_dlp==2025.9.23
5
+ fastapi
6
+ uvicorn[standard]==0.30.6
7
+ azure-storage-blob==12.20.0
extract/utils/__init__.py ADDED
File without changes
extract/utils/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (149 Bytes). View file
 
extract/utils/__pycache__/cookies_refresher.cpython-313.pyc ADDED
Binary file (4.27 kB). View file
 
extract/utils/__pycache__/retrieve_filepath.cpython-313.pyc ADDED
Binary file (890 Bytes). View file
 
extract/utils/__pycache__/storage.cpython-313.pyc ADDED
Binary file (2.76 kB). View file
 
extract/utils/cookies_refresher.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, time, hashlib, tempfile, threading
2
+ from azure.identity import DefaultAzureCredential
3
+ from azure.storage.blob import BlobClient
4
+ from dotenv import load_dotenv
5
+
6
# Load .env so local runs pick up the same settings as the deployed container.
load_dotenv()
# NOTE(review): the default "yt-extractor-rg" looks like a resource-group name,
# not a storage-account name — verify the intended default.
ACCOUNT = os.getenv("AZURE_STORAGE_ACCOUNT","yt-extractor-rg") # storage account name
CONTAINER= os.getenv("COOKIES_CONTAINER","cookies") # container name
BLOB = os.getenv("COOKIES_BLOB","cookies.txt") # blob name
OUT_PATH = os.getenv("COOKIES_PATH","/tmp/cookies.txt") # local path to write cookies
REFRESH = int(os.getenv("COOKIES_REFRESH_SEC", "600")) # Default to 10 minutes
12
+
13
def _sha256(b: bytes) -> str:
    """Return the hexadecimal SHA-256 digest of *b*."""
    return hashlib.sha256(b).hexdigest()
14
def _read(path: str) -> bytes:
    """Return the file's contents, or b"" when the file cannot be read.

    Only filesystem errors (missing file, permissions, ...) are swallowed;
    the original bare ``except:`` also hid KeyboardInterrupt/SystemExit.
    """
    try:
        with open(path, "rb") as f:
            return f.read()
    except OSError:
        return b""
18
+
19
def _atomic_write(path: str, data: bytes):
    """Atomically replace *path* with *data*.

    Writes to a temporary file in the same directory, then ``os.replace``s it
    over the target so readers never observe a partially written file.
    The chmod to 0600 is best-effort (the cookies file holds credentials);
    the original bare ``except:`` is narrowed to OSError.
    """
    d = os.path.dirname(path) or "."
    os.makedirs(d, exist_ok=True)
    fd, tmp = tempfile.mkstemp(prefix=".cookies.", dir=d)
    with os.fdopen(fd, "wb") as f:
        f.write(data)
    os.replace(tmp, path)
    try:
        os.chmod(path, 0o600)
    except OSError:
        pass
27
+
28
def refresh_once():
    """Download the cookies blob once and persist it locally if it changed."""
    if not ACCOUNT:
        print("[cookies] ACCOUNT not set")
        return
    # Managed identity inside Azure Container Apps; developer creds locally.
    blob_client = BlobClient(
        account_url=f"https://{ACCOUNT}.blob.core.windows.net",
        container_name=CONTAINER,
        blob_name=BLOB,
        credential=DefaultAzureCredential(),
    )
    payload = blob_client.download_blob(max_concurrency=1).readall()
    if not payload.strip():
        print("[cookies] WARN: blob is empty; skipping")
        return
    # Rewrite the local copy only when the content actually changed.
    if _sha256(payload) != _sha256(_read(OUT_PATH)):
        _atomic_write(OUT_PATH, payload)
        print(f"[cookies] updated -> {OUT_PATH} (bytes={len(payload)})")
45
+
46
def start_cookies_refresher():
    """Fetch cookies immediately, then keep refreshing on a daemon thread."""
    # Initial fetch before serving traffic; failures are logged, not fatal.
    try:
        refresh_once()
    except Exception as e:
        print(f"[cookies] initial refresh error: {e}")

    def _loop():
        # Periodic refresh; any error is logged and the loop keeps running.
        while True:
            time.sleep(REFRESH)
            try:
                refresh_once()
            except Exception as e:
                print(f"[cookies] refresh error: {e}")

    threading.Thread(target=_loop, daemon=True).start()
extract/utils/probeytdlp.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import yt_dlp, traceback, sys, os
3
+ from http.cookiejar import MozillaCookieJar
4
+
5
class YDLLogger:
    """Minimal logger handed to yt-dlp; forwards tagged messages to stdout."""

    def debug(self, msg):
        print("[DEBUG]", msg)

    def warning(self, msg):
        print("[WARN]", msg)

    def error(self, msg):
        print("[ERROR]", msg)
9
+
10
def probe(url, cookies=None):
    """Probe *url* with yt-dlp (no download) and print format diagnostics.

    Args:
        url: Video page URL to inspect.
        cookies: Optional path to a Netscape-format cookies file.

    Returns:
        The yt-dlp info dict on success; implicitly None if extraction raised.
    """
    ydl_opts = {
        "format": "bestaudio/best",
        "cachedir": False,
        "logger": YDLLogger(),
        "no_warnings": False,
        "quiet": False,
        # don't try postprocessing during probe
        "postprocessors": [],
        # helpful to mimic a browser if site is picky:
        "http_headers": {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36"},
    }
    if cookies:
        ydl_opts["cookiefile"] = cookies

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            print("Probing (no download)...")
            info = ydl.extract_info(url, download=False)
            print("Top-level keys in info:", list(info.keys()))
            formats = info.get("formats")
            if formats:
                print("Found formats (count):", len(formats))
                # Show at most the first 10 formats to keep output readable.
                for f in formats[:10]:
                    print(f" - id={f.get('format_id')}, ext={f.get('ext')}, abr={f.get('abr')}, vbr={f.get('vbr')}, note={f.get('format_note')}")
            else:
                # No format list: dump a few top-level fields for diagnosis
                # (e.g. playlists expose "entries", live streams "is_live").
                print("No formats found. Inspecting other info fields:")
                for k in ("webpage_url", "extractor", "requested_formats", "is_live", "entries"):
                    print(f"  {k}: {info.get(k)}")
            return info
    except Exception as e:
        # Broad catch is deliberate: this is a diagnostic CLI helper, so we
        # print the traceback instead of crashing the caller.
        print("EXCEPTION during probe:")
        traceback.print_exc()
        # also dump any HTML/diagnostic text if available in exception text
        print("Exception message:", str(e))
45
+
46
if __name__ == "__main__":
    # Optional first CLI argument: path to a Netscape-format cookies file.
    cookies = None
    if len(sys.argv) > 1:
        cookies = sys.argv[1]
        if not os.path.isfile(cookies):
            print(f"Cookie file '{cookies}' not found.")
            sys.exit(1)
    # Hard-coded sample video used for the probe.
    url = "https://www.youtube.com/watch?v=wDchsz8nmbo"
    probe(url, cookies)
extract/utils/retrieve_filepath.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
def retrieve_file_path(file_name):
    """Return the absolute path of *file_name* next to this module, or None.

    Looks for *file_name* in the directory containing this source file.
    Returns the full path when it is an existing regular file; otherwise
    returns None, printing a notice only when the path does not exist at all
    (an existing non-file, e.g. a directory, is silently rejected — same as
    the original behavior).
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(base_dir, file_name)
    if os.path.isfile(file_path):
        return file_path
    if not os.path.exists(file_path):
        print(f"'{file_path}' does not exist.")
    return None
extract/utils/storage.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ import os, uuid
3
+ from datetime import datetime, timedelta, timezone
4
+ from azure.identity import ManagedIdentityCredential, DefaultAzureCredential
5
+ from azure.storage.blob import (
6
+ BlobServiceClient, generate_blob_sas, BlobSasPermissions
7
+ )
8
+
9
# Load .env so local runs share configuration with the deployed app.
load_dotenv()
# Storage account and container for uploaded audio blobs.
ACCOUNT_NAME = os.getenv("AZURE_STORAGE_ACCOUNT","ytstore7135")
CONTAINER = os.getenv("AZURE_BLOB_CONTAINER","audio")
12
+
13
def _credential():
    """Credential for Blob access.

    DefaultAzureCredential tries managed identity when running in Azure and
    falls back to developer credentials locally.
    """
    return DefaultAzureCredential(exclude_interactive_browser_credential=False)
17
+
18
def _svc_client():
    """Build a BlobServiceClient for the configured storage account."""
    account_url = f"https://{ACCOUNT_NAME}.blob.core.windows.net"
    return BlobServiceClient(account_url=account_url, credential=_credential())
21
+
22
def upload_and_sign(local_path: str, ttl_minutes: int = 45) -> str:
    """Upload a local WAV file and return a short-lived read-only SAS URL.

    The blob is stored under a random UUID prefix so concurrent uploads of
    identically named files never collide. Signing uses a user delegation
    key, so no storage account key is required.

    Args:
        local_path: Path of the WAV file to upload.
        ttl_minutes: Lifetime of the returned read-only SAS link.

    Returns:
        The blob URL with the SAS token appended as a query string.
    """
    service = _svc_client()
    blob_name = f"{uuid.uuid4()}/{os.path.basename(local_path)}"
    blob_client = service.get_blob_client(container=CONTAINER, blob=blob_name)
    with open(local_path, "rb") as fh:
        blob_client.upload_blob(fh, overwrite=True, content_type="audio/wav")

    # Delegation-key window starts slightly in the past to absorb clock skew.
    delegation_key = service.get_user_delegation_key(
        key_start_time=datetime.now(timezone.utc) - timedelta(minutes=5),
        key_expiry_time=datetime.now(timezone.utc) + timedelta(hours=2),
    )
    sas_token = generate_blob_sas(
        account_name=ACCOUNT_NAME,
        container_name=CONTAINER,
        blob_name=blob_name,
        user_delegation_key=delegation_key,
        permission=BlobSasPermissions(read=True),
        expiry=datetime.now(timezone.utc) + timedelta(minutes=ttl_minutes),
    )
    return f"{blob_client.url}?{sas_token}"
gradio_client_audichattranscriber.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ import gradio as gr
3
+ from dotenv import load_dotenv
4
+ from gradio_client import Client # Gradio client for Hugging Face models
5
+
6
def main():
    """Call the AudioChatTranscriber Gradio app hosted on Hugging Face.

    Sends a sample audio URL plus system/user prompts to the Space's
    ``/process_audio`` endpoint.

    Returns:
        The summarized transcript on success, or the error message string
        if the remote call fails.
    """
    load_dotenv()  # Load .env file for HF token if needed

    try:
        client = Client("samir72/AudioChatTranscriber")  # Hugging Face model with Gradio app
        response = client.predict(
            upload_path=None,
            record_path=None,
            url="https://audio-samples.github.io/samples/mp3/blizzard_biased/sample-0.mp3",
            sys_prompt="You are an AI assistant with a listening charter to clearly analyze the customer enquiry.",
            user_prompt="Summarize the audio content",
            api_name="/process_audio",
        )
        print(f"Gradio API call at {datetime.now()}")
        print(f"Summarized Output : {response}")
        return response
    except Exception as ex:
        # Bug fix: the original `return print(...)` always returned None
        # (print's return value); now the error text is both printed and
        # returned so callers can distinguish failure from an empty result.
        error_message = f"Error calling Gradio app: {ex}"
        print(error_message)
        return error_message
+
32
+
33
+
34
+ if __name__ == "__main__":
35
+ main()
metadata.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata":
3
+ {
4
+ "id": "1",
5
+ "timestamp": "2025-09-18T11:24:00Z",
6
+ "type": "prompt_metadata",
7
+ "content": {
8
+ "system_prompt": {
9
+ "title": "System Prompt",
10
+ "content": "You are a highly capable AI assistant designed to process and analyze multimodal inputs, including audio and text. Your task is to generate a structured response based on the provided input, following a specific format. When an image is uploaded along with a text prompt, analyze the image content and integrate it with the text to produce a coherent response. Use the following format for your response: - **Summary**: Provide a concise summary of the main points from the text and image (if applicable), limited to 2-3 sentences. - **Key Details**: List 3-5 key details extracted from the text and image, presented as bullet points. - **Insights**: Offer 1-2 insightful observations or conclusions based on the content, keeping it brief. Ensure your response is accurate, relevant, and tailored to the input. If the input lacks sufficient information, acknowledge the limitation and provide a general response based on the available data.",
11
+ "version": "1.0",
12
+ "created_at": "2025-09-18T11:24:00Z"
13
+ },
14
+ "user_prompt": {
15
+ "title": "User Prompt",
16
+ "content": "Analyze the audio file to provide a structured response. Make sure to include a summary, key details, and insights based on the combined information.",
17
+ "version": "1.0",
18
+ "created_at": "2025-09-18T11:25:00Z"
19
+ }
20
+ },
21
+ "tags": ["AI", "multimodal", "podcast", "AWS"],
22
+ "status": "active"
23
+ },
24
+ "metadata_2":
25
+ {
26
+ "id": "2",
27
+ "timestamp": "2025-09-18T11:24:00Z",
28
+ "type": "prompt_metadata",
29
+ "content": {
30
+ "system_prompt": {
31
+ "title": "System Prompt",
32
+ "content": "You are a highly capable AI assistant designed to process and analyze multimodal inputs, including text and images. Your task is to generate a structured response based on the provided input, following a specific format. When an image is uploaded along with a text prompt, analyze the image content and integrate it with the text to produce a coherent response. Use the following format for your response: - **Summary**: Provide a concise summary of the main points from the text and image (if applicable), limited to 2-3 sentences. - **Key Details**: List 3-5 key details extracted from the text and image, presented as bullet points. - **Insights**: Offer 1-2 insightful observations or conclusions based on the content, keeping it brief. Ensure your response is accurate, relevant, and tailored to the input. If the input lacks sufficient information, acknowledge the limitation and provide a general response based on the available data.",
33
+ "version": "1.0",
34
+ "created_at": "2025-09-18T11:00:00Z"
35
+ },
36
+ "user_prompt": {
37
+ "title": "User Prompt",
38
+ "content": "Analyze the attached image and the following text to provide a structured response. Make sure to include a summary, key details, and insights based on the combined information.",
39
+ "version": "1.0",
40
+ "created_at": "2025-09-18T11:10:00Z"
41
+ }
42
+ },
43
+ "tags": ["AI", "multimodal", "podcast", "AWS"],
44
+ "status": "active"
45
+ }
46
+ }
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dotenv==0.9.9
2
+ gradio==5.46.1
3
+ requests==2.32.5
4
+ azure-identity==1.25.0
5
+ azure-ai-projects==1.0.0
6
+ numpy==1.26.4
7
+ openai==1.107.3
8
+ yt_dlp==2025.9.23
9
+ faster_whisper==1.2.0
10
+ fastapi
11
+ uvicorn[standard]==0.30.6
12
+ azure-storage-blob==12.20.0