GitHub Actions committed on
Commit
92ddce4
·
0 Parent(s):

Clean sync from GitHub - no large files in history

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
.github/workflows/main.yml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# CI/CD: mirror this GitHub repository to the Hugging Face Space on every
# push to main (or manually via workflow_dispatch). Requires an HF_TOKEN
# repository secret with write access to the Space.
name: Sync to Hugging Face Space

on:
  push:
    branches:
      - main
  workflow_dispatch:

jobs:
  sync-to-space:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 1 # Shallow clone to avoid large files in history
          lfs: false # Don't fetch LFS files since we don't use them

      - name: Push to Hugging Face Space
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          # Fail loudly and show each command
          set -euxo pipefail
          # Configure git
          git config --global user.email "actions@github.com"
          git config --global user.name "GitHub Actions"
          git config --global credential.helper ""
          export GIT_TERMINAL_PROMPT=0
          echo "Current branch:"
          git branch --show-current || true
          echo "Git remotes:"
          git remote -v
          # Add/replace remote with token auth (note 'user' here)
          git remote remove hf 2>/dev/null || true
          git remote add hf "https://user:${HF_TOKEN}@huggingface.co/spaces/samir72/AudioChatTranscriber"
          echo "Testing authentication with git ls-remote..."
          git ls-remote hf
          echo "Creating fresh orphan branch without history..."
          # Create a new branch with only current state (no history with large files)
          # NOTE(review): `git add -A` also stages untracked clutter such as
          # .DS_Store and __pycache__ unless .gitignore covers them — confirm.
          git checkout --orphan temp-clean-branch
          git add -A
          git commit -m "Clean sync from GitHub - no large files in history"
          echo "Force pushing clean branch to HF Space..."
          git push --force hf temp-clean-branch:main
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
bin
obj
*.sln
.env*
venv/

# Python bytecode/caches — a __pycache__ directory with .pyc files is
# currently committed to this repo; ignore them going forward.
__pycache__/
*.pyc

# macOS Finder metadata — a .DS_Store is currently committed; ignore it.
.DS_Store
FoundationCode.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Add references
2
+ from azure.identity import DefaultAzureCredential
3
+ from azure.ai.projects import AIProjectClient
4
+ import gradio as gr
5
+ from dotenv import load_dotenv
6
+ import requests
7
+ import os
8
+ import tempfile
9
+ import base64
10
+
11
# Summarize base64-encoded audio with an Azure AI Foundry chat model.
def summarize_audio(audio_data, sysprompt, userprompt):
    """Send base64 audio plus a user question to the deployed chat model.

    Args:
        audio_data: Base64-encoded audio payload (declared as mp3 format).
        sysprompt: System prompt; a default analyst prompt is used when falsy.
        userprompt: Question about the audio; when falsy no request is made.

    Returns:
        The model's reply text, or a human-readable status/error string.
        (The original returned a possibly-unbound `response` after an
        exception or an empty prompt, raising NameError.)
    """
    try:
        # Get configuration settings
        load_dotenv()
        project_endpoint = os.getenv("AC_PROJECT_ENDPOINT")
        model_deployment = os.getenv("AC_MODEL_DEPLOYMENT")

        # Initialize the project client
        project_client = AIProjectClient(
            credential=DefaultAzureCredential(
                exclude_environment_credential=True,
                exclude_managed_identity_credential=True
            ),
            endpoint=project_endpoint,
        )

        # Get a chat client
        openai_client = project_client.get_openai_client(api_version="2024-10-21")

        # Initialize prompts
        if sysprompt:
            system_message = sysprompt
        else:
            system_message = "You are an AI assistant with a charter to clearly analyse the customer enquiry."

        # The original looped "until quit", but a UI call asks exactly one
        # question, so the loop is unnecessary.
        prompt = userprompt if userprompt else ""
        if not prompt:
            return "Please enter a question."

        print("Getting a response ...\n")

        # Get a response to audio input
        response = openai_client.chat.completions.create(
            model=model_deployment,
            messages=[
                {"role": "system", "content": system_message},
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt
                        },
                        {
                            "type": "input_audio",
                            "input_audio": {
                                "data": audio_data,
                                "format": "mp3"
                            }
                        }
                    ]
                }
            ]
        )
        print(response.choices[0].message.content)
        return response.choices[0].message.content
    except Exception as ex:
        # Surface the failure to the UI instead of crashing with NameError
        # on the previously unbound `response`.
        print(ex)
        return f"Failed to summarize audio: {ex}"
89
+
90
def encode_audio(audio_file, action):
    """Base64-encode audio content.

    Args:
        audio_file: A filesystem path (action="Read") or raw bytes
            (action="Download").
        action: "Read" to load from disk, "Download" for in-memory bytes.

    Returns:
        Base64-encoded UTF-8 string of the audio content.

    Raises:
        ValueError: If encoding fails or *action* is not recognized
            (previously an unknown action silently returned None).
    """
    try:
        if action == "Read":
            # Use a distinct handle name — the original shadowed the
            # `audio_file` parameter with the open file object.
            with open(audio_file, 'rb') as fh:
                return base64.b64encode(fh.read()).decode('utf-8')
        elif action == "Download":
            return base64.b64encode(audio_file).decode('utf-8')
        raise ValueError(f"Unknown action: {action!r} (expected 'Read' or 'Download')")
    except ValueError:
        raise
    except Exception as e:
        raise ValueError(f"Failed to encode audio file: {str(e)}")
103
+
104
def download_wav_from_url(url):
    """Download the audio file at *url* and return its raw bytes.

    Returns None when *url* is falsy.

    Raises:
        ValueError: On any network/HTTP failure.
    """
    if not url:
        return None
    try:
        # Bound the request — without a timeout a stalled server would
        # hang the Gradio handler indefinitely.
        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()
        return response.content
    except Exception as e:
        raise ValueError(f"Failed to download WAV from URL: {str(e)}")
113
+
114
def process_audio(upload_audio, record_audio, url, sysprompt, userprompt):
    """Resolve one audio source, base64-encode it, and return the summary.

    Exactly one source is used, preferred in order: upload, recording, URL.
    Upload/recording provide file paths; the URL branch yields raw bytes.

    Returns:
        The summary string, or a prompt asking the user to supply audio.
    """
    audio_data = None

    if upload_audio:
        audio_data = encode_audio(upload_audio, "Read")
    elif record_audio:
        audio_data = encode_audio(record_audio, "Read")
    elif url:
        content = download_wav_from_url(url)
        if content:
            audio_data = encode_audio(content, "Download")

    if not audio_data:
        return "Please provide an audio file via upload, recording, or URL."

    # The original appended the base64 *string* to a temp-file list and then
    # called os.path.exists()/os.remove() on it in a finally block; no temp
    # files are ever created here, so that bogus cleanup was removed.
    return summarize_audio(audio_data, sysprompt, userprompt)
141
+
142
# Gradio UI: three alternative audio sources (upload, microphone, URL) plus
# prompt overrides; process_audio prefers upload > recording > URL.
with gr.Blocks(title="Audio Summarizer UI") as demo:
    gr.Markdown("# Audio File Summarizer")
    gr.Markdown("Upload a WAV file, record audio, or provide a URL to a WAV file for summarization.")

    with gr.Row():
        with gr.Column():
            # type="filepath" hands process_audio a path on disk, not bytes.
            upload_audio = gr.Audio(sources="upload", type="filepath", label="Upload WAV File")
        with gr.Column():
            record_audio = gr.Audio(sources="microphone", type="filepath", label="Record Audio")
        with gr.Column():
            url_input = gr.Textbox(label="Enter URL to WAV File", placeholder="https://example.com/audio.wav")
        with gr.Column():
            userprompt_input = gr.Textbox(label="Enter User Prompt", placeholder="Ask a question about the audio",value="Summarize the audio content")
        with gr.Column():
            sysprompt_input = gr.Textbox(label="Enter System Prompt",value="You are an AI assistant with a listening charter to clearly analyse the customer enquiry.")

    submit_btn = gr.Button("Summarize")
    output = gr.Textbox(label="Summary", lines=10)

    # Input order must match process_audio's signature:
    # (upload, record, url, sysprompt, userprompt).
    submit_btn.click(
        fn=process_audio,
        inputs=[upload_audio, record_audio, url_input,sysprompt_input,userprompt_input],
        outputs=output
    )

# NOTE(review): launches at import time — expected for HF Spaces, but an
# `if __name__ == "__main__":` guard would make the module importable.
demo.launch()
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Sayed A Rizvi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: AudioSummarizer
3
+ emoji: 📚
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 5.46.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ # AudioSummarizer
14
+
15
+ ## What’s New (May 3, 2026)
16
+ - **Gradio upgraded to 5.46.1** to resolve a pip dependency conflict between `gradio==5.45.0` and `gradio==5.46.1`.
17
+ - **GitHub Actions CI/CD workflow added** — every push to `main` automatically syncs the repo to the Hugging Face Space (`samir72/AudioChatTranscriber`). Requires an `HF_TOKEN` secret configured in the GitHub repository settings.
18
+ - **Model switched to gpt-4o-mini** — replaced Phi-4-multimodal-instruct with `gpt-4o-mini` via Azure OpenAI to resolve `DeploymentNotFound` errors. Set `AC_MODEL_DEPLOYMENT=gpt-4o-mini` in your environment or HF Space secrets.
19
+ - **Audio pipeline updated for gpt-4o-mini compatibility** — `gpt-4o-mini` does not support native audio content blocks. All audio inputs (upload, microphone, direct MP3 URL) are now transcribed locally via **faster-whisper** before being sent to the model as text, eliminating the `400 invalid_request_error`.
20
+
21
+ ---
22
+
23
+ ## What’s New (Sep 26–28, 2025)
24
+ - **YouTube cookie refresh & expiry handling** added to avoid sign-in/download failures.
25
+ - **DNS lookup improvements**: automatically skip DNS failures on Hugging Face Spaces to reduce false negatives.
26
+ - **Azure Container App (ACA) integration**: bypasses YouTube blocking by offloading audio download to Azure, storing audio in Blob Storage, and feeding it into the HF pipeline.
27
+ - **Docker / ACA enhancements**: uses Microsoft slim base image in ACR for faster builds, with trade-off that the base must be regularly refreshed.
28
+ - **Repo restructuring**: renamed the app entry folder to `extract/` to resolve a Hugging Face build conflict.
29
+
30
+ ---
31
+
32
+ ## Overview
33
+ AudioSummarizer is a web app (deployed on Hugging Face Spaces) that summarizes audio from multiple sources — file upload, microphone, or URL (YouTube / direct MP3) — using **gpt-4o-mini** via Azure OpenAI for structured summarization. The app uses **faster‑whisper** for transcription and **yt-dlp** + **ffmpeg** for audio extraction, with a clean **Gradio** UI. Prompts are loaded from `metadata.json` to ensure replies include **Summary**, **Key Details**, and **Insights**.
34
+
35
+ Because Hugging Face often cannot directly fetch YouTube audio (due to network restrictions or blocking), we now route YouTube downloads through an **Azure Container App** which:
36
+
37
+ 1. Fetches the YouTube audio independently.
38
+ 2. Stores the processed 16 kHz mono WAV file in **Azure Blob Storage**.
39
+ 3. Serves that file into the usual transcription/summarization pipeline in the HF app.
40
+
41
+ Thus, the HF interface remains unchanged to users, but YouTube support is restored reliably via Azure.
42
+
43
+ ---
44
+
45
+ ## Features
46
+ - Upload a local MP3 file, record via microphone, or enter a YouTube / MP3 URL.
47
+ - **Azure Container App support** so YouTube content is reliably processed even if Hugging Face cannot fetch it.
48
+ - Prompts fully customizable: you may define system and user prompts stored in `metadata.json`.
49
+ - Transcription using **faster-whisper**, summarization through **gpt-4o-mini** (Azure OpenAI).
50
+ - Clean and minimal **Gradio** UI for intuitive interaction.
51
+ - Configuration via environment variables (`.env`) for Azure endpoint, deployment name, API key, etc.
52
+ - YouTube audio extraction to **16 kHz mono WAV** (via yt-dlp + ffmpeg).
53
+ - DNS‑based URL validation, with automatic skip of DNS errors in HF Spaces to reduce false rejections.
54
+
55
+ ---
56
+
57
+ ## Architecture / Data Flow
58
+
59
+ ```
60
+ User Input (YouTube) ──▶ Hugging Face UI
61
+
62
+ └── If URL is YouTube:
63
+ ─▶ forwarded to Azure Container App
64
+ ├── ACA downloads YouTube audio (yt-dlp)
65
+ └── Converts/stores WAV in Azure Blob Storage
66
+ ─▶ HF app fetches WAV from Blob Storage
67
+ ├── Transcribe via faster-whisper
68
+ └── Summarize via Azure gpt-4o-mini
69
+
70
+
71
+ ┌───────────────┐ file/mic/url ┌───────────────────────────┐
72
+ │ Gradio UI │─────────────▶│ process_audio(...) │
73
+ └──────┬────────┘ └──────────┬─────────────────┘
74
+ │ validates/reads │
75
+ ▼ ▼
76
+ ┌───────────────────────────┐ ┌─────────────────────────────┐
77
+ │ summarize_input(audio,...)│──▶│ Azure gpt-4o-mini │
78
+ └───────────────────────────┘ │ Chat Completions (text+audio)│
79
+ └──────────────────────────────┘
80
+
81
+ YouTube Path (via ACA):
82
+ ┌───────────────┐ YouTube URL ┌──────────────────────────────┐
83
+ │ Gradio UI │────────────▶ │ Azure Container App (yt-dlp) │
84
+ └───────────────┘ └──────────┬───────────────────┘
85
+ │ uploads audio
86
+
87
+ ┌──────────────────────────────┐
88
+ │ Azure Blob Storage (WAV 16k) │
89
+ └──────────┬───────────────────┘
90
+
91
+
92
+ ┌──────────────────────────────┐
93
+ │ faster-whisper transcription │
94
+ └──────────┬───────────────────┘
95
+ │ text
96
+
97
+ ┌──────────────────────────────┐
98
+ │ Azure gpt-4o-mini │
99
+ │ summarization │
100
+ └──────────────────────────────┘
101
+
102
+ ```
103
+
104
+ For non-YouTube inputs (local upload, mic, direct MP3 URL), the flow remains internal to the HF space: download/convert → transcription → summarization.
105
+
106
+ ---
107
+
108
+ ## CI/CD — GitHub Actions
109
+
110
+ A workflow at `.github/workflows/main.yml` runs on every push to `main` (and can be triggered manually via `workflow_dispatch`).
111
+
112
+ **What it does:**
113
+ 1. Checks out the repo with a shallow clone (no LFS, no full history).
114
+ 2. Creates a clean orphan branch — only the current file state, no large-file history.
115
+ 3. Force-pushes that branch to the `main` branch of the Hugging Face Space `samir72/AudioChatTranscriber`.
116
+
117
+ **Setup requirement:** Add an `HF_TOKEN` secret in **GitHub → Settings → Secrets and variables → Actions** with a Hugging Face token that has write access to the Space.
118
+
119
+ ---
120
+
121
+ ## Docker & Azure Container Apps
122
+
123
+ ### Optimization: Microsoft Slim Base in ACR
124
+ The Docker image now uses a **Microsoft slim base image** hosted in **Azure Container Registry (ACR)** to speed up builds (less reliance on external pulls).
125
+ - ✅ **Advantage**: faster, more predictable builds in Azure / CI.
126
+ - ⚠️ **Caveat**: you must **refresh the slim base in ACR routinely** to catch upstream security patches, updates, or bug fixes.
127
+
128
+ **Best Practice Recommendation:**
129
+ Set up a scheduled job (e.g. via ACR Task or Azure DevOps pipeline) to pull the latest Microsoft slim base and update your ACR copy on a regular cadence (e.g. weekly) so your deployed containers remain current.
130
+
131
+ ### Build & Run Example
132
+ ```bash
133
+ # Build locally
134
+ docker build -t audiosummarizer:latest .
135
+
136
+ # Run container
137
+ docker run --rm -p 7860:7860 -e AC_OPENAI_ENDPOINT=... -e AC_MODEL_DEPLOYMENT=... -e AC_OPENAI_API_KEY=... -e AC_OPENAI_API_VERSION=... audiosummarizer:latest
138
+ ```
139
+
140
+ For ACA deployment:
141
+ 1. Push the Docker image to your ACR.
142
+ 2. Deploy the image via **Azure Container Apps** with necessary environment variables.
143
+ 3. The ACA will serve as the YouTube‐to‑Blob “fetcher” component, supporting the main HF app.
144
+
145
+ ---
146
+
147
+ ## Prerequisites
148
+ - Python **3.10+**
149
+ - Azure subscription with deployment of **gpt-4o-mini**
150
+ - `ffmpeg` installed and in `$PATH`
151
+ - A valid `metadata.json` containing default prompts
152
+ - For HF spaces: `packages.txt` including `ffmpeg`
153
+
154
+ ---
155
+
156
+ ## Python Dependencies
157
+ Add to `requirements.txt`:
158
+ ```
159
+ azure-identity>=1.17.1
160
+ openai>=1.0.0
161
+ gradio>=4.44.0
162
+ python-dotenv>=1.0.1
163
+ requests>=2.32.3
164
+ yt-dlp>=2024.8.6
165
+ faster-whisper>=0.10.0
166
+ beautifulsoup4>=4.12.2 # optional, for fallback scraping
167
+ ```
168
+
169
+ Install as usual:
170
+ ```bash
171
+ python -m venv .venv
172
+ source .venv/bin/activate # on Windows: .venv\Scripts\activate
173
+ pip install -r requirements.txt
174
+ ```
175
+
176
+ ---
177
+
178
+ ## Installation
179
+ ```bash
180
+ git clone https://github.com/samir72/AudioSummarizer.git
181
+ cd AudioSummarizer
182
+ ```
183
+ Install dependencies and make sure `ffmpeg` is available (or included via `packages.txt` in HF deployment).
184
+
185
+ ---
186
+
187
+ ## Configuration
188
+ Create a `.env` file at the project root:
189
+ ```env
190
+ AC_OPENAI_ENDPOINT=https://<your-azure-resource>.openai.azure.com/
191
+ AC_MODEL_DEPLOYMENT=gpt-4o-mini
192
+ AC_OPENAI_API_KEY=<your azure openai api key>
193
+ AC_OPENAI_API_VERSION=<api version e.g. 2024-10-01>
194
+
195
+ GRADIO_SERVER_NAME=127.0.0.1
196
+ GRADIO_SERVER_PORT=7860
197
+ ```
198
+
199
+ If you’re running the Azure Container App, ensure it is configured with:
200
+ - Proper role / access to write to Azure Blob Storage
201
+ - Environment variables for any keys or connection strings it needs
202
+ - Networking/firewall settings so the HF app can fetch from the blob store
203
+
204
+ ---
205
+
206
+ ## Usage
207
+ Run the app:
208
+ ```bash
209
+ python app.py
210
+ ```
211
+ Then open your browser to [http://127.0.0.1:7860](http://127.0.0.1:7860) or use your HF Space URL.
212
+
213
+ ### Input options
214
+ - Upload MP3 file
215
+ - Record via microphone
216
+ - Enter a YouTube / direct MP3 URL
217
+ - Modify system/user prompts (via `metadata.json`)
218
+ - Click **Summarize** → get structured output (Summary, Key Details, Insights)
219
+
220
+ ---
221
+
222
+ ## Contributing
223
+ We welcome your improvements—especially around cloud integration, performance, and reliability.
224
+
225
+ **Suggested contribution areas:**
226
+ - Better error handling for cookie expiry, fallback strategies
227
+ - Enhancements to the Azure Container App + Blob Storage pipeline
228
+ - Caching / sync between ACA and the HF app
229
+ - Automation of **ACR slim base refresh**
230
+
231
+
232
+ **How to contribute:**
233
+ 1. Fork the repository
234
+ 2. Create a feature branch (e.g. `git checkout -b feat/xyz`)
235
+ 3. Commit changes with meaningful messages
236
+ 4. Push and open a Pull Request
237
+
238
+ Please reference this `README.md` when describing how the YouTube → ACA → Blob → HF flow works.
239
+
240
+ ---
241
+
242
+ ## License
243
+ This project is licensed under the **MIT License** — see [LICENSE](./LICENSE) for details.
244
+
245
+ ---
246
+
247
+ ## Acknowledgments
248
+ - Built with **Gradio** for UI
249
+ - Application deployed on **Hugging Face Spaces**
250
+ - ACA deployed on **Azure**
251
+ - Application layer on ACA served by **FastAPI**
252
+ - Intelligence by **Azure gpt-4o-mini**
253
+ - YouTube audio extraction with **yt-dlp**
254
+ - Transcription enabled by **faster-whisper**
255
+
256
+ ---
257
+
258
+
259
+ ## Contact
260
+ For questions or feedback, reach out to **Sayed Amir Rizvi**
261
+ Email: syedamirhusain@gmail.com
Youtubetranscription_summarizer.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, tempfile, subprocess, json, re, time, shutil
2
+ from pathlib import Path
3
+ from typing import Optional, Callable, Any
4
+ import yt_dlp
5
+ from faster_whisper import WhisperModel
6
+ import socket
7
+
8
+
9
def main(url: str):
    """End-to-end pipeline: resolve the YouTube URL, fetch the audio as a
    16 kHz mono WAV, and return the faster-whisper transcript."""
    # ffmpeg is a hard prerequisite for both download and conversion.
    ensure_ffmpeg()
    # Reduce the URL to a bare video ID (yt-dlp accepts either form).
    video_ref = get_video_id(url)
    # Download audio and convert it to WAV.
    wav_path = download_youtube_audio_wav16k_api(video_ref)
    # Transcribe the WAV file; summarization happens downstream.
    return transcribe_faster_whisper(wav_path, model_name="base.en")
21
+
22
def nslookup(domain):
    """Check whether *domain* resolves via DNS.

    Returns:
        True when the lookup succeeds, and also True on DNS resolution
        failure (YouTube lookups are known to fail inside Hugging Face
        Spaces, so a gaierror is treated as "assume reachable").
        False only on unexpected errors.
    """
    try:
        # Perform DNS lookup for the domain; the address list itself is
        # not needed, only whether resolution succeeds.
        socket.getaddrinfo(domain, None)
        print(f"DNS lookup successful for {domain}:")
        return True
    except socket.gaierror as e:
        print(f"DNS lookup failed for {domain}: {e}")
        return True  # Assume true as youtube DNS will fail on huggingface
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False
38
+
39
def extract_domain(url):
    """Return the host portion of an http(s) URL, or None when absent.

    Matches http:// or https:// followed by the domain characters
    (letters, digits, dots, hyphens), e.g. audio-samples.github.io.
    """
    found = re.search(r'https?://([a-zA-Z0-9.-]+)', url)
    return found.group(1) if found else None
48
+
49
def get_video_id(url: str) -> str:
    """Extract the video ID from common YouTube URL formats.

    Handles watch (?v=), /shorts/, /live/ and /embed/ URLs. When no ID is
    found, falls back to a deterministic digest of the whole URL — the
    original used builtin hash(), which is randomized per process
    (PYTHONHASHSEED), making the fallback ID non-reproducible across runs.
    """
    m = re.search(r"(?:v=|/shorts/|/live/|/embed/)([A-Za-z0-9_-]{6,})", url)
    if m:
        return m.group(1)
    import hashlib  # local import: only needed on the rare fallback path
    return hashlib.sha1(url.encode("utf-8")).hexdigest()
53
+
54
def ensure_ffmpeg():
    """
    Verify that ffmpeg is available in PATH.

    Raises:
        RuntimeError: with deployment guidance if ffmpeg is missing, or if
            it is present but cannot be executed.

    Prints the ffmpeg location and the first line of its version output.
    """
    ffmpeg_path = shutil.which("ffmpeg")
    if ffmpeg_path is None:
        raise RuntimeError(
            "FFmpeg not found in PATH.\n\n"
            "👉 For Hugging Face Spaces:\n"
            " • If using Gradio/Streamlit template → add a `packages.txt` file at repo root with a line: ffmpeg\n"
            " • If using Docker template → add `apt-get install -y ffmpeg` in your Dockerfile\n\n"
            "Without ffmpeg, yt-dlp cannot extract/convert audio."
        )

    try:
        result = subprocess.run(
            ["ffmpeg", "-version"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            check=False,
        )
        print("✅ ffmpeg found at:", ffmpeg_path)
        # Guard against empty output — splitlines()[0] raised IndexError
        # when ffmpeg printed nothing.
        version_lines = result.stdout.splitlines()
        print(version_lines[0] if version_lines else "(no version output)")
    except Exception as e:
        raise RuntimeError(f"ffmpeg was found at {ffmpeg_path} but could not run: {e}")
82
+
83
+
84
+ class YTDLPError(RuntimeError):
85
+ pass
86
+
87
+ def _require(bin_name: str):
88
+ if shutil.which(bin_name) is None:
89
+ raise YTDLPError(f"Required executable '{bin_name}' not found in PATH.")
90
+
91
def download_youtube_audio_wav16k_api(
    youtube_url: str,
    out_dir: Optional[str] = None,
    target_sr: int = 16000,
    target_channels: int = 1,
    quiet: bool = True,
    keep_intermediate: bool = False,
    progress_hook: Optional[Callable[[dict[str, Any]], None]] = None,
) -> str:
    """
    Download YouTube audio via yt_dlp's Python API, extract to WAV,
    and post-process with ffmpeg to 16 kHz mono.

    Args
    ----
    youtube_url : str
    out_dir : Optional[str] Directory for outputs (temp dir if None).
    target_sr : int Sample rate for final WAV (default 16000).
    target_channels : int Channels for final WAV (default 1 = mono).
    quiet : bool Suppress yt-dlp logs if True.
    keep_intermediate : bool Keep the pre-downsampled WAV if True.
    progress_hook : callable Optional yt-dlp progress hook.

    Returns
    -------
    str: path to the final WAV on success. NOTE: on download/resample
    failure this returns a human-readable *error-message string* instead
    (the YTDLPError raises are commented out), so callers must check the
    result rather than rely on exceptions.

    Raises
    ------
    ValueError if youtube_url is empty or not a string;
    YTDLPError if ffmpeg is missing from PATH.
    """
    if not youtube_url or not isinstance(youtube_url, str):
        raise ValueError("youtube_url must be a non-empty string.")

    _require("ffmpeg")  # we call ffmpeg ourselves
    # yt-dlp bundles ffmpeg via postprocessors, but we still run ffmpeg explicitly

    work_dir = Path(out_dir or tempfile.mkdtemp(prefix="ytwav_")).resolve()
    work_dir.mkdir(parents=True, exist_ok=True)

    # First stage: let yt-dlp extract WAV (whatever SR/channels).
    # %(title).100B limits the title to 100 bytes to keep filenames safe.
    out_template = str(work_dir / "%(title).100B [%(id)s].%(ext)s")
    hooks = [progress_hook] if progress_hook else []

    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": out_template,
        "noplaylist": True,
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                "preferredquality": "0",
            }
        ],
        "quiet": quiet,
        "no_warnings": quiet,
        "progress_hooks": hooks,
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.extract_info(youtube_url, download=True)
    except Exception as e:
        #raise YTDLPError(f"yt-dlp API failed: {e}") from e
        return f"yt-dlp API failed: {e}"

    # Locate the produced WAV (pre-downsampled). The newest .wav wins in
    # case out_dir already contained WAV files.
    pre_wavs = list(work_dir.glob("*.wav"))
    if not pre_wavs:
        #raise YTDLPError("yt-dlp completed but no WAV was found.")
        return "yt-dlp completed but no WAV was found."
    pre_wav = max(pre_wavs, key=lambda p: p.stat().st_mtime)

    # Second stage: force 16 kHz mono via ffmpeg
    final_wav = pre_wav.with_name(pre_wav.stem + f".{target_sr}Hz.{target_channels}ch.wav")
    try:
        subprocess.run(
            [
                "ffmpeg", "-y",
                "-i", str(pre_wav),
                "-ac", str(target_channels),
                "-ar", str(target_sr),
                str(final_wav),
            ],
            check=True,
            stdout=subprocess.PIPE if quiet else None,
            stderr=subprocess.PIPE if quiet else None,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        #raise YTDLPError(f"ffmpeg failed to resample: {e.stderr or e.stdout}") from e
        return f"ffmpeg failed to resample: {e.stderr or e.stdout}"

    # Clean up intermediates if desired
    if not keep_intermediate:
        try:
            if pre_wav.exists() and pre_wav != final_wav:
                pre_wav.unlink()
        except Exception:
            # Best-effort cleanup; a leftover intermediate is harmless.
            pass

    return str(final_wav)
190
+
191
+
192
def transcribe_faster_whisper(wav_path: str, model_name="base.en"):
    """Transcribe *wav_path* with faster-whisper.

    Returns {"segments": [{"start", "end", "text"}, ...]} on success,
    or an error-message string on any failure.
    """
    try:
        model = WhisperModel(model_name)
        segments, info = model.transcribe(wav_path, beam_size=1, vad_filter=True)
        collected = [
            {"start": seg.start, "end": seg.end, "text": seg.text}
            for seg in segments
        ]
        # Language detection (info.language) is intentionally not returned.
        return {"segments": collected}
    except Exception as e:
        return f"Faster-Whisper transcription failed: {e}"
203
+
204
def summarize_with_phi(transcript_segments, sysprompt, userprompt, phi_client):
    """Map-reduce summarization: summarize ~10-minute chunks, then merge.

    transcript_segments: dicts with "start"/"end" (seconds) and "text".
    phi_client: any object exposing summarize(sysprompt, prompt) -> str.
    """
    CHUNK_SEC = 600  # ~10min per chunk as a starting point

    # Map phase: bucket segments into chunks of roughly CHUNK_SEC seconds.
    chunks = []
    current, elapsed = [], 0.0
    for seg in transcript_segments:
        current.append(seg)
        elapsed += seg["end"] - seg["start"]
        if elapsed >= CHUNK_SEC:
            chunks.append(current)
            current, elapsed = [], 0.0
    if current:
        chunks.append(current)

    # Summarize each chunk independently with [MM:SS]-stamped lines.
    partials = []
    for idx, chunk in enumerate(chunks, 1):
        stamped = [
            f"[{int(s['start']//60):02d}:{int(s['start']%60):02d}] {s['text']}"
            for s in chunk
        ]
        text = "\n".join(stamped)
        prompt = f"{userprompt}\n\nTRANSCRIPT CHUNK {idx}:\n{text}\n\nReturn: bullet summary + key timestamps."
        partials.append(phi_client.summarize(sysprompt, prompt))

    # Reduce phase: merge the per-chunk summaries into a single answer.
    merged_prompt = f"Merge the {len(partials)} chunk summaries into one concise summary + top 5 timestamps."
    return phi_client.summarize(sysprompt, merged_prompt + "\n\n" + "\n\n".join(partials))
222
+
223
if __name__ == "__main__":
    # Local testing entry point. The original called main(url=None), which
    # crashed inside get_video_id (re.search on None raises TypeError);
    # require a URL argument instead.
    import sys
    if len(sys.argv) > 1:
        print(main(sys.argv[1]))
    else:
        print("Usage: python Youtubetranscription_summarizer.py <youtube-url>")
__pycache__/Youtubetranscription_summarizer.cpython-313.pyc ADDED
Binary file (10.4 kB). View file
 
__pycache__/app.cpython-313.pyc ADDED
Binary file (14.1 kB). View file
 
app.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import base64
3
+ import tempfile
4
+ import requests
5
+ from datetime import datetime
6
+ import gradio as gr
7
+ from dotenv import load_dotenv
8
+ from openai import AzureOpenAI # official OpenAI SDK, works with Azure endpoints
9
+ import json
10
+ import subprocess
11
+ import Youtubetranscription_summarizer
12
+ from extract.app.Youtubeextraction import extract # Youtube download helper functions
13
+ #from pydantic import BaseModel, AnyUrl # Pydantic models for request validation in yiutube extraction
14
+ #from fastapi import FastAPI, HTTPException # FastAPI for building the API
15
+ #app = FastAPI() ## Initialize FastAPI app for testing in local
16
+ #from extractor.app.storage import upload_and_sign # Youtube storage helper functions
17
+ import re
18
+
19
+ # --- LLM call (Azure OpenAI with API key) -----------------------------------
20
+
21
def summarize_input(audio_b64: str = None, text_input: str = None, sys_prompt: str = None, user_prompt: str = None, Starttime: datetime = None) -> str:
    """
    Calls Azure OpenAI Chat Completions with audio input (base64 mp3) or text input, or both.

    Args:
        audio_b64: Base64-encoded mp3 payload, or None.
        text_input: str, list, or dict carrying transcript/extra context, or None.
        sys_prompt: System prompt override (falls back to a built-in default).
        user_prompt: User prompt override (falls back to a built-in default).
        Starttime: Call start time used for duration logging. Accepts a datetime,
            a 1-tuple containing a datetime (legacy callers pass a tuple created
            by a trailing comma), or None.

    Returns:
        The model's reply text, or an error-message string on failure.
    """
    load_dotenv()

    endpoint = os.getenv("AC_OPENAI_ENDPOINT")
    api_key = os.getenv("AC_OPENAI_API_KEY")
    deployment = os.getenv("AC_MODEL_DEPLOYMENT")
    api_version = os.getenv("AC_OPENAI_API_VERSION")

    if not endpoint or not api_key or not deployment:
        return "Server misconfiguration: required env vars missing."

    # Bug fix: normalize Starttime. Previously `Starttime[0]` assumed a tuple,
    # crashing for a plain datetime or the documented default of None.
    if isinstance(Starttime, tuple):
        Starttime = Starttime[0] if Starttime else None
    if Starttime is None:
        Starttime = datetime.now()

    # json_text is only populated when text_input is a list/dict; used for logging.
    json_text = ""
    try:
        client = AzureOpenAI(
            api_key=api_key,
            api_version=api_version,
            azure_endpoint=endpoint,
        )

        system_message = sys_prompt.strip() if sys_prompt else (
            "You are an AI assistant with a charter to clearly analyze the customer enquiry."
        )
        user_text = user_prompt.strip() if user_prompt else (
            "Summarize the provided content." if audio_b64 or text_input else "No input provided."
        )

        content = [{"type": "text", "text": user_text}]

        if audio_b64:
            content.append({
                "type": "input_audio",
                "input_audio": {"data": audio_b64, "format": "mp3"},
            })
        if text_input is not None:
            if isinstance(text_input, str):
                try:
                    # A string may itself be serialized JSON; re-serialize
                    # lists/dicts so the model sees canonical JSON text.
                    parsed = json.loads(text_input)
                    if isinstance(parsed, (list, dict)):
                        content.append({"type": "text", "text": json.dumps(parsed)})
                    else:
                        content.append({"type": "text", "text": text_input})
                except json.JSONDecodeError:
                    # Not JSON: pass the raw string through unchanged.
                    content.append({"type": "text", "text": text_input})
            elif isinstance(text_input, (list, dict)):
                try:
                    json_text = json.dumps(text_input)
                    content.append({"type": "text", "text": json_text})
                except (TypeError, ValueError):
                    return "Error: text_input (list or dict) could not be converted to JSON."
            else:
                return f"Error: text_input must be a string, list, or dict, got {type(text_input)}."

        response = client.chat.completions.create(
            model=deployment,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": content},
            ],
        )
        Enddate = datetime.now()
        Callduration = Enddate - Starttime
        print(f"AudioChatSummarizer API call with a duration of {Callduration}: prompt_length={len(user_prompt or '')}, "
              f"audio_size={len(audio_b64 or '')}, text_input_size={len(json_text or '')}")
        return response.choices[0].message.content

    except Exception as ex:
        # Bug fix: previously `return print(...)` returned None to the UI;
        # now the error text itself is returned (and still logged).
        msg = f"Error from Azure OpenAI: {ex}"
        print(msg)
        return msg
+ return print(f"Error from Azure OpenAI: {ex}")
99
+
100
+ #----Retrieve meta data from metadata.json file------------------------------
101
def retrieve_file_path(file_name):
    """Resolve *file_name* relative to this module's directory.

    Returns the absolute path if it exists and is a regular file; otherwise
    prints a diagnostic and returns None.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(base_dir, file_name)
    if os.path.isfile(file_path):
        return file_path
    if not os.path.exists(file_path):
        print(f"'{file_path}' does not exist.")
    else:
        # Bug fix: this case (path exists but is not a regular file, e.g. a
        # directory) previously fell through and returned None silently.
        print(f"'{file_path}' exists but is not a regular file.")
    return None
110
+
111
def retrieve_json_record(file_path, record_id):
    """Return the first record whose metadata.id equals *record_id*, else None.

    The JSON file may contain either a single record object or a list of them.
    """
    with open(file_path, 'r') as fh:
        payload = json.load(fh)
    # Normalize both accepted file shapes to a list so one loop handles them.
    if isinstance(payload, list):
        candidates = payload
    elif isinstance(payload, dict):
        candidates = [payload]
    else:
        candidates = []
    for record in candidates:
        if record.get('metadata', {}).get('id') == record_id:
            return record
    return None
122
+ # --- I/O helpers ------------------------------------------------------------
123
+
124
def encode_audio_from_path(path: str) -> str:
    """Read the file at *path* and return its contents base64-encoded as str."""
    with open(path, "rb") as audio_file:
        payload = audio_file.read()
    return base64.b64encode(payload).decode("utf-8")
127
+
128
+
129
def download_to_temp_mp3(url: str) -> str:
    """Stream *url* into a temporary .mp3 file and return the file's path.

    The caller is responsible for deleting the file (delete=False).
    """
    # Bug fix: with stream=True the response must be closed explicitly or the
    # connection stays checked out; the context manager releases it even if
    # iteration raises.
    with requests.get(url, stream=True, timeout=30) as r:
        r.raise_for_status()
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    tmp.write(chunk)
            return tmp.name
137
+
138
+ # function to read files
139
# function to read files
def file_read(filepath):
    """Read *filepath* as bytes.

    Returns the file contents, or b"" if the file cannot be read
    (the failure is logged, not raised).
    """
    file_data = b""  # bytes fallback, consistent with the success type
    try:
        with open(filepath, "rb") as f:
            file_data = f.read()
        # Bug fix: both log lines referenced the undefined name `file_path`
        # (a NameError outside app.py's module globals); use the parameter.
        print(f"Successfully validated {filepath} and read {len(file_data)} bytes.")
    except Exception as e:
        print(f"Could not read {filepath}: {e}")
    return file_data
151
+
152
+ ###Download youtube video and extract audio using yt-dlp and ffmpeg
153
+ #### Fixing code to resolve 404 error
154
+
155
def fetch_audio_from_youtube(youtube_url: str) -> str:
    """
    Calls the extractor service and returns the signed audio URL.
    - Tries POST /extract with youtube_url as a query param (current server shape).
    - Falls back to sending youtube_url in the JSON body if needed.
    - Accepts either JSON {"audio_url": "..."} or a plain string URL.

    Returns an error-message string on failure (callers handle str errors downstream).
    """
    # FastAPI endpoint for youtube extraction: "https://<your-app-fqdn>/extract"
    EXTRACT_API = os.getenv("AZURE_CONTAINER_APP_FQDN")
    print(f"Extract_API value: {EXTRACT_API}")
    # Bug fix: a missing env var previously crashed with AttributeError on
    # None.rstrip(); return a readable configuration error instead.
    if not EXTRACT_API:
        return "Configuration error: AZURE_CONTAINER_APP_FQDN is not set."
    base = EXTRACT_API.rstrip("/")
    endpoint = base if base.endswith("/extract") else f"{base}/extract"

    payload = {"format": "wav", "sample_rate": 16000, "mono": True}
    timeout = 90

    try:
        # 1) Preferred: youtube_url as QUERY PARAM (matches the current API).
        r = requests.post(endpoint, params={"youtube_url": youtube_url},
                          json=payload, timeout=timeout)
        if r.status_code in (404, 422):
            # 2) Fallback: youtube_url in JSON body (if the API switches later).
            body = {"youtube_url": youtube_url, **payload}
            r = requests.post(endpoint, json=body, timeout=timeout)

        if r.status_code >= 400:
            # Log details before raising so failures are diagnosable.
            print("STATUS:", r.status_code)
            print("HEADERS:", r.headers)
            print("BODY:", r.text[:2000])
            r.raise_for_status()

        # Response parsing: support dict or plain string.
        ctype = r.headers.get("Content-Type", "")
        if "application/json" in ctype:
            data = r.json()
            # If the server validates response_model to dict
            if isinstance(data, dict) and "audio_url" in data:
                return data["audio_url"]
            # If the server returns a plain string in JSON (rare)
            if isinstance(data, str):
                return data
            raise ValueError(f"Unexpected JSON shape: {data}")
        else:
            # Plain text URL response_model=str
            text = r.text.strip()
            if text.startswith("http"):
                return text
            raise ValueError(f"Unexpected text response: {text[:200]}")

    except Exception as e:
        msg = (f"{datetime.now()}: Error retrieving youtube wave file from Azure instance. "
               f"url={youtube_url} endpoint={endpoint} err={e}")
        print(msg)
        return msg
209
+
210
+
211
def process_audio(upload_path, record_path, url, sys_prompt, user_prompt):
    """Gradio handler: pick the input source, transcribe if needed, summarize.

    Priority: uploaded file > recording > URL. YouTube URLs go through the
    extractor service + faster-whisper; other URLs are downloaded as mp3 and
    transcribed locally. Returns the summary text or an error-message string.
    """
    tmp_to_cleanup = []
    text_input = None
    domaincheck = None
    audio_wav = None
    # Bug fix: audio_path is now bound before the try so the except handler
    # can always reference it when logging.
    audio_path = None

    try:
        # Capture start time for logging.
        # NOTE: summarize_input historically receives a 1-tuple here and
        # unpacks it with Starttime[0]; keep passing a tuple for compatibility.
        Starttime = (datetime.now(),)
        print(f"AudioChatSummarizer API call starts at {datetime.now()}")
        if upload_path:
            audio_path = upload_path
        elif record_path:
            audio_path = record_path
        elif url and url.strip():
            # Check DNS resolution of the URL's domain before any download.
            domain = Youtubetranscription_summarizer.extract_domain(url)
            if domain:
                domaincheck = Youtubetranscription_summarizer.nslookup(domain)
            else:
                return "Invalid URL format."

            if domaincheck:
                # Route YouTube links through the extractor service.
                if re.search(r"Youtube", url, re.IGNORECASE):
                    audio_wav = fetch_audio_from_youtube(url.strip())  # server API call
                    text_input = Youtubetranscription_summarizer.transcribe_faster_whisper(audio_wav, model_name="base.en")
                    # Bug fix: text_input is a transcription dict, not a temp
                    # file path, so it is no longer added to tmp_to_cleanup.
                else:
                    audio_path = download_to_temp_mp3(url.strip())
                    tmp_to_cleanup.append(audio_path)
            else:
                return f"DNS lookup failed for {domain}"
        if not audio_path and text_input is None:
            return "Please provide content via upload, recording, or URL."
        # Transcribe audio to text via faster-whisper before sending to gpt-4o-mini
        # (gpt-4o-mini only accepts text/image_url content blocks, not audio).
        if audio_path:
            text_input = Youtubetranscription_summarizer.transcribe_faster_whisper(audio_path, model_name="base.en")
        return summarize_input(None, text_input, sys_prompt, user_prompt, Starttime)

    except Exception as e:
        # Bug fix: previously `return print(...)` returned None to the UI;
        # now the error text itself is returned (and still logged).
        msg = (f"Error processing audio at {datetime.now()}: "
               f"prompt_length={len(user_prompt or '')}, audio_path={audio_path}: {e}")
        print(msg)
        return msg

    finally:
        for p in tmp_to_cleanup:
            try:
                if os.path.exists(p):
                    os.remove(p)
            except Exception:
                pass
275
+
276
+
277
+ # --- UI ---------------------------------------------------------------------
278
+
279
# Build the Gradio UI at import time; `demo` is the app object Spaces launches.
with gr.Blocks(title="Audio Summarizer") as demo:
    gr.Markdown("# Audio File Summarizer (Azure OpenAI)")
    gr.Markdown("Upload an mp3(**YouTube is the new feature add**), record audio, or paste a URL, use the default user prompt and system prompt and click 'Summarize'.")
    gr.Markdown("Users are encouraged to modify the user and system prompts to suit their needs.")
    gr.Markdown("**Responsible Use**: This project is for educational and research purposes only. It does not intend to violate copyright, YouTube’s Terms of Service, or data rights. Users are responsible for ensuring compliance with applicable laws and platform policies when processing audio or video content. AudioSummarizer is designed as a learning tool to explore AI summarization workflows, not as a commercial service.")
    # Three alternative input sources; process_audio() prefers them in this order.
    with gr.Row():
        with gr.Column():
            upload_audio = gr.Audio(sources=["upload"], type="filepath", label="Upload mp3")
        with gr.Column():
            record_audio = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
        with gr.Column():
            url_input = gr.Textbox(label="YouTube or standard mp3 URL", placeholder="https://example.com/audio.mp3")

    ### Get system and user prompts from metadata.json file
    file_name = 'metadata.json'
    record_id = '1'
    file_path = retrieve_file_path(file_name)

    # NOTE(review): if metadata.json is missing, file_path is None and
    # retrieve_json_record will raise on open(None); the subscripts below also
    # raise if jsonrecord is None — confirm metadata.json always ships with the app.
    jsonrecord = retrieve_json_record(file_path, record_id)
    if jsonrecord:
        print(json.dumps(jsonrecord, indent=2))
    else:
        print("Record not found.")

    sysprompt_default = jsonrecord['metadata']['content']['system_prompt']['content']
    userprompt_default = jsonrecord['metadata']['content']['user_prompt']['content']

    # Editable prompt boxes, pre-filled from metadata.json.
    with gr.Row():
        userprompt_input = gr.Textbox(
            label="User Prompt",
            value=userprompt_default,
            placeholder="e.g., Extract key points and action items",
        )
        sysprompt_input = gr.Textbox(
            label="System Prompt",
            value=sysprompt_default,
        )

    submit_btn = gr.Button("Summarize")
    output = gr.Textbox(label="Summary", lines=12)

    # Log input changes server-side (the `if component:` guards are always
    # truthy for constructed components; kept for parity with the original).
    if upload_audio:
        upload_audio.change(
            fn=lambda x: print(f"Upload audio selected: {x}"),
            inputs=[upload_audio],
            outputs=[],
        )
    if record_audio:
        record_audio.change(
            fn=lambda x: print(f"Record audio selected: {x}"),
            inputs=[record_audio],
            outputs=[],
        )
    if url_input:
        url_input.change(
            fn=lambda x: print(f"URL input changed: {x}"),
            inputs=[url_input],
            outputs=[],
        )
    # Main action: route all inputs through process_audio and show the summary.
    submit_btn.click(
        fn=process_audio,
        inputs=[upload_audio, record_audio, url_input, sysprompt_input, userprompt_input],
        outputs=output,
    )
349
+
350
+
351
if __name__ == "__main__":
    # Start the Gradio server when run directly as a script.
    demo.launch()
app_v1.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import base64
3
+ import tempfile
4
+ import requests
5
+ from datetime import datetime
6
+ import gradio as gr
7
+ from dotenv import load_dotenv
8
+ from openai import AzureOpenAI # official OpenAI SDK, works with Azure endpoints
9
+ import json
10
+ import subprocess # to execute youtube-dl version
11
+ import Youtubetranscription_summarizer
12
+
13
+ # --- LLM call (Azure OpenAI with API key) -----------------------------------
14
+
15
def summarize_audio_b64(audio_b64: str, sys_prompt: str, user_prompt: str) -> str:
    """
    Calls Azure OpenAI Chat Completions with audio input (base64, sent as WAV).

    Returns the model's reply text, or an error-message string on failure.
    """
    load_dotenv()

    endpoint = os.getenv("AC_OPENAI_ENDPOINT")
    api_key = os.getenv("AC_OPENAI_API_KEY")
    deployment = os.getenv("AC_MODEL_DEPLOYMENT")
    api_version = os.getenv("AC_OPENAI_API_VERSION")

    if not endpoint or not api_key or not deployment:
        return "Server misconfiguration: required env vars missing."

    try:
        client = AzureOpenAI(
            api_key=api_key,
            api_version=api_version,
            azure_endpoint=endpoint,
        )

        system_message = sys_prompt.strip() if sys_prompt else (
            "You are an AI assistant with a charter to clearly analyze the customer enquiry."
        )
        user_text = user_prompt.strip() if user_prompt else "Summarize the audio content."

        response = client.chat.completions.create(
            model=deployment,
            messages=[
                {"role": "system", "content": system_message},
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": user_text},
                        {
                            "type": "input_audio",
                            # The pipeline produces WAV (yt-dlp/ffmpeg), hence format "wav".
                            "input_audio": {"data": audio_b64, "format": "wav"},
                        },
                    ],
                },
            ],
        )
        # Guard len() against None prompts (consistent with app.py's logging).
        print(f"Azure API call at {datetime.now()}: prompt_length={len(user_prompt or '')}, audio_size={len(audio_b64 or '')}")
        return response.choices[0].message.content

    except Exception as ex:
        # Bug fix: previously `return print(...)` returned None to the UI;
        # now the error text itself is returned (and still logged).
        msg = f"Error from Azure OpenAI: {ex}"
        print(msg)
        return msg
65
+
66
+ #----Retrieve meta data from metadata.json file------------------------------
67
def retrieve_file_path(file_name):
    """Resolve *file_name* relative to this module's directory.

    Returns the absolute path if it exists and is a regular file; otherwise
    prints a diagnostic and returns None.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(base_dir, file_name)
    if os.path.isfile(file_path):
        return file_path
    if not os.path.exists(file_path):
        print(f"'{file_path}' does not exist.")
    else:
        # Bug fix: this case (path exists but is not a regular file, e.g. a
        # directory) previously fell through and returned None silently.
        print(f"'{file_path}' exists but is not a regular file.")
    return None
76
+
77
def retrieve_json_record(file_path, record_id):
    """Return the first record whose metadata.id equals *record_id*, else None.

    The JSON file may contain either a single record object or a list of them.
    """
    with open(file_path, 'r') as fh:
        payload = json.load(fh)
    # Normalize both accepted file shapes to a list so one loop handles them.
    if isinstance(payload, list):
        candidates = payload
    elif isinstance(payload, dict):
        candidates = [payload]
    else:
        candidates = []
    for record in candidates:
        if record.get('metadata', {}).get('id') == record_id:
            return record
    return None
88
+ # --- I/O helpers ------------------------------------------------------------
89
+
90
def encode_audio_from_path(path: str) -> str:
    """Read the file at *path* and return its contents base64-encoded as str."""
    with open(path, "rb") as audio_file:
        payload = audio_file.read()
    return base64.b64encode(payload).decode("utf-8")
93
+
94
+
95
def download_to_temp_mp3(url: str) -> str:
    """Stream *url* into a temporary .mp3 file and return the file's path.

    The caller is responsible for deleting the file (delete=False).
    """
    # Bug fix: with stream=True the response must be closed explicitly or the
    # connection stays checked out; the context manager releases it even if
    # iteration raises.
    with requests.get(url, stream=True, timeout=30) as r:
        r.raise_for_status()
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    tmp.write(chunk)
            return tmp.name
103
+
104
+
105
def process_audio(upload_path, record_path, url, sys_prompt, user_prompt):
    """Gradio handler: resolve the audio source, base64-encode it, summarize.

    Priority: uploaded file > recording > URL. Returns the summary text or an
    error-message string.
    """
    tmp_to_cleanup = []
    # Bug fix: audio_path is now bound before the try so the except handler
    # can always reference it when logging.
    audio_path = None
    try:
        if upload_path:
            audio_path = upload_path
        elif record_path:
            audio_path = record_path
        elif url and url.strip():
            # URL input is delegated to the YouTube transcription helper.
            audio_path = Youtubetranscription_summarizer.main(url.strip())
            tmp_to_cleanup.append(audio_path)

        if not audio_path:
            return "Please provide an audio file via upload, recording, or URL."

        audio_b64 = encode_audio_from_path(audio_path)
        return summarize_audio_b64(audio_b64, sys_prompt, user_prompt)

    except Exception as e:
        # Bug fix: previously `return print(...)` returned None to the UI;
        # now the error text itself is returned (and still logged).
        msg = (f"Error processing audio at {datetime.now()}: "
               f"prompt_length={len(user_prompt or '')}, audio_path={audio_path}: {e}")
        print(msg)
        return msg

    finally:
        for p in tmp_to_cleanup:
            try:
                if os.path.exists(p):
                    os.remove(p)
            except Exception:
                pass
135
+
136
+
137
+ # --- UI ---------------------------------------------------------------------
138
+
139
# Build the Gradio UI at import time; `demo` is the app object Spaces launches.
with gr.Blocks(title="Audio Summarizer") as demo:
    gr.Markdown("# Audio File Summarizer (Azure OpenAI)")
    gr.Markdown("Upload a mp3, record audio, or paste a URL. The app sends base64 audio to Azure OpenAI.")

    # Three alternative input sources; process_audio() prefers them in this order.
    with gr.Row():
        with gr.Column():
            upload_audio = gr.Audio(sources=["upload"], type="filepath", label="Upload mp3")
        with gr.Column():
            record_audio = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
        with gr.Column():
            url_input = gr.Textbox(label="mp3 URL", placeholder="https://example.com/audio.mp3")

    ### Get system and user prompts from metadata.json file
    file_name = 'metadata.json'
    record_id = '1'
    file_path = retrieve_file_path(file_name)

    # NOTE(review): if metadata.json is missing, file_path is None and
    # retrieve_json_record will raise on open(None); the subscripts below also
    # raise if jsonrecord is None — confirm metadata.json always ships with the app.
    jsonrecord = retrieve_json_record(file_path, record_id)
    if jsonrecord:
        print(json.dumps(jsonrecord, indent=2))
    else:
        print("Record not found.")

    sysprompt_default = jsonrecord['metadata']['content']['system_prompt']['content']
    userprompt_default = jsonrecord['metadata']['content']['user_prompt']['content']

    # Editable prompt boxes, pre-filled from metadata.json.
    with gr.Row():
        userprompt_input = gr.Textbox(
            label="User Prompt",
            value=userprompt_default,
            placeholder="e.g., Extract key points and action items",
        )
        sysprompt_input = gr.Textbox(
            label="System Prompt",
            value=sysprompt_default,
        )

    submit_btn = gr.Button("Summarize")
    output = gr.Textbox(label="Summary", lines=12)

    # Log input changes server-side (the `if component:` guards are always
    # truthy for constructed components; kept for parity with the original).
    if upload_audio:
        upload_audio.change(
            fn=lambda x: print(f"Upload audio selected: {x}"),
            inputs=[upload_audio],
            outputs=[],
        )
    if record_audio:
        record_audio.change(
            fn=lambda x: print(f"Record audio selected: {x}"),
            inputs=[record_audio],
            outputs=[],
        )
    if url_input:
        url_input.change(
            fn=lambda x: print(f"URL input changed: {x}"),
            inputs=[url_input],
            outputs=[],
        )
    # Main action: route all inputs through process_audio and show the summary.
    submit_btn.click(
        fn=process_audio,
        inputs=[upload_audio, record_audio, url_input, sysprompt_input, userprompt_input],
        outputs=output,
    )

if __name__ == "__main__":
    # Start the Gradio server when run directly as a script.
    demo.launch()
extract/.DS_Store ADDED
Binary file (6.15 kB). View file
 
extract/Dockerfile ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ---------- Base ----------
2
+ #FROM python:3.11-slim
3
+ FROM cab337fa40e5acr.azurecr.io/python:3.11-slim
4
+
5
+ # ---------- System deps ----------
6
+ RUN apt-get update && apt-get install -y --no-install-recommends \
7
+ ffmpeg ca-certificates curl \
8
+ && rm -rf /var/lib/apt/lists/*
9
+
10
+ # ---------- Workdir ----------
11
+ WORKDIR /workspace
12
+
13
+ # ---------- Python deps ----------
14
+ # requirements.txt is at AUDIOSUMMARIZER/extract/requirements.txt
15
+
16
+ COPY app/requirements.txt .
17
+ RUN pip install --no-cache-dir --upgrade pip \
18
+ && pip install --no-cache-dir -r requirements.txt
19
+
20
+ # ---------- App code ----------
21
+ # Copy EVERYTHING under AUDIOSUMMARIZER/extract (includes subfolders: app/ and utils/)
22
+ COPY . /workspace/extract
23
+
24
+ # Make /workspace importable so "extract.app.Youtubeextraction" & "app.utils..." work
25
+ ENV PYTHONPATH=/workspace
26
+
27
+ # Runtime env (override at deploy)
28
+ ENV HOST=0.0.0.0
29
+ ENV PORT=8080
30
+ ENV AZURE_STORAGE_ACCOUNT=__SET_AT_DEPLOY__
31
+ ENV AZURE_BLOB_CONTAINER=__SET_AT_DEPLOY__
32
+ ENV COOKIES_ACCOUNT=__SET_AT_DEPLOY__
33
+ ENV COOKIES_CONTAINER=__SET_AT_DEPLOY__
34
+ ENV COOKIES_BLOB=__SET_AT_DEPLOY__
35
+ ENV COOKIES_PATH=__SET_AT_DEPLOY__
36
+ ENV COOKIES_REFRESH_SEC=__SET_AT_DEPLOY__
37
+
38
+ EXPOSE 8080
39
+
40
+ # Your ASGI app is defined in extract/app/Youtubeextraction.py as `app = FastAPI()`
41
+ CMD ["uvicorn", "extract.app.Youtubeextraction:app", "--host", "0.0.0.0", "--port", "8080"]
extract/__init__.py ADDED
File without changes
extract/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (153 Bytes). View file
 
extract/app/.DS_Store ADDED
Binary file (6.15 kB). View file
 
extract/app/Youtubeextraction.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, tempfile, subprocess, re, json, shutil, time
2
+ from fastapi import FastAPI, HTTPException
3
+ from pathlib import Path
4
+ from typing import Optional, Callable, Any
5
+ import yt_dlp
6
+ # from utils.storage import upload_and_sign # To remove circular import issue
7
+ from extract.utils.storage import upload_and_sign # To remove circular import issue
8
+ from extract.utils.retrieve_filepath import retrieve_file_path # To get the file path of cookies.txt
9
+ from extract.utils.cookies_refresher import start_cookies_refresher # To refresh cookies.txt periodically
10
+
11
+ app = FastAPI()
12
+
13
def ensure_ffmpeg():
    """
    Verify that ffmpeg is available in PATH.
    Raises RuntimeError with helpful guidance if missing.
    Prints ffmpeg version to logs if found.
    """
    ffmpeg_path = shutil.which("ffmpeg")
    if ffmpeg_path is None:
        raise RuntimeError(
            "FFmpeg not found in PATH.\n\n"
            "👉 For Hugging Face Spaces:\n"
            "  • If using Gradio/Streamlit template → add a `packages.txt` file at repo root with a line: ffmpeg\n"
            "  • If using Docker template → add `apt-get install -y ffmpeg` in your Dockerfile\n\n"
            "Without ffmpeg, yt-dlp cannot extract/convert audio."
        )

    try:
        result = subprocess.run(
            ["ffmpeg", "-version"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # merge stderr into stdout for one stream
            text=True,
            check=False,  # version probe is best-effort; no exception on nonzero exit
        )
        print("✅ ffmpeg found at:", ffmpeg_path)
        print(result.stdout.splitlines()[0])  # show first line of version info
    except Exception as e:
        # The binary exists on PATH but could not be executed (permissions, arch, ...).
        raise RuntimeError(f"ffmpeg was found at {ffmpeg_path} but could not run: {e}")
41
+
42
class YTDLPError(RuntimeError):
    """Raised when yt-dlp or its required tooling fails."""
44
+
45
def _require(bin_name: str):
    """Raise YTDLPError unless *bin_name* is an executable found on PATH."""
    resolved = shutil.which(bin_name)
    if resolved is None:
        raise YTDLPError(f"Required executable '{bin_name}' not found in PATH.")
48
+
49
+
50
@app.get("/health")
def health():
    """Liveness endpoint: returns {"ok": True} unconditionally."""
    return {"ok": True}
53
+
54
@app.post("/extract")
def extract(
    youtube_url: str,
    out_dir: Optional[str] = None,
    target_sr: int = 16000,
    target_channels: int = 1,
    quiet: bool = True,
    keep_intermediate: bool = False,
    progress_hook: Optional[Callable[[dict[str, Any]], None]] = None,
) -> str:
    """
    Download YouTube audio via yt_dlp's Python API, extract to WAV,
    post-process with ffmpeg to 16 kHz mono, then upload the result and
    return a short-lived signed URL to it (see upload_and_sign below).

    Args
    ----
    youtube_url : str
    out_dir : Optional[str]   Directory for outputs (temp dir if None).
    target_sr : int           Sample rate for final WAV (default 16000).
    target_channels : int     Channels for final WAV (default 1 = mono).
    quiet : bool              Suppress yt-dlp logs if True.
    keep_intermediate : bool  Keep the pre-downsampled WAV if True.
    progress_hook : callable  Optional yt-dlp progress hook.

    Raises
    ------
    ValueError for a missing/invalid youtube_url; YTDLPError if ffmpeg is absent.

    NOTE(review): on downstream failures this handler returns plain error
    message strings while success returns a URL — callers cannot reliably
    distinguish the two; consider raising HTTPException instead.
    """
    if not youtube_url or not isinstance(youtube_url, str):
        raise ValueError("youtube_url must be a non-empty string.")

    _require("ffmpeg")  # we call ffmpeg ourselves
    # yt-dlp bundles ffmpeg via postprocessors, but we still run ffmpeg explicitly

    work_dir = Path(out_dir or tempfile.mkdtemp(prefix="ytwav_")).resolve()
    work_dir.mkdir(parents=True, exist_ok=True)

    # First stage: let yt-dlp extract WAV (whatever SR/channels)
    out_template = str(work_dir / "%(title).100B [%(id)s].%(ext)s")
    hooks = [progress_hook] if progress_hook else []
    # Kick off the background cookies refresher, then read the local cookie path
    # it maintains (required for authenticated YouTube downloads).
    start_cookies_refresher()
    cookies_path = os.getenv("COOKIES_PATH")
    print(f"cookies_path value: {cookies_path}")
    if not cookies_path:
        cookies_path = None  # NOTE(review): dead assignment — the next line returns
        print("Cookie file NOT found in container!")
        return f"User authentication cookie file NOT found in container! Please try again later."

    ydl_opts = {
        "cookiefile": cookies_path,
        "format": "bestaudio/best",
        "outtmpl": out_template,
        "noplaylist": True,
        "postprocessors": [
            {
                # yt-dlp runs ffmpeg to decode the downloaded stream to WAV.
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                "preferredquality": "0",
            }
        ],
        "quiet": quiet,
        "verbose": not quiet,
        "no_warnings": quiet,
        "progress_hooks": hooks,
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.extract_info(youtube_url, download=True)
    except Exception as e:
        # Errors are surfaced as message strings (see NOTE in the docstring).
        return f"yt-dlp API failed: {e}"

    # Locate the produced WAV (pre-downsampled); pick the newest if several exist.
    pre_wavs = list(work_dir.glob("*.wav"))
    if not pre_wavs:
        return "yt-dlp completed but no WAV was found."
    pre_wav = max(pre_wavs, key=lambda p: p.stat().st_mtime)

    # Second stage: force 16 kHz mono via ffmpeg
    final_wav = pre_wav.with_name(pre_wav.stem + f".{target_sr}Hz.{target_channels}ch.wav")
    try:
        subprocess.run(
            [
                "ffmpeg", "-y",
                "-i", str(pre_wav),
                "-ac", str(target_channels),
                "-ar", str(target_sr),
                str(final_wav),
            ],
            check=True,
            stdout=subprocess.PIPE if quiet else None,
            stderr=subprocess.PIPE if quiet else None,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        return f"ffmpeg failed to resample: {e.stderr or e.stdout}"

    # 3) upload + sign (short-lived URL, 45 minutes)
    signed = upload_and_sign(final_wav, ttl_minutes=45)

    # Clean up intermediates if desired
    if not keep_intermediate:
        try:
            if pre_wav.exists() and pre_wav != final_wav:
                pre_wav.unlink()
        except Exception:
            pass

    return signed
extract/app/__init__.py ADDED
File without changes
extract/app/__pycache__/Youtubeextraction.cpython-313.pyc ADDED
Binary file (7.25 kB). View file
 
extract/app/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (157 Bytes). View file
 
extract/app/requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ dotenv==0.9.9
2
+ requests==2.32.5
3
+ azure-identity==1.25.0
4
+ yt_dlp==2025.9.23
5
+ fastapi
6
+ uvicorn[standard]==0.30.6
7
+ azure-storage-blob==12.20.0
extract/utils/__init__.py ADDED
File without changes
extract/utils/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (149 Bytes). View file
 
extract/utils/__pycache__/cookies_refresher.cpython-313.pyc ADDED
Binary file (4.27 kB). View file
 
extract/utils/__pycache__/retrieve_filepath.cpython-313.pyc ADDED
Binary file (890 Bytes). View file
 
extract/utils/__pycache__/storage.cpython-313.pyc ADDED
Binary file (2.76 kB). View file
 
extract/utils/cookies_refresher.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, time, hashlib, tempfile, threading
2
+ from azure.identity import DefaultAzureCredential
3
+ from azure.storage.blob import BlobClient
4
+ from dotenv import load_dotenv
5
+
6
# Load .env so local runs pick up the same settings as the deployed container.
load_dotenv()
# NOTE(review): the default "yt-extractor-rg" looks like a resource-group name,
# not a storage-account name — verify the intended default.
ACCOUNT = os.getenv("AZURE_STORAGE_ACCOUNT","yt-extractor-rg") # storage account name
CONTAINER= os.getenv("COOKIES_CONTAINER","cookies") # container name
BLOB = os.getenv("COOKIES_BLOB","cookies.txt") # blob name
OUT_PATH = os.getenv("COOKIES_PATH","/tmp/cookies.txt") # local path to write cookies
REFRESH = int(os.getenv("COOKIES_REFRESH_SEC", "600")) # Default to 10 minutes
12
+
13
def _sha256(b: bytes) -> str:
    """Return the hexadecimal SHA-256 digest of *b*."""
    return hashlib.sha256(b).hexdigest()
14
def _read(path: str) -> bytes:
    """Return the file's contents, or b"" when the file cannot be read.

    Only filesystem errors (missing file, permissions, ...) are swallowed;
    the original bare ``except:`` also hid KeyboardInterrupt/SystemExit.
    """
    try:
        with open(path, "rb") as f:
            return f.read()
    except OSError:
        return b""
18
+
19
def _atomic_write(path: str, data: bytes):
    """Atomically replace *path* with *data*.

    Writes to a temporary file in the same directory, then ``os.replace``s it
    over the target so readers never observe a partially written file.
    The chmod to 0600 is best-effort (the cookies file holds credentials);
    the original bare ``except:`` is narrowed to OSError.
    """
    d = os.path.dirname(path) or "."
    os.makedirs(d, exist_ok=True)
    fd, tmp = tempfile.mkstemp(prefix=".cookies.", dir=d)
    with os.fdopen(fd, "wb") as f:
        f.write(data)
    os.replace(tmp, path)
    try:
        os.chmod(path, 0o600)
    except OSError:
        pass
27
+
28
def refresh_once():
    """Download the cookies blob once and persist it locally if it changed."""
    if not ACCOUNT:
        print("[cookies] ACCOUNT not set")
        return
    # Managed identity inside Azure Container Apps; developer creds locally.
    blob_client = BlobClient(
        account_url=f"https://{ACCOUNT}.blob.core.windows.net",
        container_name=CONTAINER,
        blob_name=BLOB,
        credential=DefaultAzureCredential(),
    )
    payload = blob_client.download_blob(max_concurrency=1).readall()
    if not payload.strip():
        print("[cookies] WARN: blob is empty; skipping")
        return
    # Rewrite the local copy only when the content actually changed.
    if _sha256(payload) != _sha256(_read(OUT_PATH)):
        _atomic_write(OUT_PATH, payload)
        print(f"[cookies] updated -> {OUT_PATH} (bytes={len(payload)})")
45
+
46
def start_cookies_refresher():
    """Fetch cookies immediately, then keep refreshing on a daemon thread."""
    # Initial fetch before serving traffic; failures are logged, not fatal.
    try:
        refresh_once()
    except Exception as e:
        print(f"[cookies] initial refresh error: {e}")

    def _loop():
        # Periodic refresh; any error is logged and the loop keeps running.
        while True:
            time.sleep(REFRESH)
            try:
                refresh_once()
            except Exception as e:
                print(f"[cookies] refresh error: {e}")

    threading.Thread(target=_loop, daemon=True).start()
extract/utils/probeytdlp.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import yt_dlp, traceback, sys, os
3
+ from http.cookiejar import MozillaCookieJar
4
+
5
class YDLLogger:
    """Minimal logger handed to yt-dlp; forwards tagged messages to stdout."""

    def debug(self, msg):
        print("[DEBUG]", msg)

    def warning(self, msg):
        print("[WARN]", msg)

    def error(self, msg):
        print("[ERROR]", msg)
9
+
10
def probe(url, cookies=None):
    """Probe *url* with yt-dlp (no download) and print format diagnostics.

    Args:
        url: Video page URL to inspect.
        cookies: Optional path to a Netscape-format cookies file.

    Returns:
        The yt-dlp info dict on success; implicitly None if extraction raised.
    """
    ydl_opts = {
        "format": "bestaudio/best",
        "cachedir": False,
        "logger": YDLLogger(),
        "no_warnings": False,
        "quiet": False,
        # don't try postprocessing during probe
        "postprocessors": [],
        # helpful to mimic a browser if site is picky:
        "http_headers": {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36"},
    }
    if cookies:
        ydl_opts["cookiefile"] = cookies

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            print("Probing (no download)...")
            info = ydl.extract_info(url, download=False)
            print("Top-level keys in info:", list(info.keys()))
            formats = info.get("formats")
            if formats:
                print("Found formats (count):", len(formats))
                # Show at most the first 10 formats to keep output readable.
                for f in formats[:10]:
                    print(f" - id={f.get('format_id')}, ext={f.get('ext')}, abr={f.get('abr')}, vbr={f.get('vbr')}, note={f.get('format_note')}")
            else:
                # No format list: dump a few top-level fields for diagnosis
                # (e.g. playlists expose "entries", live streams "is_live").
                print("No formats found. Inspecting other info fields:")
                for k in ("webpage_url", "extractor", "requested_formats", "is_live", "entries"):
                    print(f"  {k}: {info.get(k)}")
            return info
    except Exception as e:
        # Broad catch is deliberate: this is a diagnostic CLI helper, so we
        # print the traceback instead of crashing the caller.
        print("EXCEPTION during probe:")
        traceback.print_exc()
        # also dump any HTML/diagnostic text if available in exception text
        print("Exception message:", str(e))
45
+
46
if __name__ == "__main__":
    # Optional first CLI argument: path to a Netscape-format cookies file.
    cookies = None
    if len(sys.argv) > 1:
        cookies = sys.argv[1]
        if not os.path.isfile(cookies):
            print(f"Cookie file '{cookies}' not found.")
            sys.exit(1)
    # Hard-coded sample video used for the probe.
    url = "https://www.youtube.com/watch?v=wDchsz8nmbo"
    probe(url, cookies)
extract/utils/retrieve_filepath.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
def retrieve_file_path(file_name):
    """Return the absolute path of *file_name* next to this module, or None.

    Looks for *file_name* in the directory containing this source file.
    Returns the full path when it is an existing regular file; otherwise
    returns None, printing a notice only when the path does not exist at all
    (an existing non-file, e.g. a directory, is silently rejected — same as
    the original behavior).
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(base_dir, file_name)
    if os.path.isfile(file_path):
        return file_path
    if not os.path.exists(file_path):
        print(f"'{file_path}' does not exist.")
    return None
extract/utils/storage.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ import os, uuid
3
+ from datetime import datetime, timedelta, timezone
4
+ from azure.identity import ManagedIdentityCredential, DefaultAzureCredential
5
+ from azure.storage.blob import (
6
+ BlobServiceClient, generate_blob_sas, BlobSasPermissions
7
+ )
8
+
9
# Load .env so local runs share configuration with the deployed app.
load_dotenv()
# Storage account and container for uploaded audio blobs.
ACCOUNT_NAME = os.getenv("AZURE_STORAGE_ACCOUNT","ytstore7135")
CONTAINER = os.getenv("AZURE_BLOB_CONTAINER","audio")
12
+
13
def _credential():
    """Credential for Blob access.

    DefaultAzureCredential tries managed identity when running in Azure and
    falls back to developer credentials locally.
    """
    return DefaultAzureCredential(exclude_interactive_browser_credential=False)
17
+
18
def _svc_client():
    """Build a BlobServiceClient for the configured storage account."""
    account_url = f"https://{ACCOUNT_NAME}.blob.core.windows.net"
    return BlobServiceClient(account_url=account_url, credential=_credential())
21
+
22
def upload_and_sign(local_path: str, ttl_minutes: int = 45) -> str:
    """Upload a local WAV file and return a short-lived read-only SAS URL.

    The blob is stored under a random UUID prefix so concurrent uploads of
    identically named files never collide. Signing uses a user delegation
    key, so no storage account key is required.

    Args:
        local_path: Path of the WAV file to upload.
        ttl_minutes: Lifetime of the returned read-only SAS link.

    Returns:
        The blob URL with the SAS token appended as a query string.
    """
    service = _svc_client()
    blob_name = f"{uuid.uuid4()}/{os.path.basename(local_path)}"
    blob_client = service.get_blob_client(container=CONTAINER, blob=blob_name)
    with open(local_path, "rb") as fh:
        blob_client.upload_blob(fh, overwrite=True, content_type="audio/wav")

    # Delegation-key window starts slightly in the past to absorb clock skew.
    delegation_key = service.get_user_delegation_key(
        key_start_time=datetime.now(timezone.utc) - timedelta(minutes=5),
        key_expiry_time=datetime.now(timezone.utc) + timedelta(hours=2),
    )
    sas_token = generate_blob_sas(
        account_name=ACCOUNT_NAME,
        container_name=CONTAINER,
        blob_name=blob_name,
        user_delegation_key=delegation_key,
        permission=BlobSasPermissions(read=True),
        expiry=datetime.now(timezone.utc) + timedelta(minutes=ttl_minutes),
    )
    return f"{blob_client.url}?{sas_token}"
gradio_client_audichattranscriber.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ import gradio as gr
3
+ from dotenv import load_dotenv
4
+ from gradio_client import Client # Gradio client for Hugging Face models
5
+
6
def main():
    """Call the AudioChatTranscriber Gradio app hosted on Hugging Face.

    Sends a sample audio URL plus system/user prompts to the Space's
    ``/process_audio`` endpoint.

    Returns:
        The summarized transcript on success, or the error message string
        if the remote call fails.
    """
    load_dotenv()  # Load .env file for HF token if needed

    try:
        client = Client("samir72/AudioChatTranscriber")  # Hugging Face model with Gradio app
        response = client.predict(
            upload_path=None,
            record_path=None,
            url="https://audio-samples.github.io/samples/mp3/blizzard_biased/sample-0.mp3",
            sys_prompt="You are an AI assistant with a listening charter to clearly analyze the customer enquiry.",
            user_prompt="Summarize the audio content",
            api_name="/process_audio",
        )
        print(f"Gradio API call at {datetime.now()}")
        print(f"Summarized Output : {response}")
        return response
    except Exception as ex:
        # Bug fix: the original `return print(...)` always returned None
        # (print's return value); now the error text is both printed and
        # returned so callers can distinguish failure from an empty result.
        error_message = f"Error calling Gradio app: {ex}"
        print(error_message)
        return error_message
+
32
+
33
+
34
+ if __name__ == "__main__":
35
+ main()
metadata.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata":
3
+ {
4
+ "id": "1",
5
+ "timestamp": "2025-09-18T11:24:00Z",
6
+ "type": "prompt_metadata",
7
+ "content": {
8
+ "system_prompt": {
9
+ "title": "System Prompt",
10
+ "content": "You are a highly capable AI assistant designed to process and analyze multimodal inputs, including audio and text. Your task is to generate a structured response based on the provided input, following a specific format. When an image is uploaded along with a text prompt, analyze the image content and integrate it with the text to produce a coherent response. Use the following format for your response: - **Summary**: Provide a concise summary of the main points from the text and image (if applicable), limited to 2-3 sentences. - **Key Details**: List 3-5 key details extracted from the text and image, presented as bullet points. - **Insights**: Offer 1-2 insightful observations or conclusions based on the content, keeping it brief. Ensure your response is accurate, relevant, and tailored to the input. If the input lacks sufficient information, acknowledge the limitation and provide a general response based on the available data.",
11
+ "version": "1.0",
12
+ "created_at": "2025-09-18T11:24:00Z"
13
+ },
14
+ "user_prompt": {
15
+ "title": "User Prompt",
16
+ "content": "Analyze the audio file to provide a structured response. Make sure to include a summary, key details, and insights based on the combined information.",
17
+ "version": "1.0",
18
+ "created_at": "2025-09-18T11:25:00Z"
19
+ }
20
+ },
21
+ "tags": ["AI", "multimodal", "podcast", "AWS"],
22
+ "status": "active"
23
+ },
24
+ "metadata_2":
25
+ {
26
+ "id": "2",
27
+ "timestamp": "2025-09-18T11:24:00Z",
28
+ "type": "prompt_metadata",
29
+ "content": {
30
+ "system_prompt": {
31
+ "title": "System Prompt",
32
+ "content": "You are a highly capable AI assistant designed to process and analyze multimodal inputs, including text and images. Your task is to generate a structured response based on the provided input, following a specific format. When an image is uploaded along with a text prompt, analyze the image content and integrate it with the text to produce a coherent response. Use the following format for your response: - **Summary**: Provide a concise summary of the main points from the text and image (if applicable), limited to 2-3 sentences. - **Key Details**: List 3-5 key details extracted from the text and image, presented as bullet points. - **Insights**: Offer 1-2 insightful observations or conclusions based on the content, keeping it brief. Ensure your response is accurate, relevant, and tailored to the input. If the input lacks sufficient information, acknowledge the limitation and provide a general response based on the available data.",
33
+ "version": "1.0",
34
+ "created_at": "2025-09-18T11:00:00Z"
35
+ },
36
+ "user_prompt": {
37
+ "title": "User Prompt",
38
+ "content": "Analyze the attached image and the following text to provide a structured response. Make sure to include a summary, key details, and insights based on the combined information.",
39
+ "version": "1.0",
40
+ "created_at": "2025-09-18T11:10:00Z"
41
+ }
42
+ },
43
+ "tags": ["AI", "multimodal", "podcast", "AWS"],
44
+ "status": "active"
45
+ }
46
+ }
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dotenv==0.9.9
2
+ gradio==5.46.1
3
+ requests==2.32.5
4
+ azure-identity==1.25.0
5
+ azure-ai-projects==1.0.0
6
+ numpy==1.26.4
7
+ openai==1.107.3
8
+ yt_dlp==2025.9.23
9
+ faster_whisper==1.2.0
10
+ fastapi
11
+ uvicorn[standard]==0.30.6
12
+ azure-storage-blob==12.20.0