# ClipScript — app.py
# (Hugging Face Spaces scrape artifact removed: commit "fix", 594b1dc, by muzzz)
from functools import wraps
import logging
import gradio as gr
import os
import modal
from openai import OpenAI
from dotenv import load_dotenv
import re
import time
import uuid
import yt_dlp
import tempfile
import shutil
from pathlib import Path
# Load environment variables (e.g. OPENROUTER_API_KEY) from a local .env file.
load_dotenv()
# Handles to the Modal-deployed backend: the remote media-processing function,
# the ASR service class, and the shared volume used to stage uploaded files.
process_media_remotely = modal.Function.from_name("clipscript-processing-service", "process_media")
asr_handle = modal.Cls.from_name("clipscript-asr-service", "ASR")
upload_volume = modal.Volume.from_name("clipscript-uploads", create_if_missing=True)
# OpenRouter model id used for blog generation and refinement.
llm = "deepseek/deepseek-r1-0528:free"
# May be None if the secret is not configured; the OpenAI client is built below.
api_key = os.environ.get("OPENROUTER_API_KEY")
def retry_on_rate_limit(max_retries: int = 5, base_delay: float = 2.0):
    """Decorator that retries a callable with exponential backoff on rate limits.

    A call is retried when the raised exception looks like an HTTP 429 /
    rate-limit error (status code, "429" in the message, or the words
    "rate limit"); any other exception propagates immediately.

    Args:
        max_retries: Maximum number of attempts before giving up.
        base_delay: Initial sleep in seconds; doubled after each retry.

    Raises:
        Exception: When all attempts are exhausted; the last rate-limit
            error is chained as the cause.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            delay = base_delay
            last_exc = None
            for attempt in range(max_retries):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    # A 429 may surface as a response attribute, as "429" in the
                    # message text, or as the words "rate limit", depending on
                    # which client library raised it.
                    status_code = getattr(getattr(e, 'response', None), 'status_code', None)
                    if status_code == 429 or '429' in str(e) or 'rate limit' in str(e).lower():
                        last_exc = e
                        # Only sleep when another attempt remains — sleeping
                        # after the final failure just wastes time.
                        if attempt < max_retries - 1:
                            logging.warning(f"Rate limit hit. Retrying in {delay:.1f} seconds...")
                            time.sleep(delay)
                            delay *= 2
                    else:
                        raise
            # Chain the last rate-limit error so callers can see the root cause.
            raise Exception("Max retries exceeded due to rate limits or other persistent errors.") from last_exc
        return wrapper
    return decorator
def extract_youtube_video_id(url: str) -> "str | None":
    """Extract the YouTube video ID from various YouTube URL formats.

    Supports watch, youtu.be, embed, /v/, shorts, and live URLs.

    Args:
        url: Any URL string.

    Returns:
        The video ID, or None if the URL is not a recognised YouTube URL.
        (The original annotation claimed ``str`` but None is a valid result.)
    """
    patterns = [
        r'(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/|youtube\.com\/v\/)([^&\n?#]+)',
        r'youtube\.com\/watch\?.*v=([^&\n?#]+)',
        # Generalization: shorts and live URLs carry the ID in the path too.
        r'youtube\.com\/(?:shorts|live)\/([^&\n?#\/]+)'
    ]
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None
def get_youtube_thumbnail_url(video_id: str) -> str:
    """Build the max-resolution thumbnail URL for a YouTube video ID."""
    base = "https://img.youtube.com/vi"
    return f"{base}/{video_id}/maxresdefault.jpg"
# OpenAI-compatible client pointed at OpenRouter; used for all LLM calls below.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=api_key,
)
def download_and_convert_youtube_audio(url: str) -> str:
    """
    Downloads audio from a YouTube URL and converts it to a 16kHz mono WAV file.

    Uses a temporary directory for all intermediate files, ensuring cleanup
    even on failure. The caller is responsible for deleting the returned file.

    Args:
        url: A YouTube video URL.

    Returns:
        Path to a temporary 16 kHz mono WAV file containing the audio track.

    Raises:
        FileNotFoundError: If yt-dlp did not produce a WAV file (e.g. the
            video is protected or unavailable).
    """
    temp_dir = tempfile.mkdtemp()
    cookie_file_path = None
    try:
        # Check for YouTube cookies in secrets and write to a temporary file
        # (presumably to let yt-dlp access restricted videos — see yt-dlp docs).
        youtube_cookies = os.environ.get("YOUTUBE_COOKIES")
        if youtube_cookies:
            # Use NamedTemporaryFile to handle the file creation and cleanup;
            # delete=False so the path survives the 'with' for yt-dlp to read.
            with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as tmp_cookie_file:
                tmp_cookie_file.write(youtube_cookies)
                cookie_file_path = tmp_cookie_file.name
            print("Using YouTube cookies from secrets.")
        output_tmpl = os.path.join(temp_dir, "audio.%(ext)s")
        ydl_opts = {
            "format": "bestaudio/best",
            "outtmpl": output_tmpl,
            # Re-encode the downloaded audio stream to WAV via ffmpeg.
            "postprocessors": [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'wav',
            }],
            # '-ar 16000 -ac 1' -> 16 kHz mono, matching what the ASR call expects.
            # NOTE(review): 'extractaudio' is assumed to be yt-dlp's alias for
            # the FFmpegExtractAudio postprocessor — confirm against yt-dlp docs.
            'postprocessor_args': {
                'extractaudio': ['-ar', '16000', '-ac', '1']
            },
            "quiet": True,
        }
        # Add cookiefile to options if it exists
        if cookie_file_path:
            ydl_opts['cookiefile'] = cookie_file_path
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        # Find the downloaded .wav file
        downloaded_files = list(Path(temp_dir).glob("*.wav"))
        if not downloaded_files:
            raise FileNotFoundError("yt-dlp failed to create a WAV file. The video might be protected or unavailable.")
        # Move the final file to a new temporary location so we can clean up the directory
        source_path = downloaded_files[0]
        fd, dest_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)  # mkstemp returns an open fd; close it before moving over the file.
        shutil.move(source_path, dest_path)
        return dest_path
    finally:
        # Clean up the cookie file if it was created
        if cookie_file_path and os.path.exists(cookie_file_path):
            os.remove(cookie_file_path)
        shutil.rmtree(temp_dir)
def handle_transcription(file, url):
    """Transcribe an uploaded file or a URL via the Modal backend.

    Exactly one input is expected. YouTube URLs are downloaded and converted
    locally, then the audio bytes are sent to the ASR service; other URLs and
    file uploads are processed remotely. Returns the transcript text, or an
    "Error: ..." string on failure (downstream handlers check that prefix).
    """
    if not file and not (url and url.strip()):
        gr.Warning("Please upload a file or enter a URL.")
        return "Error: Please upload a file or enter a URL."
    gr.Info("Starting secure transcription... This might take a moment.")
    try:
        result = None
        if url and url.strip():
            video_id = extract_youtube_video_id(url)
            if video_id:
                converted_wav_path = None
                try:
                    print(f"Detected YouTube URL. Processing locally: {url}")
                    converted_wav_path = download_and_convert_youtube_audio(url)
                    # Read audio bytes and call ASR service
                    with open(converted_wav_path, "rb") as f:
                        audio_bytes = f.read()
                    print("Sending audio bytes to ASR service.")
                    result = asr_handle().transcribe.remote(audio_bytes=audio_bytes)
                finally:
                    # Clean up the final temp file
                    if converted_wav_path and os.path.exists(converted_wav_path):
                        os.remove(converted_wav_path)
            else:
                # Process other URLs remotely and securely.
                print(f"Sending URL to Modal for processing: {url}")
                result = process_media_remotely.remote(url=url)
        elif file is not None:
            # For file uploads:
            # 1. Generate a unique ID for the file.
            upload_id = f"upload-{uuid.uuid4()}"
            print(f"Uploading file to Modal volume with ID: {upload_id}")
            # 2. Upload the local file to the remote volume
            with upload_volume.batch_upload() as batch:
                batch.put_file(file, upload_id)
            # 3. Trigger remote processing by passing the upload ID.
            print(f"Sending upload ID to Modal for processing: {upload_id}")
            result = process_media_remotely.remote(upload_id=upload_id)
        # All backend paths are assumed to return a dict with "text" or "error"
        # — TODO confirm against the ASR/processing services.
        if result.get("error"):
            return f"Error from ASR service: {result['error']}"
        return result["text"]
    except Exception as e:
        print(f"An error occurred: {e}")
        # It's good practice to remove the local temp file if it exists
        # (the finally below repeats this; the exists() guard keeps it safe).
        if file and os.path.exists(file):
            os.remove(file)
        return f"Error: {str(e)}"
    finally:
        # Gradio's gr.File widget creates a temporary file. We should clean it up.
        if file and os.path.exists(file):
            os.remove(file)
def add_transcript_to_chat(transcript: str):
    """Report transcription status to the user and reset the chat display.

    The transcript itself is never shown in the chatbot — it is passed to the
    LLM separately — so both branches return an empty history.
    """
    if not transcript.startswith("Error"):
        gr.Info("Transcript ready! Generating blog post...")
        return []
    gr.Error("Transcription failed. Please check the logs.")
    return []
def user_chat(user_message: str, history: list):
    """Append the user's message to a copy of the history and clear the input box."""
    updated = list(history)
    updated.append({"role": "user", "content": user_message})
    return "", updated
@retry_on_rate_limit(max_retries=3, base_delay=1.0)
def _create_chat_stream(messages: list):
    """Open a streaming chat completion; retried on rate limits.

    Kept as a plain (non-generator) function so the retry decorator actually
    observes exceptions from the API call. Decorating the generator below was
    a no-op: calling a generator function raises nothing — the request only
    fires on first iteration, outside the decorator's try block.
    """
    return client.chat.completions.create(
        model=llm,
        messages=messages,
        stream=True
    )

def _stream_chat_response(history: list, system_prompt: str, transcript: str = None):
    """Stream an assistant reply into *history*, yielding after each chunk.

    Args:
        history: Chat messages in OpenAI format; mutated in place — an
            assistant message is appended and grown as chunks arrive.
        system_prompt: System instruction prepended to the conversation.
        transcript: Optional transcript injected as a hidden first user
            message (not displayed in the chat UI).

    Yields:
        The updated history after each received content chunk.
    """
    if not history and not transcript:
        # Don't do anything if there's no history and no transcript
        return
    if transcript and transcript.startswith("Error"):
        # Transcription failed upstream; skip the LLM call entirely.
        return
    # Include transcript as first user message if provided, but don't display it
    messages = [{"role": "system", "content": system_prompt}]
    if transcript:
        messages.append({"role": "user", "content": transcript})
    messages.extend(history)
    stream = _create_chat_stream(messages)
    history.append({"role": "assistant", "content": ""})
    response_content = ""
    for chunk in stream:
        content = chunk.choices[0].delta.content
        if content:
            response_content += content
            history[-1]["content"] = response_content
            yield history
def generate_blog_post(history: list, transcript: str, context: str):
    """Stream the initial blog post generated from the transcript.

    Optional user-supplied context is folded into the transcript payload
    before it is handed to the LLM.
    """
    system_prompt = """You are an expert blog writer and editor. Your task is to transform a raw video transcription into a well-structured, engaging, and publish-ready blog post in Markdown format.
Core Mandate: Erase the Video Origin
This is a critical function. The reader must not know the content came from a video.
Eliminate all video-specific language: Remove phrases like "in this video," "thanks for watching," "as you can see here," "welcome to the channel," etc.
Scrub all platform calls-to-action: No "like and subscribe," "hit the bell icon," or "comment below."
Remove sponsor messages and ads: Completely omit any sponsor mentions.
Rephrase visual references: Convert "look at this screen" to a description of the information itself (e.g., "The data reveals that...").
Content & Formatting Rules:
Title: Create a compelling, SEO-friendly H1 title.
Structure: Use ## for main headings and ### for subheadings to create a logical flow.
Readability: Use short paragraphs, bulleted/numbered lists, and bolding for key terms.
Refine Prose: Convert conversational speech into clean, professional writing.
Remove all filler words (um, uh, like, you know).
Fix grammar and consolidate rambling sentences.
Flow: Start with a strong introduction and end with a concise summary or conclusion.
Your output must be a complete, polished article in Markdown."""
    # Attach additional context (code, articles, references) when present.
    extra = context.strip() if context else ""
    if extra:
        payload = f"{transcript}\n\n--- Additional Context ---\n{extra}\n\nThis is some additional context relevant to the transcription above."
    else:
        payload = transcript
    yield from _stream_chat_response(history, system_prompt, payload)
def bot_chat(history: list):
    """Stream a refined blog post based on the user's latest instructions."""
    refine_prompt = "You are a helpful assistant that helps refine a blog post created from an audio transcript. The user will provide instructions for changes and you will return only the updated blog post."
    yield from _stream_chat_response(history, refine_prompt)
def update_thumbnail_display(url: str):
    """Show the YouTube thumbnail for a recognised URL; hide it otherwise."""
    hidden = gr.update(visible=False, value=None)
    if not url or not url.strip():
        return hidden
    video_id = extract_youtube_video_id(url)
    if not video_id:
        return hidden
    return gr.update(visible=True, value=get_youtube_thumbnail_url(video_id))
# Gradio Interface
# Layout: one row with inputs (file/URL/thumbnail) beside the transcript box,
# followed by a context box, the blog-post chatbot, and a refinement input.
theme = gr.themes.Ocean()
with gr.Blocks(title="ClipScript", theme=theme) as demo:
    gr.Markdown("# 🎬➡️📝 ClipScript: Video-to-Blog Transformer", elem_classes="hero-title")
    gr.Markdown("### Upload an audio file, or provide a YouTube/direct URL *of any size*.")
    with gr.Row():
        # Column 1: File input, URL input, and thumbnail
        with gr.Column(scale=1):
            file_input = gr.File(label="Upload any audio file (Recommended)", type="filepath", height=200, file_types=["audio", ".webm", ".mp3", ".mp4", ".m4a", ".ogg", ".wav"])
            with gr.Row():
                with gr.Column():
                    url_input = gr.Textbox(
                        label="YouTube or Direct Audio URL",
                        placeholder="youtube.com/watch?v=... OR xyz.com/audio.mp3",
                        scale=2
                    )
                    # YouTube thumbnail display (hidden until a YouTube URL is typed)
                    thumbnail_display = gr.Image(
                        label="Thumbnail",
                        visible=False,
                        height=100,
                        show_download_button=False,
                        interactive=False,
                        scale=2
                    )
        # Column 2: Transcript view
        with gr.Column(scale=2):
            transcript_output = gr.Textbox(label="Transcription POWERED by Modal Labs", lines=12, interactive=True, show_copy_button=True)
    transcribe_button = gr.Button("Blogify", variant="primary")
    gr.Markdown("---")
    # Add Context section
    context_input = gr.Textbox(
        label="Additional Context",
        placeholder="Enter any additional context, code, articles, or any references that relate to the video content...",
        lines=5,
        interactive=True
    )
    chatbot = gr.Chatbot(
        label="Blog Post", type="messages", height=500, show_copy_all_button=True, show_copy_button=True, show_share_button=True
    )
    chat_input = gr.Textbox(
        label="Your message",
        placeholder="Refine the blog post or ask for changes...",
        container=False,
    )
    clear_button = gr.ClearButton([chat_input, chatbot])
    # Event handlers to disable/enable inputs based on usage
    # (the two input methods are mutually exclusive: using one disables the other)
    def on_file_upload(file):
        # A chosen file locks the URL box and hides any thumbnail.
        if file is not None:
            return gr.update(interactive=False), gr.update(visible=False, value=None)
        else:
            return gr.update(interactive=True), gr.update(visible=False, value=None)
    def on_url_change(url):
        # A non-empty URL locks the file input and may show a thumbnail.
        if url and url.strip():
            thumbnail_update = update_thumbnail_display(url)
            return gr.update(interactive=False), thumbnail_update
        else:
            return gr.update(interactive=True), gr.update(visible=False, value=None)
    file_input.change(fn=on_file_upload, inputs=file_input, outputs=[url_input, thumbnail_display])
    url_input.change(fn=on_url_change, inputs=url_input, outputs=[file_input, thumbnail_display])
    # Chained events for blog generation:
    # transcribe -> reset file widget -> validate/clear chat -> stream blog post
    (
        transcribe_button.click(
            fn=handle_transcription,
            inputs=[file_input, url_input],
            outputs=transcript_output,
        )
        .then(
            fn=lambda: gr.update(value=None, interactive=True),
            outputs=file_input,
            queue=False,
        )
        .then(
            fn=add_transcript_to_chat,
            inputs=transcript_output,
            outputs=chatbot,
            queue=False,
        )
        .then(fn=generate_blog_post, inputs=[chatbot, transcript_output, context_input], outputs=chatbot)
    )
    # Event handler for follow-up chat
    chat_input.submit(
        fn=user_chat,
        inputs=[chat_input, chatbot],
        outputs=[chat_input, chatbot],
        queue=False,
    ).then(fn=bot_chat, inputs=chatbot, outputs=chatbot)
if __name__ == "__main__":
    demo.launch()