import torch
from transformers import (
    Qwen2VLForConditionalGeneration,
    AutoProcessor,
    AutoModelForCausalLM,
    AutoTokenizer,
)
from qwen_vl_utils import process_vision_info
from PIL import Image
import cv2
import numpy as np
import gradio as gr
import spaces
from huggingface_hub import login
import os

# Quota management constants
MAX_GPU_TIME_PER_REQUEST = 59  # seconds
COOLDOWN_PERIOD = 300  # 5 minutes in seconds


# Authenticate with Hugging Face before any models are loaded
def init_huggingface_auth():
    # Get token from the environment
    token = os.getenv("HUGGINGFACE_TOKEN")
    if token:
        login(token=token)
        print("Successfully authenticated with Hugging Face")
    else:
        raise ValueError("HUGGINGFACE_TOKEN not found in environment variables")


# Load both models and their processors/tokenizers
def load_models():
    try:
        # Initialize HF auth before loading models
        init_huggingface_auth()

        # Vision model
        vision_model = Qwen2VLForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2-VL-2B-Instruct",
            torch_dtype=torch.float16,
            device_map="auto",
            use_auth_token=True,  # Use the logged-in HF token
        )
        vision_processor = AutoProcessor.from_pretrained(
            "Qwen/Qwen2-VL-2B-Instruct",
            use_auth_token=True,
        )

        # Code model
        code_model = AutoModelForCausalLM.from_pretrained(
            "Qwen/Qwen2.5-Coder-1.5B-Instruct",
            torch_dtype=torch.float16,
            device_map="auto",
            use_auth_token=True,
        )
        code_tokenizer = AutoTokenizer.from_pretrained(
            "Qwen/Qwen2.5-Coder-1.5B-Instruct",
            use_auth_token=True,
        )

        # Free up CUDA memory after loading
        torch.cuda.empty_cache()

        return vision_model, vision_processor, code_model, code_tokenizer
    except Exception as e:
        print(f"Error loading models: {str(e)}")
        raise


vision_model, vision_processor, code_model, code_tokenizer = load_models()

VISION_SYSTEM_PROMPT = """Extract code from images/videos:
1. Output exact code snippets only
2. Keep original formatting/indentation
3. Focus on code-relevant frames only

[code]

If multiple code sections are visible, separate them with ---

Note: The video may contain irrelevant frames (e.g., other window tabs, the eterniq website, etc.). Focus only on the code-specific frames, since that is the content to extract.
"""

CODE_SYSTEM_PROMPT = """Debug code as an expert:
- Analyze OCR-extracted code + user's issue
- Find bugs/issues
- Provide fixes
- Explain corrections

Output:
Fixed Code: [corrected code]
Original Issue: [brief analysis]

Note: Please provide the output in well-structured Markdown. Remove all unnecessary information and exclude any additional code formatting such as triple backticks or language identifiers.
"""


def process_video_for_code(video_path, transcribed_text, max_frames=16, frame_interval=30):
    # Sample every `frame_interval`-th frame, up to `max_frames` frames
    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_count = 0

    while len(frames) < max_frames:
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % frame_interval == 0:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = Image.fromarray(frame)
            frames.append(frame)
        frame_count += 1

    cap.release()

    if not frames:
        return "No frames could be extracted from the video.", "No code could be analyzed."
    # Process all sampled frames with the vision model
    vision_descriptions = []
    for frame in frames:
        vision_description = process_image_for_vision(frame, transcribed_text)
        vision_descriptions.append(vision_description)

    # Combine all vision descriptions
    combined_vision_description = "\n\n".join(vision_descriptions)

    # Use the code model to fix the code based on the combined description
    fixed_code_response = process_for_code(combined_vision_description)

    return combined_vision_description, fixed_code_response


def process_image_for_vision(image, transcribed_text):
    vision_messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {
                    "type": "text",
                    "text": f"{VISION_SYSTEM_PROMPT}\n\nDescribe the code and any errors you see in this image. User's description: {transcribed_text}",
                },
            ],
        }
    ]

    vision_text = vision_processor.apply_chat_template(
        vision_messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(vision_messages)
    vision_inputs = vision_processor(
        text=[vision_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(vision_model.device)

    with torch.no_grad():
        vision_output_ids = vision_model.generate(**vision_inputs, max_new_tokens=512)

    # Strip the prompt tokens so only the generated description remains
    vision_output_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(vision_inputs.input_ids, vision_output_ids)
    ]
    return vision_processor.batch_decode(
        vision_output_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]


def process_for_code(vision_description):
    code_messages = [
        {"role": "system", "content": CODE_SYSTEM_PROMPT},
        {
            "role": "user",
            "content": f"Here's a description of code with errors:\n\n{vision_description}\n\nPlease analyze and fix the code.",
        },
    ]

    code_text = code_tokenizer.apply_chat_template(
        code_messages, tokenize=False, add_generation_prompt=True
    )
    code_inputs = code_tokenizer([code_text], return_tensors="pt").to(code_model.device)

    with torch.no_grad():
        code_output_ids = code_model.generate(
            **code_inputs,
            max_new_tokens=1024,
            do_sample=True,  # Enable sampling so temperature/top_p take effect
            temperature=0.7,
            top_p=0.95,
        )

    code_output_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(code_inputs.input_ids, code_output_ids)
    ]
    return code_tokenizer.batch_decode(
        code_output_trimmed, skip_special_tokens=True
    )[0]


@spaces.GPU
def process_content(video, transcribed_text):
    try:
        if video is None:
            return "Please upload a video file of code with errors.", ""

        # Free cached GPU memory before starting a new request
        torch.cuda.empty_cache()

        # Check free GPU memory (not total device memory) before processing
        if torch.cuda.is_available():
            free_memory, _ = torch.cuda.mem_get_info()
            if free_memory < 1e9:  # Less than 1GB free
                raise RuntimeError("Insufficient GPU memory available")

        # gr.File may pass a filepath string or a file-like object depending on the Gradio version
        video_path = video if isinstance(video, str) else video.name

        vision_output, code_output = process_video_for_code(
            video_path,
            transcribed_text,
            max_frames=8,  # Reduced from 16 to lower GPU usage
        )
        return vision_output, code_output
    except spaces.zero.gradio.HTMLError as e:
        if "exceeded your GPU quota" in str(e):
            return (
                "GPU quota exceeded. Please try again later or consider upgrading to a paid plan.",
                "",
            )
        # Surface other ZeroGPU errors as a normal error message instead of returning None
        return f"Error processing content: {str(e)}", ""
    except Exception as e:
        return f"Error processing content: {str(e)}", ""
    finally:
        # Clean up GPU memory
        torch.cuda.empty_cache()


# Gradio interface
iface = gr.Interface(
    fn=process_content,
    inputs=[
        gr.File(label="Upload Video of Code with Errors"),
        gr.Textbox(label="Transcribed Audio"),
    ],
    outputs=[
        gr.Textbox(label="Vision Model Output (Code Description)"),
        gr.Code(label="Fixed Code", language="python"),
    ],
    title="Vision Code Debugger",
    description="Upload a video of code with errors, provide the transcribed audio, and the AI will analyze and fix the issues.",
    allow_flagging="never",  # Disable flagging to reduce overhead
    cache_examples=True,  # Enable caching to reduce GPU usage
)

if __name__ == "__main__":
    iface.launch(show_error=True)