import gradio as gr
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from PIL import Image

# Check for CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Load Qwen2-VL with 8-bit weights via bitsandbytes. (GPTQ is not used here:
# auto_gptq exposes no GPTQForModel class, and GPTQ quantization needs a
# calibration dataset. bitsandbytes 8-bit loading requires a CUDA GPU.)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",  # accelerate places the layers; no manual .to(device) needed
    low_cpu_mem_usage=True,
)
model.eval()  # Set the model to evaluation mode

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# byaldi's RAGMultiModalModel (ColPali) is a retrieval index (index()/search()),
# not an OCR engine, so it is not loaded here; Qwen2-VL reads the image text itself.
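# Optional sanity check (not part of the original app): report the loaded
# model's memory footprint to confirm the 8-bit weights took effect.
# get_memory_footprint() is a standard transformers PreTrainedModel method.
print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")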
def process_image(image, keywords):
    # Build a chat-style prompt; Qwen2-VL extracts the text from the image itself
    prompt = (
        "Read the text in this image.\n"
        "1. Identify Hindi and English parts.\n"
        "2. Translate Hindi to English.\n"
        "3. Summarize content."
    )
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": prompt},
        ]},
    ]
    text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=[text], images=[image], return_tensors="pt").to(model.device)

    # Generate output with the quantized model
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=500,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            num_return_sequences=1,
        )

    # Trim the prompt tokens so the analysis contains only the model's reply
    generated = outputs[:, inputs["input_ids"].shape[1]:]
    analysis = processor.batch_decode(generated, skip_special_tokens=True)[0]

    # Search for keywords (case-insensitive substring match)
    keyword_list = [kw.strip() for kw in keywords.split(',')]
    found_keywords = [kw for kw in keyword_list if kw.lower() in analysis.lower()]
    return analysis, ', '.join(found_keywords) if found_keywords else "No keywords found"
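# Optional variant (a hedged sketch, not from the original app): the substring
# check above also matches inside longer words, e.g. "art" inside "party".
# A word-boundary regex avoids that; the helper name find_keywords is illustrative.
import re

def find_keywords(text, keywords):
    """Return the comma-separated keywords that appear as whole words in text."""
    hits = []
    for kw in (k.strip() for k in keywords.split(',')):
        if kw and re.search(rf"\b{re.escape(kw)}\b", text, flags=re.IGNORECASE):
            hits.append(kw)
    return hits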
# Create Gradio interface
iface = gr.Interface(
    fn=process_image,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="Enter keywords (comma-separated)")
    ],
    outputs=[
        gr.Textbox(label="Analysis Result"),
        gr.Textbox(label="Found Keywords")
    ],
    title="Image OCR and Keyword Search (Quantized Model)",
    description="Upload an image to extract and analyze text, then search for specific keywords. This version uses a quantized model for improved efficiency."
)

# Launch the interface
iface.launch()
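# Local smoke test (hedged sketch): comment out iface.launch() above, then run
# the lines below with a local image. "sample.png" and the keyword list are
# placeholders, not part of the original Space. Requires torch, transformers,
# accelerate, bitsandbytes, gradio, and Pillow.
#
# img = Image.open("sample.png")
# analysis, hits = process_image(img, "invoice, total, date")
# print(analysis)
# print(hits)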