import gradio as gr
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

# Load the Molmo processor (handles both image preprocessing and tokenization).
processor = AutoProcessor.from_pretrained(
    'allenai/Molmo-7B-D-0924',
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto'
)

# Load the Molmo-7B-D weights (trust_remote_code pulls in the custom architecture).
model = AutoModelForCausalLM.from_pretrained(
    'allenai/Molmo-7B-D-0924',
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto'
)
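
# Note: device_map='auto' (via accelerate) places the weights on a GPU when one
# is visible; on a CPU-only machine the model loads, but generation is very slow.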


def describe_image(image):
    # Gradio supplies a PIL image; normalise to RGB so RGBA or greyscale
    # uploads do not trip up the processor.
    image = image.convert('RGB')

    # Build the model inputs: the uploaded image plus the verification prompt.
    inputs = processor.process(
        images=[image],
        text=(
            "An image of a human sitting properly, with a laptop/PC clearly visible "
            "and the student's face at least 40%-50% visible. The student should be "
            "looking at the laptop screen with both hands on the keyboard. There "
            "should be no accessories other than the laptop/PC, and no second person "
            "should be present. Analyse the image against these conditions. If all "
            "conditions are satisfied, answer YES; otherwise answer NO. Answer only "
            "YES or NO."
        )
    )

    # Move the tensors to the model's device and add a batch dimension.
    inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}

    # Generate up to 200 new tokens; stop_strings requires passing the tokenizer
    # so the generation loop can detect the stop text.
    output = model.generate_from_batch(
        inputs,
        GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
        tokenizer=processor.tokenizer
    )

    # Keep only the newly generated tokens (everything after the prompt).
    generated_tokens = output[0, inputs['input_ids'].size(1):]
    generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return generated_text
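
# Quick sanity check outside the UI (hypothetical local path 'sample.jpg'):
#   from PIL import Image
#   print(describe_image(Image.open('sample.jpg')))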


iface = gr.Interface(
    fn=describe_image,
    inputs=gr.Image(type="pil", label="Upload an Image"),
    outputs=gr.Textbox(label="Result (YES/NO)"),
    title="OPPE",
    description="OPPE verification: answers YES only if the uploaded image satisfies the proctoring conditions."
)

iface.launch()
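
# For a temporary public URL (handy for demos), launch() also accepts share=True:
#   iface.launch(share=True)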