"""Streamlit app: convert an uploaded UI screenshot into code with VLM_WebSight."""

import streamlit as st
import torch
from PIL import Image as PILImage
from transformers import AutoModelForCausalLM, AutoProcessor

MODEL_ID = "HuggingFaceM4/VLM_WebSight_finetuned"


def custom_image_to_text_pipeline(image, processor, model, device):
    """Run the image-to-text model on a single PIL image and return decoded text.

    Args:
        image: A PIL image (any mode; converted to RGB before processing).
        processor: The HF processor matching the model.
        model: A causal-LM with vision inputs supporting ``generate``.
        device: torch.device the model lives on.

    Returns:
        The decoded generation as a single string (special tokens stripped).
    """
    # PNG uploads may carry an alpha channel; the processor expects RGB.
    if image.mode != "RGB":
        image = image.convert("RGB")

    inputs = processor(images=image, return_tensors="pt")
    # Move tensors to the model's device; cast floating tensors to the
    # model's dtype (bfloat16 here) so they match the weights.
    inputs = {
        k: v.to(device=device, dtype=model.dtype)
        if v.is_floating_point()
        else v.to(device)
        for k, v in inputs.items()
    }

    # Inference only — skip autograd bookkeeping.
    with torch.no_grad():
        output = model.generate(**inputs)

    return processor.decode(output[0], skip_special_tokens=True)


@st.cache_resource  # load once per server process, not on every Streamlit rerun
def _load_model():
    """Load processor + model, preferring GPU but falling back to CPU."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    ).to(device)
    return processor, model, device


def main():
    """Streamlit UI: upload an image, show it, and display the generated code."""
    st.title("Image to Code Converter")

    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "png"])

    if uploaded_file is not None:
        # Display the uploaded image
        st.image(uploaded_file, caption='Uploaded Image.', use_column_width=True)
        st.write("")

        processor, model, device = _load_model()

        # Convert the file to an image and process it
        with PILImage.open(uploaded_file) as image:
            st.write("Converting image to code...")
            code_result = custom_image_to_text_pipeline(
                image, processor, model, device
            )

        # Display the code
        st.code(code_result)


if __name__ == "__main__":
    main()