| | import streamlit as st |
| | from transformers import pipeline |
| | from PIL import Image |
| | import torch |
| | import os |
| |
|
| | |
# Redirect all Hugging Face / transformers caches into a writable app
# directory (required on hosted Spaces, where the default home cache
# location is read-only).
# NOTE(review): TRANSFORMERS_CACHE is deprecated in recent transformers
# releases in favor of HF_HOME; setting both is harmless and keeps older
# library versions working — confirm against the pinned transformers version.
os.environ["TRANSFORMERS_CACHE"] = "/app/cache/transformers"
os.environ["HF_HOME"] = "/app/cache/hf"
os.environ["HF_HUB_CACHE"] = "/app/cache/hf"

# Access token for the gated Gemma checkpoint; injected as a Space secret.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    # Mirror the token under the variable name huggingface_hub also reads.
    os.environ["HUGGINGFACE_HUB_TOKEN"] = hf_token

# Streamlit page chrome; must be the first Streamlit call in the script.
st.set_page_config(
    page_title="Gemma-3n E4B Vision-Language Model",
    page_icon="🤖",  # original literal was mojibake ("π€"); restored to an emoji
    layout="wide",
)
| |
|
@st.cache_resource
def load_model():
    """Build and cache the image-text-to-text pipeline (one load per process).

    Returns the pipeline on success, or None (after surfacing a Streamlit
    error) when the token is missing or the download/initialization fails.
    """
    try:
        # The gated Gemma checkpoint cannot be fetched without a token.
        if not hf_token:
            st.error("HF_TOKEN not found in environment variables")
            return None

        use_cuda = torch.cuda.is_available()
        return pipeline(
            "image-text-to-text",
            model="google/gemma-3n-E4B-it",
            # Half precision only pays off on GPU; CPU stays in float32.
            torch_dtype=torch.float16 if use_cuda else torch.float32,
            device_map="auto" if use_cuda else "cpu",
            token=hf_token,
        )
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        st.error("Make sure you have access to the model and your token is valid.")
        return None
| |
|
def generate_response(pipe, image, text_prompt, max_tokens=100):
    """Run the image-text-to-text pipeline on (image, prompt) and return text.

    Args:
        pipe: A callable pipeline accepting (messages, max_new_tokens=...).
        image: The PIL image (or any object the pipeline accepts as an image).
        text_prompt: The user's question about the image.
        max_tokens: Maximum number of new tokens to generate.

    Returns:
        The generated answer as a string. On any pipeline failure, an
        "Error generating response: ..." string (never raises).
    """
    try:
        # Single-turn chat message carrying both the image and the question.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": text_prompt},
                ],
            }
        ]

        response = pipe(messages, max_new_tokens=max_tokens)

        # The pipeline returns [{"generated_text": ...}]. For chat-style
        # pipelines "generated_text" is the whole message list (prompt +
        # reply); returning it verbatim would dump the full transcript, so
        # extract the final (assistant) turn's content instead.
        if isinstance(response, list) and response:
            first = response[0]
            if isinstance(first, dict) and 'generated_text' in first:
                generated = first['generated_text']
                if isinstance(generated, list) and generated:
                    last_turn = generated[-1]
                    if isinstance(last_turn, dict):
                        return last_turn.get("content", str(last_turn))
                    return str(last_turn)
                return generated
            elif isinstance(first, str):
                return first

        # Unrecognized shape: fall back to a plain string dump.
        return str(response)

    except Exception as e:
        return f"Error generating response: {str(e)}"
| |
|
def main():
    """Render the Streamlit UI: token check, model load, image upload + prompt,
    and the generated answer.

    All user-facing emoji below were mojibake in the original source
    (e.g. "π€", "β"); they have been restored to plausible emoji —
    verify the exact glyphs against the original author's intent.
    """
    st.title("🤖 Gemma-3n E4B Vision-Language Model")
    st.markdown("Upload an image and ask questions about it!")

    # Fail fast with setup instructions when the Space secret is missing.
    if not hf_token:
        st.error("❌ HuggingFace token not found in environment variables.")
        st.markdown("""
        **To fix this:**
        1. Go to your Space settings (⚙️ icon)
        2. Navigate to "Repository secrets"
        3. Add a secret with name: `HF_TOKEN`
        4. Value: Your HuggingFace token
        5. Restart the Space
        """)
        return
    else:
        st.success("✅ HuggingFace token found!")

    # Sidebar checklist so users can self-diagnose gated-model access issues.
    # (Original used an f-string with no placeholders; plain string suffices.)
    st.sidebar.markdown("### 🔑 Setup Status")
    st.sidebar.markdown("""
    ✅ **Token**: Found in environment

    Make sure you have:
    1. ✅ Access to the gated model
    2. ✅ Added your HF token to Space secrets
    3. ✅ Token has proper permissions
    """)

    # load_model is cached (st.cache_resource), so the spinner is only slow
    # on the first run of the process.
    with st.spinner("Loading model... This may take a few minutes on first run."):
        pipe = load_model()

    if pipe is None:
        st.error("Failed to load model. Please check your setup and try again.")
        return

    st.success("Model loaded successfully!")

    col1, col2 = st.columns([1, 1])

    with col1:
        st.subheader("📤 Input")

        uploaded_file = st.file_uploader(
            "Choose an image...",
            type=['png', 'jpg', 'jpeg', 'gif', 'bmp'],
            help="Upload an image to analyze"
        )

        text_prompt = st.text_area(
            "Ask a question about the image:",
            placeholder="What do you see in this image?",
            height=100
        )

        max_tokens = st.slider(
            "Max tokens to generate:",
            min_value=10,
            max_value=200,
            value=100,
            help="Maximum number of tokens to generate"
        )

        generate_btn = st.button("🚀 Generate Response", type="primary")

    with col2:
        st.subheader("📥 Output")

        if uploaded_file is not None:
            image = Image.open(uploaded_file)
            st.image(image, caption="Uploaded image", use_column_width=True)

            if generate_btn:
                if not text_prompt.strip():
                    st.warning("Please enter a question about the image.")
                else:
                    with st.spinner("Generating response..."):
                        response = generate_response(
                            pipe, image, text_prompt, max_tokens
                        )

                    st.subheader("🤖 Model Response:")
                    st.write(response)
        else:
            st.info("👆 Please upload an image to get started")

    # Static help section with example prompts.
    st.markdown("---")
    st.subheader("💡 Example Questions to Try:")
    st.markdown("""
    - What objects do you see in this image?
    - Describe the scene in detail
    - What colors are present in the image?
    - What is the main subject of this image?
    - Can you identify any text in this image?
    """)

    st.markdown("---")
    st.markdown(
        "Built with ❤️ using [Streamlit](https://streamlit.io) and "
        "[Hugging Face Transformers](https://huggingface.co/transformers/)"
    )
| |
|
# Entry point: Streamlit executes this module as a script, so the guard
# fires on every run while still allowing safe import of the module.
if __name__ == "__main__":
    main()