import streamlit as st
from groq import Groq
import base64
from io import BytesIO
from typing import List, Dict
import logging
import os
from PIL import Image
import numpy as np

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize Groq client - the API key must come from the environment;
# never hardcode secrets in source
api_key = os.environ.get("GROQ_API_KEY")
if not api_key:
    raise RuntimeError("GROQ_API_KEY environment variable is not set")
client = Groq(api_key=api_key)

# Convert PIL image to base64 for the Groq API
def image_to_base64(image):
    try:
        if image is None:
            logger.error("No image provided to convert to base64")
            return None
        # Convert numpy array to PIL Image if necessary
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        buffered = BytesIO()
        image.save(buffered, format="PNG")
        img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
        logger.info(f"Successfully converted image to base64 (length: {len(img_str)})")
        return img_str
    except Exception as e:
        logger.error(f"Error converting image to base64: {str(e)}")
        return None

# Classify query using Compound Beta
def classify_query(question: str) -> str:
    try:
        logger.info(f"Classifying query: '{question}'")
        response = client.chat.completions.create(
            model="compound-beta",
            messages=[
                {"role": "system", "content": "Determine if the following question requires image processing to answer. Respond with 'yes' or 'no'. Do not include any additional text or explanations."},
                {"role": "user", "content": question}
            ],
            max_tokens=5
        )
        classification = response.choices[0].message.content.strip().lower()
        logger.info(f"Query classification result: {classification}")
        return classification
    except Exception as e:
        logger.error(f"Error in query classification: {str(e)}")
        # Default to text-only on error
        return "no"

# Process image with Llama 4 Maverick
def process_image_with_maverick(image, question: str) -> str:
    try:
        logger.info("Starting image processing with Maverick")
        if image is None:
            logger.error("Image is None, cannot process with Maverick")
            return "Error: No image provided for analysis."

        image_base64 = image_to_base64(image)
        if image_base64 is None:
            logger.error("Failed to convert image to base64")
            return "Error: Could not process the image."

        logger.info("Sending request to Maverick model")
        # Create the message content with both text and image
        content = [
            {"type": "text", "text": question},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
        ]
        logger.info(f"Content types in request: {[item['type'] for item in content]}")

        response = client.chat.completions.create(
            model="meta-llama/llama-4-maverick-17b-128e-instruct",
            messages=[
                {"role": "user", "content": content}
            ],
            max_tokens=200
        )
        maverick_answer = response.choices[0].message.content
        logger.info(f"Successfully received Maverick response of length: {len(maverick_answer)}")
        return maverick_answer
    except Exception as e:
        logger.error(f"Error in image processing with Maverick: {str(e)}")
        return f"Error in image processing: {str(e)}"

# Generate conversational response with DeepSeek-R1-Distill-Llama-70B
def generate_response(prompt: str, history: List[Dict[str, str]]) -> str:
    try:
        logger.info(f"Generating response using DeepSeek for prompt: '{prompt[:50]}...'")
        messages = [{"role": "system", "content": "You are a helpful conversational chatbot. Provide concise, relevant responses based on the input."}]
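        # history holds {"user": ..., "bot": ...} turns built up by
        # process_image_and_question; each turn is replayed as a
        # user/assistant message pair so the model sees the full context.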
        # Add conversation history
        for entry in history:
            messages.append({"role": "user", "content": entry["user"]})
            messages.append({"role": "assistant", "content": entry["bot"]})
        # Add current prompt
        messages.append({"role": "user", "content": prompt})

        logger.info(f"Sending request to DeepSeek with {len(messages)} messages")
        response = client.chat.completions.create(
            model="deepseek-r1-distill-llama-70b",
            messages=messages,
            max_tokens=500
        )
        deepseek_response = response.choices[0].message.content.strip()
        logger.info(f"Successfully received DeepSeek response of length: {len(deepseek_response)}")
        return deepseek_response
    except Exception as e:
        logger.error(f"Error in response generation with DeepSeek: {str(e)}")
        return f"Error in response generation: {str(e)}"

# Process image and question
def process_image_and_question(image, question: str, history: List[Dict[str, str]]) -> tuple[str, List[Dict[str, str]]]:
    # Input validation
    if not question or question.strip() == "":
        return "Please provide a question.", history

    logger.info(f"Processing request - Image provided: {image is not None}, Question: '{question}'")

    # Check if image is present and valid
    image_valid = image is not None
    if image_valid:
        logger.info("Image is present, determining if image processing is needed")
        # Classify if we need image processing
        classification = classify_query(question)
        needs_image = classification == "yes"
        logger.info(f"Classification result: {classification}, Will use image: {needs_image}")
    else:
        logger.info("No image provided, proceeding with text-only processing")
        needs_image = False

    # Process based on classification
    if needs_image and image_valid:
        logger.info("Using Maverick for image analysis")
        maverick_answer = process_image_with_maverick(image, question)

        # Check if Maverick processing succeeded
        if maverick_answer and not maverick_answer.startswith("Error"):
            # Use Maverick's answer as context for DeepSeek
            prompt = f"Based on image analysis: {maverick_answer}\nPlease respond to the question: {question}"
            response = generate_response(prompt, history)
            # Only show the final response, not the image analysis
            output = f"**Response**: {response}"
        else:
            # Fall back to text-only if image processing failed
            logger.warning(f"Image processing failed, falling back to text-only: {maverick_answer}")
            response = generate_response(question, history)
            output = f"**Response**: {response}"
    else:
        # Text-only processing
        logger.info("Using DeepSeek for text-only processing")
        response = generate_response(question, history)
        output = f"**Response**: {response}"

    # Update history with just the question and final response
    history.append({"user": question, "bot": response})
    return output, history

# Streamlit app
def main():
    st.set_page_config(page_title="Conversational Image Recognition Chatbot", layout="wide")
    st.title("Conversational Image Recognition Chatbot")
    st.markdown("Ask a question with or without an image. The system will decide whether to analyze the image or respond directly.")
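    # Streamlit reruns this script on every widget interaction;
    # st.session_state persists across reruns, so the conversation survives.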
    # Initialize session state for chat history
    if 'history' not in st.session_state:
        st.session_state.history = []

    # Create a sidebar for image upload and question input
    with st.sidebar:
        st.header("Input")
        uploaded_file = st.file_uploader("Upload an image (optional)", type=["jpg", "jpeg", "png"])
        question = st.text_input("Ask a question")
        submit_button = st.button("Submit")
        clear_button = st.button("Clear History")

        # Display uploaded image if available
        if uploaded_file is not None:
            image = Image.open(uploaded_file)
            # use_container_width replaces the deprecated use_column_width
            st.image(image, caption="Uploaded Image", use_container_width=True)
        else:
            image = None

    # Process question on submit
    if submit_button and question:
        with st.spinner("Processing..."):
            output, st.session_state.history = process_image_and_question(
                image, question, st.session_state.history
            )

            # Add to chat display
            st.session_state.messages = st.session_state.get('messages', [])
            st.session_state.messages.append({"role": "user", "content": question})
            st.session_state.messages.append({"role": "assistant", "content": output})

    # Clear history on clear button
    if clear_button:
        st.session_state.history = []
        st.session_state.messages = []
        st.success("Conversation history cleared!")

    # Display chat history
    st.header("Conversation")
    for message in st.session_state.get('messages', []):
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

if __name__ == "__main__":
    main()
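
# Usage sketch (the filename "app.py" is an assumption, not fixed by this script):
#   export GROQ_API_KEY="<your-groq-api-key>"
#   streamlit run app.py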