import gradio as gr import os import nltk from transformers import VisionEncoderDecoderModel, AutoTokenizer, ViTImageProcessor, pipeline import torch from PIL import Image from nltk.corpus import stopwords from io import BytesIO nltk.download('stopwords') model = VisionEncoderDecoderModel.from_pretrained("SumanthKarnati/Image2Ingredients") model.eval() feature_extractor = ViTImageProcessor.from_pretrained('nlpconnect/vit-gpt2-image-captioning') tokenizer = AutoTokenizer.from_pretrained('nlpconnect/vit-gpt2-image-captioning') generator = pipeline('text-generation', model='EleutherAI/gpt-neo-2.7B') device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') model = model.to(device) max_length = 16 num_beams = 4 gen_kwargs = {"max_length": max_length, "num_beams": num_beams} stop_words = set(stopwords.words('english')) def remove_stop_words(word_list): return [word for word in word_list if word not in stop_words] def predict_step(image_files, model, feature_extractor, tokenizer, device, gen_kwargs): images = [] for image_file in image_files: if image_file is not None: image = Image.open(image_file.name) if image.mode != "RGB": image = image.convert(mode="RGB") images.append(image) if not images: return None inputs = feature_extractor(images=images, return_tensors="pt") inputs.to(device) output_ids = model.generate(inputs["pixel_values"], **gen_kwargs) preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True) preds = [pred.strip() for pred in preds] return preds def process_image(image): preds = predict_step([image], model, feature_extractor, tokenizer, device, gen_kwargs) preds = preds[0].split('-') preds = [x for x in preds if not any(c.isdigit() for c in x)] preds = list(filter(None, preds)) preds = list(dict.fromkeys(preds)) preds = remove_stop_words(preds) preds_str = ', '.join(preds) prompt = f"You are a knowledgeable assistant that provides nutritional advice based on a list of ingredients. The identified ingredients are: {preds_str}. Note that some ingredients may not make sense, so use the ones that do. Can you provide a nutritional analysis and suggestions for improvement?" suggestions = generator(prompt, do_sample=True, min_length=200) suggestions = suggestions[0]['generated_text'][len(prompt):] return preds, suggestions iface = gr.Interface(fn=process_image, inputs="image", outputs=["text", "text"]) iface.launch()