Vintern-1B-v3_5-Demo

Running on Zero

App Files Community

baohuynhbk14 committed 21 days ago

Commit

4d0481d

1 Parent(s): 46c8f02

Refactor message handling in conversation and prediction functions to improve clarity and functionality

Browse files

Files changed (3) hide show

app.py +26 -14
conversation.py +15 -6
models.py +2 -9

app.py CHANGED Viewed

@@ -15,7 +15,7 @@ from filelock import FileLock
 from io import BytesIO
 from PIL import Image, ImageDraw, ImageFont
 from models import load_image
-from constants import LOGDIR
 from utils import (
     build_logger,
     server_error_msg,
@@ -164,6 +164,10 @@ def add_text(state, message, system_prompt, request: gr.Request):
     if len(images) > 0 and len(state.get_images(source=state.USER)) > 0:
         state = init_state(state)
     state.set_system_message(system_prompt)
     state.append_message(Conversation.USER, text, images)
     state.skip_next = False
@@ -183,19 +187,29 @@ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, us
 @spaces.GPU
 def predict(message,
             image_path,
-            history,
             max_input_tiles=6,
             temperature=1.0,
             max_output_tokens=700,
             top_p=0.7,
             repetition_penalty=2.5):
-        pixel_values = load_image(image_path, max_num=max_input_tiles).to(torch.bfloat16).cuda()
         generation_config = dict(temperature=temperature, max_new_tokens= max_output_tokens, top_p=top_p, do_sample=False, num_beams = 3, repetition_penalty=repetition_penalty)
-        if pixel_values is not None:
-            question = '<image>\n'+message
-        else:
-            question = message
-        print(f"FULL predict question: {question}")
         response, conv_history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
         return response, conv_history
@@ -246,21 +260,19 @@ def http_bot(
     try:
         # Stream output
-        message = state.get_last_user_message(source=state.USER)
         logger.info(f"==== User message ====\n{message}")
         logger.info(f"==== Image paths ====\n{all_image_paths}")
-        history = state.get_prompt()
-        logger.info(f"==== History ====\n{history}")
-        response, conv_history = predict(message,
                                          all_image_paths[0],
-                                         history,
                                          max_input_tiles,
                                          temperature,
                                          max_new_tokens,
                                          top_p,
                                          repetition_penalty)
-        logger.info(f"==== AI history ====\n{conv_history}")
         # response = "This is a test response"

 from io import BytesIO
 from PIL import Image, ImageDraw, ImageFont
 from models import load_image
+from constants import LOGDIR, DEFAULT_IMAGE_TOKEN
 from utils import (
     build_logger,
     server_error_msg,
     if len(images) > 0 and len(state.get_images(source=state.USER)) > 0:
         state = init_state(state)
+    if len(images) > 0 and len(state.get_images(source=state.USER)) == 0:
+        text = DEFAULT_IMAGE_TOKEN + "\n" + text
     state.set_system_message(system_prompt)
     state.append_message(Conversation.USER, text, images)
     state.skip_next = False
 @spaces.GPU
 def predict(message,
             image_path,
+            state,
             max_input_tiles=6,
             temperature=1.0,
             max_output_tokens=700,
             top_p=0.7,
             repetition_penalty=2.5):
+        history = state.get_prompt()
+        logger.info(f"==== History ====\n{history}")
         generation_config = dict(temperature=temperature, max_new_tokens= max_output_tokens, top_p=top_p, do_sample=False, num_beams = 3, repetition_penalty=repetition_penalty)
+        question = message
+        pixel_values = None
+        if image_path is not None:
+            pixel_values = load_image(image_path, max_num=max_input_tiles).to(torch.bfloat16).cuda()
+            if pixel_values is not None:
+               # Check the first user message to see if it is an image
+                index, first_user_message = state.get_user_message(source=state.USER, position='first')
+                if first_user_message is not None and \
+                    DEFAULT_IMAGE_TOKEN not in first_user_message:
+                    state.messages[index]['content'] = DEFAULT_IMAGE_TOKEN + "\n" + first_user_message
         response, conv_history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
         return response, conv_history
     try:
         # Stream output
+        message = state.get_user_message(source=state.USER, position='last')
         logger.info(f"==== User message ====\n{message}")
         logger.info(f"==== Image paths ====\n{all_image_paths}")
+        response, _ = predict(message,
                                          all_image_paths[0],
+                                         state,
                                          max_input_tiles,
                                          temperature,
                                          max_new_tokens,
                                          top_p,
                                          repetition_penalty)
+        # logger.info(f"==== AI history ====\n{conv_history}")
         # response = "This is a test response"

conversation.py CHANGED Viewed

@@ -174,14 +174,23 @@ class Conversation:
         return images
-    def get_last_user_message(self, source: Union[str, None] = None):
         assert len(self.messages) > 0, "No message in the conversation."
         assert source in [self.USER, self.ASSISTANT, None], f"Invalid source: {source}"
-        for i in range(len(self.messages) - 1, -1, -1):
-            if source and self.messages[i]["role"] != source:
-                continue
-            if self.messages[i]["role"] == self.USER:
-                return self.messages[i]["content"]
     def to_gradio_chatbot(self):
         ret = []

         return images
+    def get_user_message(self, source: Union[str, None] = None, position="first"):
         assert len(self.messages) > 0, "No message in the conversation."
         assert source in [self.USER, self.ASSISTANT, None], f"Invalid source: {source}"
+        if position == "first":
+            for i, msg in enumerate(self.messages):
+                if source and msg["role"] != source:
+                    continue
+                if msg["role"] == self.USER:
+                    return i, msg["content"]
+        elif position == "last":
+            for i in range(len(self.messages) - 1, -1, -1):
+                if source and self.messages[i]["role"] != source:
+                    continue
+                if self.messages[i]["role"] == self.USER:
+                    return i, self.messages[i]["content"]
     def to_gradio_chatbot(self):
         ret = []

models.py CHANGED Viewed

@@ -74,16 +74,12 @@ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbna
     return processed_images
 def correct_image_orientation(image_path):
-    # Mở ảnh
     image = Image.open(image_path)
-    # Kiểm tra dữ liệu Exif (nếu có)
     try:
         exif = image._getexif()
         if exif is not None:
             for tag, value in exif.items():
                 if ExifTags.TAGS.get(tag) == "Orientation":
-                    # Sửa hướng dựa trên Orientation
                     if value == 3:
                         image = image.rotate(180, expand=True)
                     elif value == 6:
@@ -92,7 +88,8 @@ def correct_image_orientation(image_path):
                         image = image.rotate(90, expand=True)
                     break
     except Exception as e:
-        print("Không thể xử lý Exif:", e)
     return image
@@ -100,13 +97,9 @@ def load_image(image_file, input_size=448, max_num=12):
     try:
         print("Loading image:", image_file)
         image = correct_image_orientation(image_file).convert('RGB')
-        print("Image size:", image.size)
         transform = build_transform(input_size=input_size)
-        print("Transform built.")
         images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
-        print("Number of images:", len(images))
         pixel_values = [transform(image) for image in images]
-        print("Images transformed.")
         pixel_values = torch.stack(pixel_values)
         print("Image loaded successfully.")
     except Exception as e:

     return processed_images
 def correct_image_orientation(image_path):
     image = Image.open(image_path)
     try:
         exif = image._getexif()
         if exif is not None:
             for tag, value in exif.items():
                 if ExifTags.TAGS.get(tag) == "Orientation":
                     if value == 3:
                         image = image.rotate(180, expand=True)
                     elif value == 6:
                         image = image.rotate(90, expand=True)
                     break
     except Exception as e:
+        print("Error reading exif:", e)
+        print(traceback.format_exc())
     return image
     try:
         print("Loading image:", image_file)
         image = correct_image_orientation(image_file).convert('RGB')
         transform = build_transform(input_size=input_size)
         images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
         pixel_values = [transform(image) for image in images]
         pixel_values = torch.stack(pixel_values)
         print("Image loaded successfully.")
     except Exception as e: