Spaces: phyloforfun (Running)
phyloforfun committed
Commit a145e37
1 Parent(s): 94dfdfd
req
- app.py +145 -77
- vouchervision/OCR_Florence_2.py +41 -15
- vouchervision/OCR_GPT4oMini.py +94 -0
- vouchervision/OCR_google_cloud_vision.py +63 -41
- vouchervision/VoucherVision_Config_Builder.py +6 -3
- vouchervision/general_utils.py +23 -6
- vouchervision/model_maps.py +34 -14
- vouchervision/utils_LLM.py +25 -9
- vouchervision/utils_VoucherVision.py +16 -3
- vouchervision/vouchervision_main.py +2 -2
app.py CHANGED
@@ -218,10 +218,10 @@ if 'dir_images_local_TEMP' not in st.session_state:
     st.session_state['dir_images_local_TEMP'] = False
 if 'dir_uploaded_images' not in st.session_state:
     st.session_state['dir_uploaded_images'] = os.path.join(st.session_state.dir_home,'uploads')
-    validate_dir(st.session_state['dir_uploaded_images'])
+    validate_dir(os.path.join(st.session_state.dir_home,'uploads'))
 if 'dir_uploaded_images_small' not in st.session_state:
     st.session_state['dir_uploaded_images_small'] = os.path.join(st.session_state.dir_home,'uploads_small')
-    validate_dir(st.session_state['dir_uploaded_images_small'])
+    validate_dir(os.path.join(st.session_state.dir_home,'uploads_small'))
 
 
 
@@ -264,16 +264,18 @@ def handle_image_upload_and_gallery_hf(uploaded_files):
 
     ind_small = 0
     for uploaded_file in uploaded_files:
+
         if SAFE.check_for_inappropriate_content(uploaded_file):
             clear_image_uploads()
             report_violation(uploaded_file.name, is_hf=st.session_state['is_hf'])
             st.error("Warning: You uploaded an image that violates our terms of service.")
+            return True
 
         # Determine the file type
         if uploaded_file.name.lower().endswith('.pdf'):
            # Handle PDF files
-            file_path = save_uploaded_file(st.session_state['dir_uploaded_images'], uploaded_file
+            file_path = save_uploaded_file(st.session_state['dir_uploaded_images'], uploaded_file)
            # Convert each page of the PDF to an image
            n_pages = convert_pdf_to_jpg(file_path, st.session_state['dir_uploaded_images'], dpi=200)#st.session_state.config['leafmachine']['project']['dir_images_local'])
            # Update the input list for each page image
@@ -288,27 +290,22 @@ def handle_image_upload_and_gallery_hf(uploaded_files):
                 # Optionally, create a thumbnail for the gallery
                 img = Image.open(jpg_file_path)
                 img.thumbnail((GALLERY_IMAGE_SIZE, GALLERY_IMAGE_SIZE), Image.Resampling.LANCZOS)
-
+                try:
                     file_path_small = save_uploaded_file(st.session_state['dir_uploaded_images_small'], file_name, img)
-
+                except:
                     file_path_small = save_uploaded_file_local(st.session_state['dir_uploaded_images_small'],st.session_state['dir_uploaded_images_small'], file_name, img)
                 st.session_state['input_list_small'].append(file_path_small)
 
         else:
             ind_small += 1
             # Handle JPG/JPEG files (existing process)
-
-            # file_path = os.path.join(st.session_state['dir_uploaded_images'], uploaded_file.name)
-            image = Image.open(uploaded_file)
-            file_path = os.path.join(st.session_state['dir_uploaded_images'], uploaded_file.name)
-            image.save(file_path, "JPEG")
-
+            file_path = save_uploaded_file(st.session_state['dir_uploaded_images'], uploaded_file)
             st.session_state['input_list'].append(file_path)
-
-
-
-
-
+            if ind_small < MAX_GALLERY_IMAGES +5:
+                img = Image.open(file_path)
+                img.thumbnail((GALLERY_IMAGE_SIZE, GALLERY_IMAGE_SIZE), Image.Resampling.LANCZOS)
+                file_path_small = save_uploaded_file(st.session_state['dir_uploaded_images_small'], uploaded_file, img)
+                st.session_state['input_list_small'].append(file_path_small)
 
         # After processing all files
         st.session_state.config['leafmachine']['project']['dir_images_local'] = st.session_state['dir_uploaded_images']
@@ -396,7 +393,7 @@ def content_input_images(col_left, col_right):
 
     with col_right:
         if st.session_state.is_hf:
-            handle_image_upload_and_gallery_hf(uploaded_files)
+            result = handle_image_upload_and_gallery_hf(uploaded_files)
 
         else:
             st.session_state['view_local_gallery'] = st.toggle("View Image Gallery",)
@@ -1767,12 +1764,47 @@ def content_prompt_and_llm_version():
     st.page_link(os.path.join(os.path.dirname(__file__),"pages","prompt_builder.py"), label="Prompt Builder", icon="🚧")
 
 
-    st.header('LLM Version')
-    col_llm_1, col_llm_2 = st.columns([4,2])
+    # st.header('LLM Version')
+    # col_llm_1, col_llm_2 = st.columns([4,2])
 
+    # with col_llm_1:
+    #     GUI_MODEL_LIST = ModelMaps.get_models_gui_list()
+    #     st.session_state.config['leafmachine']['LLM_version'] = st.selectbox("LLM version", GUI_MODEL_LIST, index=GUI_MODEL_LIST.index(st.session_state.config['leafmachine'].get('LLM_version', ModelMaps.MODELS_GUI_DEFAULT)))
+
+    # Determine the default family based on the default model
+    default_model = ModelMaps.MODELS_GUI_DEFAULT
+    default_family = None
+    for family, models in ModelMaps.MODEL_FAMILY.items():
+        if default_model in models:
+            default_family = family
+            break
+
+    st.header("LLM Version")
+
+    col_llm_1, col_llm_2 = st.columns([4, 2])
     with col_llm_1:
-        GUI_MODEL_LIST = ModelMaps.get_models_gui_list()
-        st.session_state.config['leafmachine']['LLM_version'] = st.selectbox("LLM version", GUI_MODEL_LIST, index=GUI_MODEL_LIST.index(st.session_state.config['leafmachine'].get('LLM_version', ModelMaps.MODELS_GUI_DEFAULT)))
+        # Step 1: Select Model Family with default family pre-selected
+        family_list = list(ModelMaps.MODEL_FAMILY.keys())
+        selected_family = st.selectbox("Select Model Family", family_list, index=family_list.index(default_family) if default_family else 0)
+
+        # Step 2: Display Models based on selected family
+        GUI_MODEL_LIST = ModelMaps.get_models_gui_list_family(selected_family)
+
+        # Ensure the selected model is part of the current family; if not, use the default of this family
+        selected_model_default = st.session_state.config['leafmachine'].get('LLM_version', default_model)
+        if selected_model_default not in GUI_MODEL_LIST:
+            selected_model_default = GUI_MODEL_LIST[0]
+
+        selected_model = st.selectbox("LLM version", GUI_MODEL_LIST, index=GUI_MODEL_LIST.index(selected_model_default))
+
+        # Update the session state with the selected model
+        st.session_state.config['leafmachine']['LLM_version'] = selected_model
+
         st.markdown("""
         Based on preliminary results, the following models perform the best. We are currently running tests of all possible OCR + LLM + Prompt combinations to create recipes for different workflows.
         - Any Mistral model e.g., `Mistral Large`
@@ -1815,25 +1847,43 @@ def content_api_check():
 
 
 
-def adjust_ocr_options_based_on_capability(capability_score):
+def adjust_ocr_options_based_on_capability(capability_score, model_name='llava'):
+    if model_name == 'llava':
+        llava_models_requirements = {
+            "liuhaotian/llava-v1.6-mistral-7b": {"full": 18, "4bit": 9},
+            "liuhaotian/llava-v1.6-34b": {"full": 70, "4bit": 25},
+            "liuhaotian/llava-v1.6-vicuna-13b": {"full": 33, "4bit": 15},
+            "liuhaotian/llava-v1.6-vicuna-7b": {"full": 20, "4bit": 10},
+        }
+        if capability_score == 'no_gpu':
+            return False
+        else:
+            capability_score_n = int(capability_score.split("_")[1].split("GB")[0])
+            supported_models = [model for model, reqs in llava_models_requirements.items()
+                                if reqs["full"] <= capability_score_n or reqs["4bit"] <= capability_score_n]
+
+            # If no models are supported, disable the LLaVA option
+            if not supported_models:
+                # Assuming the LLaVA option is the last in your list
+                return False # Indicate LLaVA is not supported
+            return True # Indicate LLaVA is supported
+    elif model_name == 'florence-2':
+        florence_models_requirements = {
+            "microsoft/Florence-2-large": {"full": 16,},
+            "microsoft/Florence-2-base": {"full": 12,},
+        }
+        if capability_score == 'no_gpu':
+            return False
+        else:
+            capability_score_n = int(capability_score.split("_")[1].split("GB")[0])
+            supported_models = [model for model, reqs in florence_models_requirements.items()
+                                if reqs["full"] <= capability_score_n]
 
+            # If no models are supported, disable the model option
+            if not supported_models:
+                # Assuming the model option is the last in your list
+                return False # Indicate model is not supported
+            return True # Indicate model is supported
 
 
@@ -1867,12 +1917,22 @@ def content_ocr_method():
 
     c1, c2 = st.columns([4,4])
 
-
+    with c2:
+        st.subheader("Local Methods")
+        st.write("Local methods are free, but require a capable GPU. ")
+        # Check if LLaVA models are supported based on capability score
+        llava_supported = adjust_ocr_options_based_on_capability(st.session_state.capability_score, model_name='llava')
+        florence_supported = adjust_ocr_options_based_on_capability(st.session_state.capability_score, model_name='florence-2')
+
+        if llava_supported:
+            st.success("LLaVA models are supported on this computer. A GPU with at least 12 GB of VRAM is available.")
+        else:
+            st.warning("LLaVA models are NOT supported on this computer. Requires a GPU with at least 12 GB of VRAM.")
+
+        if llava_supported:
+            st.success("Florence-2 models are supported on this computer. A GPU with at least 12 GB of VRAM is available.")
+        else:
+            st.warning("Florence-2 models are NOT supported on this computer. Requires a GPU with at least 12 GB of VRAM.")
 
     demo_text_h = f"Google_OCR_Handwriting:\nHERBARIUM OF MARCUS W. LYON , JR . Tracaulon sagittatum Indiana : Porter Co. incal Springs edge wet subdunal woods 1927 TX 11 Ilowers pink UNIVERSITE HERBARIUM MICH University of Michigan Herbarium 1439649 copyright reserved PERSICARIA FEB 2 6 1965 cm "
     demo_text_tr = f"trOCR:\nherbarium of marcus w. lyon jr. : : : tracaulon sagittatum indiana porter co. incal springs TX 11 Ilowers pink 1439649 copyright reserved D H U Q "
@@ -1882,7 +1942,7 @@ def content_ocr_method():
     demo_text_trh = demo_text_h + '\n' + demo_text_tr
     demo_text_trp = demo_text_p + '\n' + demo_text_tr
 
-    options = ["Google Vision Handwritten", "Google Vision Printed", "CRAFT + trOCR","LLaVA",
+    options = ["Google Vision Handwritten", "Google Vision Printed", "Florence-2", "GPT-4o-mini", "CRAFT + trOCR","LLaVA", ]
     options_llava = ["llava-v1.6-mistral-7b", "llava-v1.6-34b", "llava-v1.6-vicuna-13b", "llava-v1.6-vicuna-7b",]
     options_llava_bit = ["full", "4bit",]
     captions_llava = [
@@ -1905,7 +1965,7 @@ def content_ocr_method():
     default_index_llava_bit = 0
     with c1:
         st.subheader("API Methods (Google Vision)")
-        st.write("Using APIs for OCR allows VoucherVision to run on most computers.")
+        st.write("Using APIs for OCR allows VoucherVision to run on most computers. You can use multiple OCR engines simultaneously.")
 
         st.session_state.config['leafmachine']['project']['double_OCR'] = st.checkbox(label="Send 2 copies of the OCR to the LLM",
                                                                                       help="This can help the LLMs focus attention on the OCR and not get lost in the longer instruction text",
@@ -1934,6 +1994,7 @@ def content_ocr_method():
         "CRAFT + trOCR": 'CRAFT',
         "LLaVA": 'LLaVA',
         "Florence-2": 'Florence-2',
+        "GPT-4o-mini": "GPT-4o-mini",
     }
 
     # Map selected options to their corresponding internal representations
@@ -1943,45 +2004,52 @@ def content_ocr_method():
     st.session_state.config['leafmachine']['project']['OCR_option'] = selected_OCR_options
 
 
-    with c2:
-        st.subheader("Local Methods")
-        st.write("Local methods are free, but require a capable GPU. ")
-
+
     if 'CRAFT' in selected_OCR_options:
+        st.subheader('Options for :blue[CRAFT + trOCR]')
+        st.write("Supplement Google Vision OCR with :blue[trOCR] (handwriting OCR) using `microsoft/trocr-base-handwritten`. This option requires Google Vision API and a GPU.")
+        if 'CRAFT' in selected_OCR_options:
+            do_use_trOCR = st.checkbox("Enable :blue[trOCR]", value=True, key="Enable trOCR1",disabled=True)#,disabled=st.session_state['lacks_GPU'])
+        else:
+            do_use_trOCR = st.checkbox("Enable :blue[trOCR]", value=st.session_state.config['leafmachine']['project']['do_use_trOCR'],key="Enable trOCR2")#,disabled=st.session_state['lacks_GPU'])
+        st.session_state.config['leafmachine']['project']['do_use_trOCR'] = do_use_trOCR
+
+        if do_use_trOCR:
+            # st.session_state.config['leafmachine']['project']['trOCR_model_path'] = "microsoft/trocr-large-handwritten"
+            default_trOCR_model_path = st.session_state.config['leafmachine']['project']['trOCR_model_path']
+            user_input_trOCR_model_path = st.text_input(":blue[trOCR] Hugging Face model path. MUST be a fine-tuned version of 'microsoft/trocr-base-handwritten' or 'microsoft/trocr-large-handwritten', or a microsoft :blue[trOCR] model.", value=default_trOCR_model_path)
+            if st.session_state.config['leafmachine']['project']['trOCR_model_path'] != user_input_trOCR_model_path:
+                is_valid_mp = is_valid_huggingface_model_path(user_input_trOCR_model_path)
+                if not is_valid_mp:
+                    st.error(f"The Hugging Face model path {user_input_trOCR_model_path} is not valid. Please revise.")
+                else:
+                    st.session_state.config['leafmachine']['project']['trOCR_model_path'] = user_input_trOCR_model_path
 
 
     if "Florence-2" in selected_OCR_options:
+        st.subheader('Options for :green[Florence-2]')
         default_florence_model_path = st.session_state.config['leafmachine']['project']['florence_model_path']
-        user_input_florence_model_path = st.text_input("Florence-2 Hugging Face model path. MUST be a Florence-2 version based on 'microsoft/Florence-2-large' or similar.", value=default_florence_model_path)
 
+        st.session_state.config['leafmachine']['project']['florence_model_path'] = st.radio(
+            "Select :green[Florence-2] version.",
+            ["microsoft/Florence-2-large", "microsoft/Florence-2-base", ],
+            captions=["'large' requires at least 16GB of VRAM", "'base' requires 12GB of VRAM."])
+
+    if "GPT-4o-mini" in selected_OCR_options:
+        st.subheader('Options for :violet[GPT-4o-mini]')
+        default_resolution = st.session_state.config['leafmachine']['project']['OCR_GPT_4o_mini_resolution']
+
+        st.session_state.config['leafmachine']['project']['OCR_GPT_4o_mini_resolution'] = st.radio(
+            "Select level of detail for :violet[GPT-4o-mini] OCR. We only recommend 'high' detail in most scenarios.",
+            ["high", "low", ],
+            captions=["$0.50 per 1,000", "\$5 - \$10 per 1,000"])
 
 
     if 'LLaVA' in selected_OCR_options:
+        st.subheader('Options for :red[LLaVA]')
         OCR_option_llava = st.radio(
-            "Select the LLaVA version",
+            "Select the :red[LLaVA] version",
             options_llava,
             index=default_index_llava,
             help="",captions=captions_llava,
@@ -1989,12 +2057,13 @@ def content_ocr_method():
         st.session_state.config['leafmachine']['project']['OCR_option_llava'] = OCR_option_llava
 
         OCR_option_llava_bit = st.radio(
-            "Select the LLaVA quantization level",
+            "Select the :red[LLaVA] quantization level",
            options_llava_bit,
            index=default_index_llava_bit,
            help="",captions=captions_llava_bit,
            )
         st.session_state.config['leafmachine']['project']['OCR_option_llava_bit'] = OCR_option_llava_bit
+        st.write('---')
 
 
 
@@ -2045,7 +2114,6 @@ def show_ocr():
     # st.image(st.session_state["demo_overlay"], caption='OCR Overlay Images', output_format = "JPEG")
 
 def content_collage_overlay():
-    st.markdown("---")
     col_collage, col_overlay = st.columns([4,4])
 
 
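
The family-scoped model picker above drives its second selectbox from the family chosen in the first. The sketch below restates the pattern in isolation; the FAMILIES dict and its keys are hypothetical stand-ins for ModelMaps.MODEL_FAMILY, not VoucherVision's actual data. Note also that in the content_ocr_method hunk the Florence-2 status message re-tests llava_supported rather than florence_supported, which looks unintended.

# Minimal sketch of the two-step family -> model picker (Streamlit).
import streamlit as st

FAMILIES = {
    'OpenAI': ["GPT 4", "GPT 4o mini 2024-07-18"],
    'Google': ["Gemini 1.5 Flash"],
}
DEFAULT_MODEL = "Gemini 1.5 Flash"

# Pre-select the family that contains the default model.
default_family = next((f for f, m in FAMILIES.items() if DEFAULT_MODEL in m), None)
family_list = list(FAMILIES.keys())
selected_family = st.selectbox("Select Model Family", family_list,
                               index=family_list.index(default_family) if default_family else 0)

# Offer only that family's models; fall back when the prior pick left the family.
models = FAMILIES[selected_family]
prior = st.session_state.get('LLM_version', DEFAULT_MODEL)
st.session_state['LLM_version'] = st.selectbox(
    "LLM version", models, index=models.index(prior) if prior in models else 0)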
vouchervision/OCR_Florence_2.py CHANGED
@@ -6,12 +6,18 @@ import matplotlib.patches as patches
 from PIL import Image, ImageDraw, ImageFont
 import numpy as np
 import warnings
-from transformers import AutoProcessor, AutoModelForCausalLM, AutoTokenizer
-
+from transformers import AutoProcessor, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+try:
+    from vouchervision.utils_LLM import SystemLoadMonitor
+except:
+    from utils_LLM import SystemLoadMonitor
+
 
 warnings.filterwarnings("ignore", category=UserWarning, message="TypedStorage is deprecated")
 
 class FlorenceOCR:
+    # def __init__(self, logger, model_id='microsoft/Florence-2-base'):
     def __init__(self, logger, model_id='microsoft/Florence-2-large'):
         self.MAX_TOKENS = 1024
         self.logger = logger
@@ -25,7 +31,15 @@ class FlorenceOCR:
         # self.model_id_clean = "mistralai/Mistral-7B-v0.3"
         self.model_id_clean = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
         self.tokenizer_clean = AutoTokenizer.from_pretrained(self.model_id_clean)
-
+        # Configuring the BitsAndBytesConfig for quantization
+        quant_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            quant_method="bnb",
+        )
+        self.model_clean = AutoModelForCausalLM.from_pretrained(
+            self.model_id_clean,
+            quantization_config=quant_config,
+            low_cpu_mem_usage=True,)
 
 
     def ocr_florence(self, image, task_prompt='<OCR>', text_input=None):
@@ -54,34 +68,46 @@
             num_beams=3,
         )
         generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
-        parsed_answer_dirty = self.processor.post_process_generation(
+        parsed_answer_dict = self.processor.post_process_generation(
             generated_text,
             task=task_prompt,
             image_size=(image.width, image.height)
         )
 
+        parsed_answer_text = parsed_answer_dict[task_prompt]
+
+        # Prepare input for the second model
+        inputs_clean = self.tokenizer_clean(
+            f"Insert spaces into this text to make all the words valid. This text contains scientific names of plants, locations, habitat, coordinate words: {parsed_answer_text}",
+            return_tensors="pt"
+        )
+        inputs_clean = {key: value.to(self.model_clean.device) for key, value in inputs_clean.items()}
+
+        outputs_clean = self.model_clean.generate(**inputs_clean, max_new_tokens=self.MAX_TOKENS)
+        text_with_spaces = self.tokenizer_clean.decode(outputs_clean[0], skip_special_tokens=True)
+
+        # Extract only the LLM response from the decoded text
+        response_start = text_with_spaces.find(parsed_answer_text)
+        if response_start != -1:
+            text_with_spaces = text_with_spaces[response_start + len(parsed_answer_text):].strip()
 
-        parsed_answer = self.tokenizer_clean.decode(outputs[0], skip_special_tokens=True)
-        print(parsed_answer_dirty)
-        print(parsed_answer)
+        print(text_with_spaces)
 
         self.monitor.stop_inference_timer() # Starts tool timer too
         usage_report = self.monitor.stop_monitoring_report_usage()
 
-        return
+        return text_with_spaces, parsed_answer_text, parsed_answer_dict, usage_report
 
 
 def main():
-    img_path = '/home/brlab/Downloads/gem_2024_06_26__02-26-02/Cropped_Images/By_Class/label/1.jpg'
-
+    # img_path = '/home/brlab/Downloads/gem_2024_06_26__02-26-02/Cropped_Images/By_Class/label/1.jpg'
+    img_path = 'D:/D_Desktop/BR_1839468565_Ochnaceae_Campylospermum_reticulatum_label.jpg'
 
     image = Image.open(img_path)
 
-    ocr = FlorenceOCR(logger = None)
-
+    # ocr = FlorenceOCR(logger = None, model_id='microsoft/Florence-2-base')
+    ocr = FlorenceOCR(logger = None, model_id='microsoft/Florence-2-large')
+    results_text, results_all, results_dirty, usage_report = ocr.ocr_florence(image, task_prompt='<OCR>', text_input=None)
    print(results_text)
 
 if __name__ == '__main__':
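
For context, here is a minimal sketch of the 4-bit load added to __init__ above, assuming transformers with bitsandbytes installed and a CUDA GPU. The bnb_4bit_compute_dtype line is an optional extra that is not in the commit; conversely, the commit's quant_method="bnb" is not a documented BitsAndBytesConfig parameter (recent transformers versions set that field internally), so it is likely ignored.

# Sketch: 4-bit quantized load of the Mistral "cleanup" model.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # optional; not in the commit
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quant_config,
    low_cpu_mem_usage=True,
)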
vouchervision/OCR_GPT4oMini.py ADDED
@@ -0,0 +1,94 @@
+import os, base64, requests, yaml
+from PIL import Image
+from openai import OpenAI
+
+from general_utils import calculate_cost
+
+# PROMPT = """Please perform OCR on this scientific image and extract the printed and handwritten text verbatim. Do not explain your answer, only return the verbatim text in this JSON dictionary format: {'printed_text': '', 'handwritten_text': ''}"""
+PROMPT = """Please perform OCR on this scientific image and extract all of the words and text verbatim. Do not explain your answer, only return the verbatim text:"""
+
+class GPT4oMiniOCR:
+    def __init__(self, api_key):
+        self.api_key = api_key
+        self.path_api_cost = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'api_cost', 'api_cost.yaml')
+
+
+    def encode_image(self, image_path):
+        with open(image_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode('utf-8')
+
+    def ocr_gpt4o(self, image_path, resolution="low", max_tokens=512):
+        # Getting the base64 string
+        base64_image = self.encode_image(image_path)
+
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}"
+        }
+
+        payload = {
+            "model": "gpt-4o-mini",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": PROMPT,
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{base64_image}",
+                                "detail": resolution,
+                            }
+                        }
+                    ]
+                }
+            ],
+            "max_tokens": max_tokens
+        }
+
+        response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
+        response_json = response.json()
+
+        if "choices" in response_json:
+            parsed_answer = response_json["choices"][0]["message"]["content"]
+        else:
+            parsed_answer = None
+
+        usage_report = response_json.get('usage', {})
+        tokens_in = usage_report["prompt_tokens"]
+        tokens_out = usage_report["completion_tokens"]
+
+        total_cost = calculate_cost('GPT_4o_mini_2024_07_18', self.path_api_cost, tokens_in, tokens_out)
+        cost_in, cost_out, total_cost, rates_in, rates_out = total_cost
+
+        return parsed_answer, cost_in, cost_out, total_cost, rates_in, rates_out, tokens_in, tokens_out
+
+
+
+
+def main():
+    # img_path = '/home/brlab/Downloads/gem_2024_06_26__02-26-02/Cropped_Images/By_Class/label/1.jpg'
+    img_path = 'D:/D_Desktop/BR_1839468565_Ochnaceae_Campylospermum_reticulatum_label.jpg'
+
+    # $env:OPENAI_API_KEY="KEY"
+    API_KEY = "sk-proj-..."
+
+
+    ocr = GPT4oMiniOCR(API_KEY)
+
+    parsed_answer, cost_in, cost_out, total_cost, rates_in, rates_out, tokens_in, tokens_out = ocr.ocr_gpt4o(img_path, resolution="low", max_tokens=512)
+    print(f"Parsed Answer: {parsed_answer}")
+    print(f"Total Cost: {total_cost}")
+
+    parsed_answer, cost_in, cost_out, total_cost, rates_in, rates_out, tokens_in, tokens_out = ocr.ocr_gpt4o(img_path, resolution="high", max_tokens=512)
+    print(f"Parsed Answer: {parsed_answer}")
+    print(f"Total Cost: {total_cost}")
+
+
+
+if __name__ == '__main__':
+    main()
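
The new module assembles the Chat Completions request by hand with requests, although it already imports the openai SDK; a rough equivalent through that SDK is sketched below (the prompt is abbreviated and the image path is hypothetical). The committed main() also hardcoded a live-looking API key, redacted above; reading it from the environment, as the # $env:OPENAI_API_KEY comment suggests, avoids shipping secrets.

# Sketch: the same image-OCR request via the openai SDK the module imports
# (assumes OPENAI_API_KEY is set; "label.jpg" is a hypothetical path).
import base64
from openai import OpenAI

client = OpenAI()
with open("label.jpg", "rb") as f:
    b64 = base64.b64encode(f.read()).decode("utf-8")

resp = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Extract all text in this image verbatim:"},
            {"type": "image_url",
             "image_url": {"url": f"data:image/jpeg;base64,{b64}", "detail": "low"}},
        ],
    }],
    max_tokens=512,
)
print(resp.choices[0].message.content)
print(resp.usage.prompt_tokens, resp.usage.completion_tokens)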
vouchervision/OCR_google_cloud_vision.py CHANGED
@@ -8,6 +8,7 @@ import colorsys
 from tqdm import tqdm
 from google.oauth2 import service_account
 from OCR_Florence_2 import FlorenceOCR
+from OCR_GPT4oMini import GPT4oMiniOCR
 ### LLaVA should only be installed if the user will actually use it.
 ### It requires the most recent pytorch/Python and can mess with older systems
 
@@ -56,6 +57,11 @@ class OCREngine:
 
         self.OCR_JSON_to_file = {}
 
+        # for paid vLM OCR like GPT-vision
+        self.cost = 0.0
+        self.tokens_in = 0
+        self.tokens_out = 0
+
         self.hand_cleaned_text = None
         self.hand_organized_text = None
         self.hand_bounds = None
@@ -84,6 +90,7 @@ class OCREngine:
         self.trOCR_characters = None
         self.set_client()
         self.init_florence()
+        self.init_gpt_4o_mini()
         self.init_craft()
 
         self.multimodal_prompt = """I need you to transcribe all of the text in this image.
@@ -125,6 +132,10 @@ class OCREngine:
         if 'Florence-2' in self.OCR_option:
             self.Florence = FlorenceOCR(logger=self.logger, model_id=self.cfg['leafmachine']['project']['florence_model_path'])
 
+    def init_gpt_4o_mini(self):
+        if 'GPT-4o-mini' in self.OCR_option:
+            self.GPTmini = GPT4oMiniOCR(api_key = os.getenv('OPENAI_API_KEY'))
+
     def init_llava(self):
         if 'LLaVA' in self.OCR_option:
             from vouchervision.OCR_llava import OCRllava
@@ -701,7 +712,7 @@ class OCREngine:
 
         if 'LLaVA' in self.OCR_option: # This option does not produce an OCR helper image
             if self.json_report:
-                self.json_report.set_text(text_main=f'Working on LLaVA {self.Llava.model_path}
+                self.json_report.set_text(text_main=f'Working on LLaVA {self.Llava.model_path} OCR :construction:')
 
             image, json_output, direct_output, str_output, usage_report = self.Llava.transcribe_image(self.path, self.multimodal_prompt)
             self.logger.info(f"LLaVA Usage Report for Model {self.Llava.model_path}:\n{usage_report}")
@@ -716,7 +727,7 @@ class OCREngine:
 
         if 'Florence-2' in self.OCR_option: # This option does not produce an OCR helper image
             if self.json_report:
-                self.json_report.set_text(text_main=f'Working on Florence-2 [{self.Florence.model_id}]
+                self.json_report.set_text(text_main=f'Working on Florence-2 [{self.Florence.model_id}] OCR :construction:')
 
             self.logger.info(f"Florence-2 Usage Report for Model [{self.Florence.model_id}]")
             results_text, results_text_dirty, results, usage_report = self.Florence.ocr_florence(self.path, task_prompt='<OCR>', text_input=None)
@@ -728,6 +739,21 @@ class OCREngine:
             else:
                 self.OCR = self.OCR + f"\nFlorence-2 OCR:\n{results_text}"
 
+        if 'GPT-4o-mini' in self.OCR_option: # This option does not produce an OCR helper image
+            if self.json_report:
+                self.json_report.set_text(text_main=f'Working on GPT-4o-mini OCR :construction:')
+
+            self.logger.info(f"GPT-4o-mini Usage Report")
+            results_text, cost_in, cost_out, total_cost, rates_in, rates_out, self.tokens_in, self.tokens_out = self.GPTmini.ocr_gpt4o(self.path, resolution=self.cfg['leafmachine']['project']['OCR_GPT_4o_mini_resolution'], max_tokens=512)
+            self.cost += total_cost
+
+            self.OCR_JSON_to_file['OCR_GPT_4o_mini'] = results_text
+
+            if self.double_OCR:
+                self.OCR = self.OCR + f"\nGPT-4o-mini OCR:\n{results_text}" + f"\nGPT-4o-mini OCR:\n{results_text}"
+            else:
+                self.OCR = self.OCR + f"\nGPT-4o-mini OCR:\n{results_text}"
+
         if 'normal' in self.OCR_option or 'hand' in self.OCR_option:
             if 'normal' in self.OCR_option:
                 if self.double_OCR:
@@ -824,48 +850,44 @@ class SafetyCheck():
         else:
             self.client = vision.ImageAnnotatorClient(credentials=self.get_google_credentials())
 
+
     def get_google_credentials(self):
         creds_json_str = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
         credentials = service_account.Credentials.from_service_account_info(json.loads(creds_json_str))
         return credentials
 
     def check_for_inappropriate_content(self, file_stream):
-            print("Found NO violation")
-            return False # The image is considered safe.
-        except:
-            return False # The image is considered safe. TEMPOROARY FIX TODO
+        LEVEL = 2
+        content = file_stream.read()
+        image = vision.Image(content=content)
+        response = self.client.safe_search_detection(image=image)
+        safe = response.safe_search_annotation
+
+        likelihood_name = (
+            "UNKNOWN",
+            "VERY_UNLIKELY",
+            "UNLIKELY",
+            "POSSIBLE",
+            "LIKELY",
+            "VERY_LIKELY",
+        )
+        print("Safe search:")
+
+        print(f"    adult*: {likelihood_name[safe.adult]}")
+        print(f"    medical*: {likelihood_name[safe.medical]}")
+        print(f"    spoofed: {likelihood_name[safe.spoof]}")
+        print(f"    violence*: {likelihood_name[safe.violence]}")
+        print(f"    racy: {likelihood_name[safe.racy]}")
+
+        # Check the levels of adult, violence, racy, etc. content.
+        if (safe.adult > LEVEL or
+            safe.medical > LEVEL or
+            # safe.spoof > LEVEL or
+            safe.violence > LEVEL #or
+            # safe.racy > LEVEL
+        ):
+            print("Found violation")
+            return True # The image violates safe search guidelines.
+
+        print("Found NO violation")
+        return False # The image is considered safe.
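
The new SafeSearch check leans on the ordering of the Vision API's likelihood enum: UNKNOWN(0) < VERY_UNLIKELY(1) < UNLIKELY(2) < POSSIBLE(3) < LIKELY(4) < VERY_LIKELY(5). With LEVEL = 2, anything rated POSSIBLE or stronger for adult, medical, or violence trips the filter (spoof and racy are commented out). A compact restatement of the same logic:

# Sketch of the thresholding above; `safe` is the SafeSearchAnnotation from
# client.safe_search_detection(image=image).safe_search_annotation.
def is_flagged(safe, level: int = 2) -> bool:
    # Enum values compare numerically, so > 2 means POSSIBLE or stronger.
    return any(v > level for v in (safe.adult, safe.medical, safe.violence))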
vouchervision/VoucherVision_Config_Builder.py CHANGED
@@ -42,6 +42,7 @@ def build_VV_config(loaded_cfg=None):
     OCR_option = 'hand'
     OCR_option_llava = 'llava-v1.6-mistral-7b' # "llava-v1.6-mistral-7b", "llava-v1.6-34b", "llava-v1.6-vicuna-13b", "llava-v1.6-vicuna-7b",
     OCR_option_llava_bit = 'full' # full or 4bit
+    OCR_GPT_4o_mini_resolution = 'high'
     double_OCR = False
 
     tool_GEO = True
@@ -73,7 +74,7 @@ def build_VV_config(loaded_cfg=None):
                            prefix_removal,suffix_removal,catalog_numerical_only,LLM_version_user,batch_size,num_workers,
                            path_domain_knowledge,embeddings_database_name,use_LeafMachine2_collage_images,
                            prompt_version, do_create_OCR_helper_image, do_use_trOCR, do_use_florence, trOCR_model_path, florence_model_path, OCR_option, OCR_option_llava,
-                           OCR_option_llava_bit, double_OCR, save_cropped_annotations,
+                           OCR_option_llava_bit, OCR_GPT_4o_mini_resolution, double_OCR, save_cropped_annotations,
                            tool_GEO, tool_WFO, tool_wikipedia,
                            check_for_illegal_filenames, skip_vertical, pdf_conversion_dpi, use_domain_knowledge=False)
     else:
@@ -95,6 +96,7 @@ def build_VV_config(loaded_cfg=None):
         OCR_option = loaded_cfg['leafmachine']['project']['OCR_option']
         OCR_option_llava = loaded_cfg['leafmachine']['project']['OCR_option_llava']
         OCR_option_llava_bit = loaded_cfg['leafmachine']['project']['OCR_option_llava_bit']
+        OCR_GPT_4o_mini_resolution = loaded_cfg['leafmachine']['project']['OCR_GPT_4o_mini_resolution']
         double_OCR = loaded_cfg['leafmachine']['project']['double_OCR']
 
         tool_GEO = loaded_cfg['leafmachine']['project']['tool_GEO']
@@ -122,7 +124,7 @@ def build_VV_config(loaded_cfg=None):
                            prefix_removal,suffix_removal,catalog_numerical_only,LLM_version_user,batch_size,num_workers,
                            path_domain_knowledge,embeddings_database_name,use_LeafMachine2_collage_images,
                            prompt_version, do_create_OCR_helper_image, do_use_trOCR, do_use_florence, trOCR_model_path, florence_model_path, OCR_option, OCR_option_llava,
-                           OCR_option_llava_bit, double_OCR, save_cropped_annotations,
+                           OCR_option_llava_bit, OCR_GPT_4o_mini_resolution, double_OCR, save_cropped_annotations,
                            tool_GEO, tool_WFO, tool_wikipedia,
                            check_for_illegal_filenames, skip_vertical, pdf_conversion_dpi, use_domain_knowledge=False)
 
@@ -131,7 +133,7 @@ def assemble_config(dir_home, run_name, dir_images_local,dir_output,
                     prefix_removal,suffix_removal,catalog_numerical_only,LLM_version_user,batch_size,num_workers,
                     path_domain_knowledge,embeddings_database_name,use_LeafMachine2_collage_images,
                     prompt_version, do_create_OCR_helper_image_user, do_use_trOCR, do_use_florence, trOCR_model_path, florence_model_path, OCR_option, OCR_option_llava,
-                    OCR_option_llava_bit, double_OCR, save_cropped_annotations,
+                    OCR_option_llava_bit, OCR_GPT_4o_mini_resolution, double_OCR, save_cropped_annotations,
                    tool_GEO, tool_WFO, tool_wikipedia,
                    check_for_illegal_filenames, skip_vertical, pdf_conversion_dpi, use_domain_knowledge=False):
 
@@ -183,6 +185,7 @@ def assemble_config(dir_home, run_name, dir_images_local,dir_output,
         'OCR_option': OCR_option,
         'OCR_option_llava': OCR_option_llava,
         'OCR_option_llava_bit': OCR_option_llava_bit,
+        'OCR_GPT_4o_mini_resolution': OCR_GPT_4o_mini_resolution,
         'double_OCR': double_OCR,
         'pdf_conversion_dpi': pdf_conversion_dpi,
         'tool_GEO': tool_GEO,
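
The threading above is mechanical: the new OCR_GPT_4o_mini_resolution value travels from build_VV_config through assemble_config into the saved config dict. Illustratively, the relevant slice of the assembled config now looks like this (values are the defaults set in this commit):

# Sketch: where the new key lands inside the assembled config.
config = {
    'leafmachine': {
        'project': {
            'OCR_option': 'hand',
            'OCR_option_llava': 'llava-v1.6-mistral-7b',
            'OCR_option_llava_bit': 'full',
            'OCR_GPT_4o_mini_resolution': 'high',  # new in this commit
            'double_OCR': False,
        },
    },
}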
vouchervision/general_utils.py CHANGED
@@ -10,7 +10,11 @@ import concurrent.futures
 from time import perf_counter
 import torch
 
-from vouchervision.model_maps import ModelMaps
+try:
+    from vouchervision.model_maps import ModelMaps
+except:
+    from model_maps import ModelMaps
+
 
 '''
 TIFF --> DNG
@@ -65,12 +69,12 @@ def add_to_expense_report(dir_home, data):
 
     # If the file does not exist, write the header first
     if not file_exists:
-        writer.writerow(['run','date','api_version','total_cost', 'n_images', 'tokens_in', 'tokens_out', 'rate_in', 'rate_out', 'cost_in', 'cost_out',])
+        writer.writerow(['run','date','api_version','total_cost', 'n_images', 'tokens_in', 'tokens_out', 'rate_in', 'rate_out', 'cost_in', 'cost_out','ocr_cost','ocr_tokens_in', 'ocr_tokens_out',])
 
     # Write the data row
     writer.writerow(data)
 
-def save_token_info_as_csv(Dirs, LLM_version0, path_api_cost, total_tokens_in, total_tokens_out, n_images, dir_home, logger):
+def save_token_info_as_csv(Dirs, LLM_version0, path_api_cost, total_tokens_in, total_tokens_out, OCR_cost, OCR_tokens_in, OCR_tokens_out, n_images, dir_home, logger):
     if path_api_cost:
         LLM_version = ModelMaps.get_version_mapping_cost(LLM_version0)
 
@@ -78,16 +82,18 @@ def save_token_info_as_csv(Dirs, LLM_version0, path_api_cost, total_tokens_in, t
         csv_file_path = os.path.join(Dirs.path_cost, Dirs.run_name + '.csv')
 
         cost_in, cost_out, total_cost, rate_in, rate_out = calculate_cost(LLM_version, path_api_cost, total_tokens_in, total_tokens_out)
+
+        total_cost += OCR_cost
 
         # The data to be written to the CSV file
-        data = [Dirs.run_name, get_datetime(),LLM_version, total_cost, n_images, total_tokens_in, total_tokens_out, rate_in, rate_out, cost_in, cost_out,]
+        data = [Dirs.run_name, get_datetime(),LLM_version, total_cost, n_images, total_tokens_in, total_tokens_out, rate_in, rate_out, cost_in, cost_out,OCR_cost, OCR_tokens_in, OCR_tokens_out,]
 
         # Open the file in write mode
         with open(csv_file_path, mode='w', newline='') as file:
             writer = csv.writer(file)
 
             # Write the header
-            writer.writerow(['run','date','api_version','total_cost', 'n_images', 'tokens_in', 'tokens_out', 'rate_in', 'rate_out', 'cost_in', 'cost_out',])
+            writer.writerow(['run','date','api_version','total_cost', 'n_images', 'tokens_in', 'tokens_out', 'rate_in', 'rate_out', 'cost_in', 'cost_out','ocr_cost','ocr_tokens_in', 'ocr_tokens_out'])
 
             # Write the data
             writer.writerow(data)
@@ -119,6 +125,11 @@ def summarize_expense_report(path_expense_report):
     cost_in_sum = 0
     cost_out_sum = 0
     n_images_sum = 0
+    # ,'ocr_cost','ocr_tokens_in', 'ocr_tokens_out'
+    ocr_cost_sum = 0
+    ocr_tokens_in_sum = 0
+    ocr_tokens_out_sum = 0
+
     api_version_counts = Counter()
 
     # Try to read the CSV file into a DataFrame
@@ -128,7 +139,7 @@ def summarize_expense_report(path_expense_report):
         # Process each row in the DataFrame
         for index, row in df.iterrows():
             run_count += 1
-            total_cost_sum += row['total_cost']
+            total_cost_sum += row['total_cost'] + row['ocr_cost']
             tokens_in_sum += row['tokens_in']
             tokens_out_sum += row['tokens_out']
             rate_in_sum += row['rate_in']
@@ -136,6 +147,9 @@ def summarize_expense_report(path_expense_report):
             cost_in_sum += row['cost_in']
             cost_out_sum += row['cost_out']
             n_images_sum += row['n_images']
+            ocr_cost_sum += row['ocr_cost']
+            ocr_tokens_in_sum += row['ocr_tokens_in']
+            ocr_tokens_out_sum += row['ocr_tokens_out']
             api_version_counts[row['api_version']] += 1
 
     except FileNotFoundError:
@@ -163,6 +177,9 @@ def summarize_expense_report(path_expense_report):
         'rate_out_sum': rate_out_sum,
         'cost_in_sum': cost_in_sum,
         'cost_out_sum': cost_out_sum,
+        'ocr_cost_sum': ocr_cost_sum,
+        'ocr_tokens_in_sum': ocr_tokens_in_sum,
+        'ocr_tokens_out_sum': ocr_tokens_out_sum,
         'n_images_sum':n_images_sum,
         'api_version_percentages': api_version_percentages,
         'cost_per_image': cost_per_image_dict
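
The expense-report CSV gains three columns. One caveat worth noting: save_token_info_as_csv folds OCR_cost into total_cost before writing the row, while summarize_expense_report adds row['ocr_cost'] to total_cost_sum again, so the OCR cost appears to be counted twice in the summary. An illustrative row under the new schema (values invented):

# Sketch: one row of the expense report after this commit (illustrative values).
row = ['demo_run', '2024-07-20', 'GPT_4o_mini_2024_07_18',
       0.0123,              # total_cost (already includes ocr_cost when written)
       10,                  # n_images
       4200, 900,           # tokens_in, tokens_out
       0.15, 0.60,          # rate_in, rate_out
       0.0006, 0.0005,      # cost_in, cost_out
       0.0020, 8000, 500]   # ocr_cost, ocr_tokens_in, ocr_tokens_out (new)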
vouchervision/model_maps.py CHANGED
@@ -40,23 +40,27 @@ class ModelMaps:
         'phyloforfun/mistral-7b-instruct-v2-bnb-4bit__HLT_MICH_Angiospermae_SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05': '#bababa', # Gray
     }
 
-    MODELS_OPENAI = [
+    MODELS_OPENAI = [
+        "GPT 4o 2024-05-13", #GPT_4o_2024_05_13
+        "GPT 4o mini 2024-07-18",
+        "GPT 4 Turbo 2024-04-09",#GPT_4_TURBO_2024_04_09
+        "GPT 4",
+        "GPT 4 32k",
+        "GPT 4 Turbo 0125-preview",
+        "GPT 4 Turbo 1106-preview",
+        "GPT 3.5 Turbo",
+        "GPT 3.5 Instruct",
+    ]
+
+
+    MODELS_OPENAI_AZURE = [
+        "Azure GPT 4",
         # "Azure GPT 4 32k",
         # "Azure GPT 4 Turbo 0125-preview",
         # "Azure GPT 4 Turbo 1106-preview",
         # "Azure GPT 3.5 Turbo",
         # "Azure GPT 3.5 Instruct",
+    ]
 
     MODELS_GOOGLE = [
         # "PaLM 2 text-bison@001",

@@ -79,7 +83,14 @@ class ModelMaps:
         "LOCAL CPU Mistral 7B Instruct v0.2 GGUF",
         'phyloforfun/mistral-7b-instruct-v2-bnb-4bit__HLT_MICH_Angiospermae_SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05']
 
-    MODELS_GUI_DEFAULT = "Azure GPT 4" # "GPT 4 Turbo 1106-preview"
+    MODELS_GUI_DEFAULT = "Gemini 1.5 Flash" #"Azure GPT 4" # "GPT 4 Turbo 1106-preview"
+
+    MODEL_FAMILY = {
+        'OpenAI': MODELS_OPENAI,
+        'OpenAI Azure': MODELS_OPENAI_AZURE,
+        'Google': MODELS_GOOGLE,
+        'Mistral': MODELS_MISTRAL,
+        'Local': MODELS_LOCAL}
 
     version_mapping_cost = {
         'GPT 4 32k': 'GPT_4_32K',

@@ -316,7 +327,16 @@ class ModelMaps:
 
     @classmethod
     def get_models_gui_list(cls):
-        return cls.MODELS_LOCAL + cls.MODELS_GOOGLE + cls.MODELS_OPENAI + cls.MODELS_MISTRAL
+        return cls.MODELS_LOCAL + cls.MODELS_GOOGLE + cls.MODELS_OPENAI + cls.MODELS_OPENAI_AZURE + cls.MODELS_MISTRAL
+
+    @classmethod
+    def get_models_gui_list_family(cls, family=None):
+        if family and family in cls.MODEL_FAMILY:
+            return cls.MODEL_FAMILY[family]
+        all_models = []
+        for family_models in cls.MODEL_FAMILY.values():
+            all_models.extend(family_models)
+        return all_models
 
     @classmethod
     def get_version_mapping_cost(cls, key):
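The new MODEL_FAMILY mapping lets the GUI filter models by provider instead of always flattening every list. A short usage sketch (the Streamlit widgets are illustrative; only ModelMaps and its two accessors are defined by this commit):

    import streamlit as st
    from vouchervision.model_maps import ModelMaps

    # Pick a provider family first, then only show that family's models.
    family = st.selectbox("Model family", list(ModelMaps.MODEL_FAMILY.keys()))
    model = st.selectbox("Model", ModelMaps.get_models_gui_list_family(family))

    # With no argument (or an unknown family) the accessor falls back to
    # every family's models, the same set get_models_gui_list() returns.
    all_models = ModelMaps.get_models_gui_list_family()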
vouchervision/utils_LLM.py
CHANGED
@@ -8,11 +8,16 @@ import psutil
 import threading
 import torch
 from datetime import datetime
-from vouchervision.tool_taxonomy_WFO import validate_taxonomy_WFO, WFONameMatcher
-from vouchervision.tool_geolocate_HERE import validate_coordinates_here
-from vouchervision.tool_wikipedia import validate_wikipedia
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
+try:
+    from vouchervision.tool_taxonomy_WFO import validate_taxonomy_WFO, WFONameMatcher
+    from vouchervision.tool_geolocate_HERE import validate_coordinates_here
+    from vouchervision.tool_wikipedia import validate_wikipedia
+except:
+    from tool_taxonomy_WFO import validate_taxonomy_WFO, WFONameMatcher
+    from tool_geolocate_HERE import validate_coordinates_here
+    from tool_wikipedia import validate_wikipedia
 
 def run_tools(output, tool_WFO, tool_GEO, tool_wikipedia, json_file_path_wiki):
     # Define a function that will catch and return the results of your functions

@@ -179,15 +184,26 @@ class SystemLoadMonitor():
 
         }
 
-        self.logger.info(f"Inference Time: {round(self.inference_time,2)} seconds")
-        self.logger.info(f"Tool Time: {round(tool_time,2)} seconds")
-        self.logger.info(f"Max CPU Usage: {round(self.gpu_usage['max_cpu_usage'],2)}%")
-        self.logger.info(f"Max RAM Usage: {round(self.gpu_usage['max_ram_usage'],2)}GB")
+        if self.logger:
+            self.logger.info(f"Inference Time: {round(self.inference_time,2)} seconds")
+            self.logger.info(f"Tool Time: {round(tool_time,2)} seconds")
+            self.logger.info(f"Max CPU Usage: {round(self.gpu_usage['max_cpu_usage'],2)}%")
+            self.logger.info(f"Max RAM Usage: {round(self.gpu_usage['max_ram_usage'],2)}GB")
+        else:
+            print(f"Inference Time: {round(self.inference_time,2)} seconds")
+            print(f"Tool Time: {round(tool_time,2)} seconds")
+            print(f"Max CPU Usage: {round(self.gpu_usage['max_cpu_usage'],2)}%")
+            print(f"Max RAM Usage: {round(self.gpu_usage['max_ram_usage'],2)}GB")
+
         if self.has_GPU:
             report.update({'max_gpu_load': str(round(self.gpu_usage['max_load'] * 100, 2))})
             report.update({'max_gpu_vram_gb': str(round(self.gpu_usage['max_vram_usage'], 2))})
-            self.logger.info(f"Max GPU Load: {round(self.gpu_usage['max_load'] * 100, 2)}%")
-            self.logger.info(f"Max GPU Memory Usage: {round(self.gpu_usage['max_vram_usage'], 2)}GB")
+            if self.logger:
+                self.logger.info(f"Max GPU Load: {round(self.gpu_usage['max_load'] * 100, 2)}%")
+                self.logger.info(f"Max GPU Memory Usage: {round(self.gpu_usage['max_vram_usage'], 2)}GB")
+            else:
+                print(f"Max GPU Load: {round(self.gpu_usage['max_load'] * 100, 2)}%")
+                print(f"Max GPU Memory Usage: {round(self.gpu_usage['max_vram_usage'], 2)}GB")
         else:
             report.update({'max_gpu_load': '0'})
             report.update({'max_gpu_vram_gb': '0'})
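Two patterns here are worth noting. The try/except import keeps the module usable both as part of the installed vouchervision package and when run directly from inside the package directory; the bare except works, but catching ImportError is the narrower idiom, e.g.:

    # Sketch of the dual-import fallback used above, scoped to ImportError.
    try:
        from vouchervision.tool_wikipedia import validate_wikipedia
    except ImportError:
        from tool_wikipedia import validate_wikipedia

The logger guard likewise lets SystemLoadMonitor run without a configured logger by falling back to print().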
vouchervision/utils_VoucherVision.py
CHANGED
@@ -43,6 +43,10 @@ class VoucherVision():
         self.prompt_version = None
         self.is_hf = is_hf
 
+        self.OCR_cost = 0.0
+        self.OCR_tokens_in = 0
+        self.OCR_tokens_out = 0
+
         ### config_vals_for_permutation allows you to set the starting temp, top_k, top_p, seed....
         self.config_vals_for_permutation = config_vals_for_permutation
 
@@ -649,11 +653,19 @@ class VoucherVision():
     def perform_OCR_and_save_results(self, image_index, json_report, jpg_file_path_OCR_helper, txt_file_path_OCR, txt_file_path_OCR_bounds):
         self.logger.info(f'Working on {image_index + 1}/{len(self.img_paths)} --- Starting OCR')
         # self.OCR - None
+        self.OCR_cost = 0.0
+        self.OCR_tokens_in = 0
+        self.OCR_tokens_out = 0
 
         ### Process_image() runs the OCR for text, handwriting, trOCR AND creates the overlay image
         ocr_google = OCREngine(self.logger, json_report, self.dir_home, self.is_hf, self.path_to_crop, self.cfg, self.trOCR_model_version, self.trOCR_model, self.trOCR_processor, self.device)
         ocr_google.process_image(self.do_create_OCR_helper_image, self.logger)
         self.OCR = ocr_google.OCR
+
+        self.OCR_cost = ocr_google.cost
+        self.OCR_tokens_in = ocr_google.tokens_in
+        self.OCR_tokens_out = ocr_google.tokens_out
+
         self.logger.info(f"Complete OCR text for LLM prompt:\n\n{self.OCR}\n\n")
 
         self.write_json_to_file(txt_file_path_OCR, ocr_google.OCR_JSON_to_file)
 
@@ -774,7 +786,8 @@ class VoucherVision():
 
         self.update_progress_report_final(progress_report)
         final_JSON_response = self.parse_final_json_response(final_JSON_response)
-        return final_JSON_response, final_WFO_record, final_GEO_record, self.total_tokens_in, self.total_tokens_out
+
+        return final_JSON_response, final_WFO_record, final_GEO_record, self.total_tokens_in, self.total_tokens_out, self.OCR_cost, self.OCR_tokens_in, self.OCR_tokens_out
 
 
 ##################################################################################################################################
 
@@ -905,9 +918,9 @@ class VoucherVision():
             if is_real_run:
                 progress_report.update_overall(f"Transcribing Labels")
 
-                final_json_response, final_WFO_record, final_GEO_record, total_tokens_in, total_tokens_out = self.send_to_LLM(self.is_azure, progress_report, json_report, self.model_name)
+                final_json_response, final_WFO_record, final_GEO_record, total_tokens_in, total_tokens_out, OCR_cost, OCR_tokens_in, OCR_tokens_out = self.send_to_LLM(self.is_azure, progress_report, json_report, self.model_name)
 
-                return final_json_response, final_WFO_record, final_GEO_record, total_tokens_in, total_tokens_out
+                return final_json_response, final_WFO_record, final_GEO_record, total_tokens_in, total_tokens_out, OCR_cost, OCR_tokens_in, OCR_tokens_out
 
         except Exception as e:
             self.logger.error(f"LLM call failed in process_specimen_batch: {e}")
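The per-image reset followed by the copy from ocr_google assumes every OCREngine code path sets cost, tokens_in, and tokens_out after process_image(). A defensive variant, if an engine might skip the accounting attributes (the getattr defaults are an assumption, not part of this commit):

    # Fall back to zero-cost accounting if an OCR engine variant does not
    # populate cost/tokens_in/tokens_out after process_image().
    self.OCR_cost = getattr(ocr_google, 'cost', 0.0)
    self.OCR_tokens_in = getattr(ocr_google, 'tokens_in', 0)
    self.OCR_tokens_out = getattr(ocr_google, 'tokens_out', 0)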
vouchervision/vouchervision_main.py
CHANGED
@@ -65,9 +65,9 @@ def voucher_vision(cfg_file_path, dir_home, path_custom_prompts, cfg_test, progr
     # Process labels
     Voucher_Vision = VoucherVision(cfg, logger, dir_home, path_custom_prompts, Project, Dirs, is_hf)
     n_images = len(Voucher_Vision.img_paths)
-    last_JSON_response, final_WFO_record, final_GEO_record, total_tokens_in, total_tokens_out = Voucher_Vision.process_specimen_batch(progress_report, json_report, is_real_run)
+    last_JSON_response, final_WFO_record, final_GEO_record, total_tokens_in, total_tokens_out, OCR_cost, OCR_tokens_in, OCR_tokens_out = Voucher_Vision.process_specimen_batch(progress_report, json_report, is_real_run)
 
-    total_cost = save_token_info_as_csv(Dirs, cfg['leafmachine']['LLM_version'], path_api_cost, total_tokens_in, total_tokens_out, n_images, dir_home, logger)
+    total_cost = save_token_info_as_csv(Dirs, cfg['leafmachine']['LLM_version'], path_api_cost, total_tokens_in, total_tokens_out, OCR_cost, OCR_tokens_in, OCR_tokens_out, n_images, dir_home, logger)
 
     t_overall_s = perf_counter()
     logger.name = 'Run Complete! :)'
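Note that process_specimen_batch() now returns eight values instead of five, and save_token_info_as_csv() gains three positional parameters between total_tokens_out and n_images, so every caller must be updated in lockstep with this commit. A multi-line unpack keeps the widened tuple readable at other call sites (sketch; names follow the diff):

    (last_JSON_response, final_WFO_record, final_GEO_record,
     total_tokens_in, total_tokens_out,
     OCR_cost, OCR_tokens_in, OCR_tokens_out) = Voucher_Vision.process_specimen_batch(
        progress_report, json_report, is_real_run)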