phyloforfun committed on
Commit
a145e37
1 Parent(s): 94dfdfd
app.py CHANGED
@@ -218,10 +218,10 @@ if 'dir_images_local_TEMP' not in st.session_state:
218
  st.session_state['dir_images_local_TEMP'] = False
219
  if 'dir_uploaded_images' not in st.session_state:
220
  st.session_state['dir_uploaded_images'] = os.path.join(st.session_state.dir_home,'uploads')
221
- validate_dir(st.session_state['dir_uploaded_images'])
222
  if 'dir_uploaded_images_small' not in st.session_state:
223
  st.session_state['dir_uploaded_images_small'] = os.path.join(st.session_state.dir_home,'uploads_small')
224
- validate_dir(st.session_state['dir_uploaded_images_small'])
225
 
226
 
227
 
@@ -264,16 +264,18 @@ def handle_image_upload_and_gallery_hf(uploaded_files):
264
 
265
  ind_small = 0
266
  for uploaded_file in uploaded_files:
 
267
  if SAFE.check_for_inappropriate_content(uploaded_file):
268
  clear_image_uploads()
269
  report_violation(uploaded_file.name, is_hf=st.session_state['is_hf'])
270
  st.error("Warning: You uploaded an image that violates our terms of service.")
 
271
 
272
 
273
  # Determine the file type
274
  if uploaded_file.name.lower().endswith('.pdf'):
275
  # Handle PDF files
276
- file_path = save_uploaded_file(st.session_state['dir_uploaded_images'], uploaded_file, image=None)
277
  # Convert each page of the PDF to an image
278
  n_pages = convert_pdf_to_jpg(file_path, st.session_state['dir_uploaded_images'], dpi=200)#st.session_state.config['leafmachine']['project']['dir_images_local'])
279
  # Update the input list for each page image
@@ -288,27 +290,22 @@ def handle_image_upload_and_gallery_hf(uploaded_files):
288
  # Optionally, create a thumbnail for the gallery
289
  img = Image.open(jpg_file_path)
290
  img.thumbnail((GALLERY_IMAGE_SIZE, GALLERY_IMAGE_SIZE), Image.Resampling.LANCZOS)
291
- if st.session_state['is_hf']:
292
  file_path_small = save_uploaded_file(st.session_state['dir_uploaded_images_small'], file_name, img)
293
- else:
294
  file_path_small = save_uploaded_file_local(st.session_state['dir_uploaded_images_small'],st.session_state['dir_uploaded_images_small'], file_name, img)
295
  st.session_state['input_list_small'].append(file_path_small)
296
 
297
  else:
298
  ind_small += 1
299
  # Handle JPG/JPEG files (existing process)
300
- # file_path = save_uploaded_file(st.session_state['dir_uploaded_images'], uploaded_file, image=None) ######### Yale TODO
301
- # file_path = os.path.join(st.session_state['dir_uploaded_images'], uploaded_file.name)
302
- image = Image.open(uploaded_file)
303
- file_path = os.path.join(st.session_state['dir_uploaded_images'], uploaded_file.name)
304
- image.save(file_path, "JPEG")
305
-
306
  st.session_state['input_list'].append(file_path)
307
- # if ind_small < MAX_GALLERY_IMAGES +5:
308
- # img = Image.open(file_path)
309
- # img.thumbnail((GALLERY_IMAGE_SIZE, GALLERY_IMAGE_SIZE), Image.Resampling.LANCZOS)
310
- # file_path_small = save_uploaded_file(st.session_state['dir_uploaded_images_small'], uploaded_file, img)
311
- # st.session_state['input_list_small'].append(file_path_small)
312
 
313
  # After processing all files
314
  st.session_state.config['leafmachine']['project']['dir_images_local'] = st.session_state['dir_uploaded_images']
@@ -396,7 +393,7 @@ def content_input_images(col_left, col_right):
396
 
397
  with col_right:
398
  if st.session_state.is_hf:
399
- handle_image_upload_and_gallery_hf(uploaded_files)
400
 
401
  else:
402
  st.session_state['view_local_gallery'] = st.toggle("View Image Gallery",)
@@ -1767,12 +1764,47 @@ def content_prompt_and_llm_version():
1767
  st.page_link(os.path.join(os.path.dirname(__file__),"pages","prompt_builder.py"), label="Prompt Builder", icon="🚧")
1768
 
1769
 
1770
- st.header('LLM Version')
1771
- col_llm_1, col_llm_2 = st.columns([4,2])
1772
 
 
 
1773
  with col_llm_1:
1774
- GUI_MODEL_LIST = ModelMaps.get_models_gui_list()
1775
- st.session_state.config['leafmachine']['LLM_version'] = st.selectbox("LLM version", GUI_MODEL_LIST, index=GUI_MODEL_LIST.index(st.session_state.config['leafmachine'].get('LLM_version', ModelMaps.MODELS_GUI_DEFAULT)))
 
 
1776
  st.markdown("""
1777
  Based on preliminary results, the following models perform the best. We are currently running tests of all possible OCR + LLM + Prompt combinations to create recipes for different workflows.
1778
  - Any Mistral model e.g., `Mistral Large`
@@ -1815,25 +1847,43 @@ def content_api_check():
1815
 
1816
 
1817
 
1818
- def adjust_ocr_options_based_on_capability(capability_score):
1819
- llava_models_requirements = {
1820
- "liuhaotian/llava-v1.6-mistral-7b": {"full": 18, "4bit": 9},
1821
- "liuhaotian/llava-v1.6-34b": {"full": 70, "4bit": 25},
1822
- "liuhaotian/llava-v1.6-vicuna-13b": {"full": 33, "4bit": 15},
1823
- "liuhaotian/llava-v1.6-vicuna-7b": {"full": 20, "4bit": 10},
1824
- }
1825
- if capability_score == 'no_gpu':
1826
- return False
1827
- else:
1828
- capability_score_n = int(capability_score.split("_")[1].split("GB")[0])
1829
- supported_models = [model for model, reqs in llava_models_requirements.items()
1830
- if reqs["full"] <= capability_score_n or reqs["4bit"] <= capability_score_n]
 
 
1831
 
1832
- # If no models are supported, disable the LLaVA option
1833
- if not supported_models:
1834
- # Assuming the LLaVA option is the last in your list
1835
- return False # Indicate LLaVA is not supported
1836
- return True # Indicate LLaVA is supported
1837
 
1838
 
1839
 
@@ -1867,12 +1917,22 @@ def content_ocr_method():
1867
 
1868
  c1, c2 = st.columns([4,4])
1869
 
1870
- # Check if LLaVA models are supported based on capability score
1871
- llava_supported = adjust_ocr_options_based_on_capability(st.session_state.capability_score)
1872
- if llava_supported:
1873
- st.success("LLaVA models are supported on this computer")
1874
- else:
1875
- st.warning("LLaVA models are NOT supported on this computer. Requires a GPU with at least 12 GB of VRAM.")
 
 
1876
 
1877
  demo_text_h = f"Google_OCR_Handwriting:\nHERBARIUM OF MARCUS W. LYON , JR . Tracaulon sagittatum Indiana : Porter Co. incal Springs edge wet subdunal woods 1927 TX 11 Ilowers pink UNIVERSITE HERBARIUM MICH University of Michigan Herbarium 1439649 copyright reserved PERSICARIA FEB 2 6 1965 cm "
1878
  demo_text_tr = f"trOCR:\nherbarium of marcus w. lyon jr. : : : tracaulon sagittatum indiana porter co. incal springs TX 11 Ilowers pink 1439649 copyright reserved D H U Q "
@@ -1882,7 +1942,7 @@ def content_ocr_method():
1882
  demo_text_trh = demo_text_h + '\n' + demo_text_tr
1883
  demo_text_trp = demo_text_p + '\n' + demo_text_tr
1884
 
1885
- options = ["Google Vision Handwritten", "Google Vision Printed", "CRAFT + trOCR","LLaVA", "Florence-2"]
1886
  options_llava = ["llava-v1.6-mistral-7b", "llava-v1.6-34b", "llava-v1.6-vicuna-13b", "llava-v1.6-vicuna-7b",]
1887
  options_llava_bit = ["full", "4bit",]
1888
  captions_llava = [
@@ -1905,7 +1965,7 @@ def content_ocr_method():
1905
  default_index_llava_bit = 0
1906
  with c1:
1907
  st.subheader("API Methods (Google Vision)")
1908
- st.write("Using APIs for OCR allows VoucherVision to run on most computers.")
1909
 
1910
  st.session_state.config['leafmachine']['project']['double_OCR'] = st.checkbox(label="Send 2 copies of the OCR to the LLM",
1911
  help="This can help the LLMs focus attention on the OCR and not get lost in the longer instruction text",
@@ -1934,6 +1994,7 @@ def content_ocr_method():
1934
  "CRAFT + trOCR": 'CRAFT',
1935
  "LLaVA": 'LLaVA',
1936
  "Florence-2": 'Florence-2',
 
1937
  }
1938
 
1939
  # Map selected options to their corresponding internal representations
@@ -1943,45 +2004,52 @@ def content_ocr_method():
1943
  st.session_state.config['leafmachine']['project']['OCR_option'] = selected_OCR_options
1944
 
1945
 
1946
- with c2:
1947
- st.subheader("Local Methods")
1948
- st.write("Local methods are free, but require a capable GPU. ")
1949
-
1950
 
1951
- st.write("Supplement Google Vision OCR with trOCR (handwriting OCR) using `microsoft/trocr-base-handwritten`. This option requires Google Vision API and a GPU.")
1952
  if 'CRAFT' in selected_OCR_options:
1953
- do_use_trOCR = st.checkbox("Enable trOCR", value=True, key="Enable trOCR1",disabled=True)#,disabled=st.session_state['lacks_GPU'])
1954
- else:
1955
- do_use_trOCR = st.checkbox("Enable trOCR", value=st.session_state.config['leafmachine']['project']['do_use_trOCR'],key="Enable trOCR2")#,disabled=st.session_state['lacks_GPU'])
1956
- st.session_state.config['leafmachine']['project']['do_use_trOCR'] = do_use_trOCR
1957
-
1958
- if do_use_trOCR:
1959
- # st.session_state.config['leafmachine']['project']['trOCR_model_path'] = "microsoft/trocr-large-handwritten"
1960
- default_trOCR_model_path = st.session_state.config['leafmachine']['project']['trOCR_model_path']
1961
- user_input_trOCR_model_path = st.text_input("trOCR Hugging Face model path. MUST be a fine-tuned version of 'microsoft/trocr-base-handwritten' or 'microsoft/trocr-large-handwritten', or a microsoft trOCR model.", value=default_trOCR_model_path)
1962
- if st.session_state.config['leafmachine']['project']['trOCR_model_path'] != user_input_trOCR_model_path:
1963
- is_valid_mp = is_valid_huggingface_model_path(user_input_trOCR_model_path)
1964
- if not is_valid_mp:
1965
- st.error(f"The Hugging Face model path {user_input_trOCR_model_path} is not valid. Please revise.")
1966
- else:
1967
- st.session_state.config['leafmachine']['project']['trOCR_model_path'] = user_input_trOCR_model_path
 
 
 
1968
 
1969
 
1970
  if "Florence-2" in selected_OCR_options:
 
1971
  default_florence_model_path = st.session_state.config['leafmachine']['project']['florence_model_path']
1972
- user_input_florence_model_path = st.text_input("Florence-2 Hugging Face model path. MUST be a Florence-2 version based on 'microsoft/Florence-2-large' or similar.", value=default_florence_model_path)
1973
 
1974
- if st.session_state.config['leafmachine']['project']['florence_model_path'] != user_input_florence_model_path:
1975
- is_valid_mp = is_valid_huggingface_model_path(user_input_florence_model_path)
1976
- if not is_valid_mp:
1977
- st.error(f"The Hugging Face model path {user_input_florence_model_path} is not valid. Please revise.")
1978
- else:
1979
- st.session_state.config['leafmachine']['project']['florence_model_path'] = user_input_florence_model_path
 
 
1980
 
1981
 
1982
  if 'LLaVA' in selected_OCR_options:
 
1983
  OCR_option_llava = st.radio(
1984
- "Select the LLaVA version",
1985
  options_llava,
1986
  index=default_index_llava,
1987
  help="",captions=captions_llava,
@@ -1989,12 +2057,13 @@ def content_ocr_method():
1989
  st.session_state.config['leafmachine']['project']['OCR_option_llava'] = OCR_option_llava
1990
 
1991
  OCR_option_llava_bit = st.radio(
1992
- "Select the LLaVA quantization level",
1993
  options_llava_bit,
1994
  index=default_index_llava_bit,
1995
  help="",captions=captions_llava_bit,
1996
  )
1997
  st.session_state.config['leafmachine']['project']['OCR_option_llava_bit'] = OCR_option_llava_bit
 
1998
 
1999
 
2000
 
@@ -2045,7 +2114,6 @@ def show_ocr():
2045
  # st.image(st.session_state["demo_overlay"], caption='OCR Overlay Images', output_format = "JPEG")
2046
 
2047
  def content_collage_overlay():
2048
- st.markdown("---")
2049
  col_collage, col_overlay = st.columns([4,4])
2050
 
2051
 
 
218
  st.session_state['dir_images_local_TEMP'] = False
219
  if 'dir_uploaded_images' not in st.session_state:
220
  st.session_state['dir_uploaded_images'] = os.path.join(st.session_state.dir_home,'uploads')
221
+ validate_dir(os.path.join(st.session_state.dir_home,'uploads'))
222
  if 'dir_uploaded_images_small' not in st.session_state:
223
  st.session_state['dir_uploaded_images_small'] = os.path.join(st.session_state.dir_home,'uploads_small')
224
+ validate_dir(os.path.join(st.session_state.dir_home,'uploads_small'))
225
 
226
 
227
 
 
264
 
265
  ind_small = 0
266
  for uploaded_file in uploaded_files:
267
+
268
  if SAFE.check_for_inappropriate_content(uploaded_file):
269
  clear_image_uploads()
270
  report_violation(uploaded_file.name, is_hf=st.session_state['is_hf'])
271
  st.error("Warning: You uploaded an image that violates our terms of service.")
272
+ return True
273
 
274
 
275
  # Determine the file type
276
  if uploaded_file.name.lower().endswith('.pdf'):
277
  # Handle PDF files
278
+ file_path = save_uploaded_file(st.session_state['dir_uploaded_images'], uploaded_file)
279
  # Convert each page of the PDF to an image
280
  n_pages = convert_pdf_to_jpg(file_path, st.session_state['dir_uploaded_images'], dpi=200)#st.session_state.config['leafmachine']['project']['dir_images_local'])
281
  # Update the input list for each page image
 
290
  # Optionally, create a thumbnail for the gallery
291
  img = Image.open(jpg_file_path)
292
  img.thumbnail((GALLERY_IMAGE_SIZE, GALLERY_IMAGE_SIZE), Image.Resampling.LANCZOS)
293
+ try:
294
  file_path_small = save_uploaded_file(st.session_state['dir_uploaded_images_small'], file_name, img)
295
+ except:
296
  file_path_small = save_uploaded_file_local(st.session_state['dir_uploaded_images_small'],st.session_state['dir_uploaded_images_small'], file_name, img)
297
  st.session_state['input_list_small'].append(file_path_small)
298
 
299
  else:
300
  ind_small += 1
301
  # Handle JPG/JPEG files (existing process)
302
+ file_path = save_uploaded_file(st.session_state['dir_uploaded_images'], uploaded_file)
 
 
 
 
 
303
  st.session_state['input_list'].append(file_path)
304
+ if ind_small < MAX_GALLERY_IMAGES +5:
305
+ img = Image.open(file_path)
306
+ img.thumbnail((GALLERY_IMAGE_SIZE, GALLERY_IMAGE_SIZE), Image.Resampling.LANCZOS)
307
+ file_path_small = save_uploaded_file(st.session_state['dir_uploaded_images_small'], uploaded_file, img)
308
+ st.session_state['input_list_small'].append(file_path_small)
309
 
310
  # After processing all files
311
  st.session_state.config['leafmachine']['project']['dir_images_local'] = st.session_state['dir_uploaded_images']
 
393
 
394
  with col_right:
395
  if st.session_state.is_hf:
396
+ result = handle_image_upload_and_gallery_hf(uploaded_files)
397
 
398
  else:
399
  st.session_state['view_local_gallery'] = st.toggle("View Image Gallery",)
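Note: in the is_hf branch above, handle_image_upload_and_gallery_hf() now returns True when a safety violation is found, and the flag is stored in `result` without being used yet. One way a caller could act on it (a sketch, not part of this commit; `st.stop()` is the standard Streamlit call for halting a rerun):

    if st.session_state.is_hf:
        violation_found = handle_image_upload_and_gallery_hf(uploaded_files)
        if violation_found:
            st.stop()  # halt this rerun so no further processing happens on the flagged upload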
 
1764
  st.page_link(os.path.join(os.path.dirname(__file__),"pages","prompt_builder.py"), label="Prompt Builder", icon="🚧")
1765
 
1766
 
1767
+ # st.header('LLM Version')
1768
+ # col_llm_1, col_llm_2 = st.columns([4,2])
1769
 
1770
+ # with col_llm_1:
1771
+ # GUI_MODEL_LIST = ModelMaps.get_models_gui_list()
1772
+ # st.session_state.config['leafmachine']['LLM_version'] = st.selectbox("LLM version", GUI_MODEL_LIST, index=GUI_MODEL_LIST.index(st.session_state.config['leafmachine'].get('LLM_version', ModelMaps.MODELS_GUI_DEFAULT)))
1773
+
1774
+
1775
+ # Determine the default family based on the default model
1776
+ default_model = ModelMaps.MODELS_GUI_DEFAULT
1777
+ default_family = None
1778
+ for family, models in ModelMaps.MODEL_FAMILY.items():
1779
+ if default_model in models:
1780
+ default_family = family
1781
+ break
1782
+
1783
+ st.header("LLM Version")
1784
+
1785
+ col_llm_1, col_llm_2 = st.columns([4, 2])
1786
  with col_llm_1:
1787
+ # Step 1: Select Model Family with default family pre-selected
1788
+ family_list = list(ModelMaps.MODEL_FAMILY.keys())
1789
+ selected_family = st.selectbox("Select Model Family", family_list, index=family_list.index(default_family) if default_family else 0)
1790
+
1791
+ # Step 2: Display Models based on selected family
1792
+ GUI_MODEL_LIST = ModelMaps.get_models_gui_list_family(selected_family)
1793
+
1794
+ # Ensure the selected model is part of the current family; if not, use the default of this family
1795
+ selected_model_default = st.session_state.config['leafmachine'].get('LLM_version', default_model)
1796
+ if selected_model_default not in GUI_MODEL_LIST:
1797
+ selected_model_default = GUI_MODEL_LIST[0]
1798
+
1799
+ selected_model = st.selectbox("LLM version", GUI_MODEL_LIST, index=GUI_MODEL_LIST.index(selected_model_default))
1800
+
1801
+ # Update the session state with the selected model
1802
+ st.session_state.config['leafmachine']['LLM_version'] = selected_model
1803
+
1804
+
1805
+
1806
+
1807
+
1808
  st.markdown("""
1809
  Based on preliminary results, the following models perform the best. We are currently running tests of all possible OCR + LLM + Prompt combinations to create recipes for different workflows.
1810
  - Any Mistral model e.g., `Mistral Large`
 
1847
 
1848
 
1849
 
1850
+ def adjust_ocr_options_based_on_capability(capability_score, model_name='llava'):
1851
+ if model_name == 'llava':
1852
+ llava_models_requirements = {
1853
+ "liuhaotian/llava-v1.6-mistral-7b": {"full": 18, "4bit": 9},
1854
+ "liuhaotian/llava-v1.6-34b": {"full": 70, "4bit": 25},
1855
+ "liuhaotian/llava-v1.6-vicuna-13b": {"full": 33, "4bit": 15},
1856
+ "liuhaotian/llava-v1.6-vicuna-7b": {"full": 20, "4bit": 10},
1857
+ }
1858
+ if capability_score == 'no_gpu':
1859
+ return False
1860
+ else:
1861
+ capability_score_n = int(capability_score.split("_")[1].split("GB")[0])
1862
+ supported_models = [model for model, reqs in llava_models_requirements.items()
1863
+ if reqs["full"] <= capability_score_n or reqs["4bit"] <= capability_score_n]
1864
+
1865
+ # If no models are supported, disable the LLaVA option
1866
+ if not supported_models:
1867
+ # Assuming the LLaVA option is the last in your list
1868
+ return False # Indicate LLaVA is not supported
1869
+ return True # Indicate LLaVA is supported
1870
+ elif model_name == 'florence-2':
1871
+ florence_models_requirements = {
1872
+ "microsoft/Florence-2-large": {"full": 16,},
1873
+ "microsoft/Florence-2-base": {"full": 12,},
1874
+ }
1875
+ if capability_score == 'no_gpu':
1876
+ return False
1877
+ else:
1878
+ capability_score_n = int(capability_score.split("_")[1].split("GB")[0])
1879
+ supported_models = [model for model, reqs in florence_models_requirements.items()
1880
+ if reqs["full"] <= capability_score_n]
1881
 
1882
+ # If no models are supported, disable the model option
1883
+ if not supported_models:
1884
+ # Assuming the model option is the last in your list
1885
+ return False # Indicate model is not supported
1886
+ return True # Indicate model is supported
1887
 
1888
 
1889
 
 
1917
 
1918
  c1, c2 = st.columns([4,4])
1919
 
1920
+ with c2:
1921
+ st.subheader("Local Methods")
1922
+ st.write("Local methods are free, but require a capable GPU. ")
1923
+ # Check if LLaVA models are supported based on capability score
1924
+ llava_supported = adjust_ocr_options_based_on_capability(st.session_state.capability_score, model_name='llava')
1925
+ florence_supported = adjust_ocr_options_based_on_capability(st.session_state.capability_score, model_name='florence-2')
1926
+
1927
+ if llava_supported:
1928
+ st.success("LLaVA models are supported on this computer. A GPU with at least 12 GB of VRAM is available.")
1929
+ else:
1930
+ st.warning("LLaVA models are NOT supported on this computer. Requires a GPU with at least 12 GB of VRAM.")
1931
+
1932
+ if florence_supported:
1933
+ st.success("Florence-2 models are supported on this computer. A GPU with at least 12 GB of VRAM is available.")
1934
+ else:
1935
+ st.warning("Florence-2 models are NOT supported on this computer. Requires a GPU with at least 12 GB of VRAM.")
1936
 
1937
  demo_text_h = f"Google_OCR_Handwriting:\nHERBARIUM OF MARCUS W. LYON , JR . Tracaulon sagittatum Indiana : Porter Co. incal Springs edge wet subdunal woods 1927 TX 11 Ilowers pink UNIVERSITE HERBARIUM MICH University of Michigan Herbarium 1439649 copyright reserved PERSICARIA FEB 2 6 1965 cm "
1938
  demo_text_tr = f"trOCR:\nherbarium of marcus w. lyon jr. : : : tracaulon sagittatum indiana porter co. incal springs TX 11 Ilowers pink 1439649 copyright reserved D H U Q "
 
1942
  demo_text_trh = demo_text_h + '\n' + demo_text_tr
1943
  demo_text_trp = demo_text_p + '\n' + demo_text_tr
1944
 
1945
+ options = ["Google Vision Handwritten", "Google Vision Printed", "Florence-2", "GPT-4o-mini", "CRAFT + trOCR","LLaVA", ]
1946
  options_llava = ["llava-v1.6-mistral-7b", "llava-v1.6-34b", "llava-v1.6-vicuna-13b", "llava-v1.6-vicuna-7b",]
1947
  options_llava_bit = ["full", "4bit",]
1948
  captions_llava = [
 
1965
  default_index_llava_bit = 0
1966
  with c1:
1967
  st.subheader("API Methods (Google Vision)")
1968
+ st.write("Using APIs for OCR allows VoucherVision to run on most computers. You can use multiple OCR engines simultaneously.")
1969
 
1970
  st.session_state.config['leafmachine']['project']['double_OCR'] = st.checkbox(label="Send 2 copies of the OCR to the LLM",
1971
  help="This can help the LLMs focus attention on the OCR and not get lost in the longer instruction text",
 
1994
  "CRAFT + trOCR": 'CRAFT',
1995
  "LLaVA": 'LLaVA',
1996
  "Florence-2": 'Florence-2',
1997
+ "GPT-4o-mini": "GPT-4o-mini",
1998
  }
1999
 
2000
  # Map selected options to their corresponding internal representations
 
2004
  st.session_state.config['leafmachine']['project']['OCR_option'] = selected_OCR_options
2005
 
2006
 
 
 
 
 
2007
 
2008
+
2009
  if 'CRAFT' in selected_OCR_options:
2010
+ st.subheader('Options for :blue[CRAFT + trOCR]')
2011
+ st.write("Supplement Google Vision OCR with :blue[trOCR] (handwriting OCR) using `microsoft/trocr-base-handwritten`. This option requires Google Vision API and a GPU.")
2012
+ if 'CRAFT' in selected_OCR_options:
2013
+ do_use_trOCR = st.checkbox("Enable :blue[trOCR]", value=True, key="Enable trOCR1",disabled=True)#,disabled=st.session_state['lacks_GPU'])
2014
+ else:
2015
+ do_use_trOCR = st.checkbox("Enable :blue[trOCR]", value=st.session_state.config['leafmachine']['project']['do_use_trOCR'],key="Enable trOCR2")#,disabled=st.session_state['lacks_GPU'])
2016
+ st.session_state.config['leafmachine']['project']['do_use_trOCR'] = do_use_trOCR
2017
+
2018
+ if do_use_trOCR:
2019
+ # st.session_state.config['leafmachine']['project']['trOCR_model_path'] = "microsoft/trocr-large-handwritten"
2020
+ default_trOCR_model_path = st.session_state.config['leafmachine']['project']['trOCR_model_path']
2021
+ user_input_trOCR_model_path = st.text_input(":blue[trOCR] Hugging Face model path. MUST be a fine-tuned version of 'microsoft/trocr-base-handwritten' or 'microsoft/trocr-large-handwritten', or a microsoft :blue[trOCR] model.", value=default_trOCR_model_path)
2022
+ if st.session_state.config['leafmachine']['project']['trOCR_model_path'] != user_input_trOCR_model_path:
2023
+ is_valid_mp = is_valid_huggingface_model_path(user_input_trOCR_model_path)
2024
+ if not is_valid_mp:
2025
+ st.error(f"The Hugging Face model path {user_input_trOCR_model_path} is not valid. Please revise.")
2026
+ else:
2027
+ st.session_state.config['leafmachine']['project']['trOCR_model_path'] = user_input_trOCR_model_path
2028
 
2029
 
2030
  if "Florence-2" in selected_OCR_options:
2031
+ st.subheader('Options for :green[Florence-2]')
2032
  default_florence_model_path = st.session_state.config['leafmachine']['project']['florence_model_path']
 
2033
 
2034
+ st.session_state.config['leafmachine']['project']['florence_model_path'] = st.radio(
2035
+ "Select :green[Florence-2] version.",
2036
+ ["microsoft/Florence-2-large", "microsoft/Florence-2-base", ],
2037
+ captions=["'large' requires at least 16GB of VRAM", "'base' requires 12GB of VRAM."])
2038
+
2039
+ if "GPT-4o-mini" in selected_OCR_options:
2040
+ st.subheader('Options for :violet[GPT-4o-mini]')
2041
+ default_resolution = st.session_state.config['leafmachine']['project']['OCR_GPT_4o_mini_resolution']
2042
+
2043
+ st.session_state.config['leafmachine']['project']['OCR_GPT_4o_mini_resolution'] = st.radio(
2044
+ "Select level of detail for :violet[GPT-4o-mini] OCR. We only recommend 'high' detail in most scenarios.",
2045
+ ["high", "low", ],
2046
+ captions=["$0.50 per 1,000", "\$5 - \$10 per 1,000"])
2047
 
2048
 
2049
  if 'LLaVA' in selected_OCR_options:
2050
+ st.subheader('Options for :red[LLaVA]')
2051
  OCR_option_llava = st.radio(
2052
+ "Select the :red[LLaVA] version",
2053
  options_llava,
2054
  index=default_index_llava,
2055
  help="",captions=captions_llava,
 
2057
  st.session_state.config['leafmachine']['project']['OCR_option_llava'] = OCR_option_llava
2058
 
2059
  OCR_option_llava_bit = st.radio(
2060
+ "Select the :red[LLaVA] quantization level",
2061
  options_llava_bit,
2062
  index=default_index_llava_bit,
2063
  help="",captions=captions_llava_bit,
2064
  )
2065
  st.session_state.config['leafmachine']['project']['OCR_option_llava_bit'] = OCR_option_llava_bit
2066
+ st.write('---')
2067
 
2068
 
2069
 
 
2114
  # st.image(st.session_state["demo_overlay"], caption='OCR Overlay Images', output_format = "JPEG")
2115
 
2116
  def content_collage_overlay():
 
2117
  col_collage, col_overlay = st.columns([4,4])
2118
 
2119
 
vouchervision/OCR_Florence_2.py CHANGED
@@ -6,12 +6,18 @@ import matplotlib.patches as patches
6
  from PIL import Image, ImageDraw, ImageFont
7
  import numpy as np
8
  import warnings
9
- from transformers import AutoProcessor, AutoModelForCausalLM, AutoTokenizer
10
- from vouchervision.utils_LLM import SystemLoadMonitor
 
 
 
 
 
11
 
12
  warnings.filterwarnings("ignore", category=UserWarning, message="TypedStorage is deprecated")
13
 
14
  class FlorenceOCR:
 
15
  def __init__(self, logger, model_id='microsoft/Florence-2-large'):
16
  self.MAX_TOKENS = 1024
17
  self.logger = logger
@@ -25,7 +31,15 @@ class FlorenceOCR:
25
  # self.model_id_clean = "mistralai/Mistral-7B-v0.3"
26
  self.model_id_clean = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
27
  self.tokenizer_clean = AutoTokenizer.from_pretrained(self.model_id_clean)
28
- self.model_clean = AutoModelForCausalLM.from_pretrained(self.model_id_clean)
 
 
29
 
30
 
31
  def ocr_florence(self, image, task_prompt='<OCR>', text_input=None):
@@ -54,34 +68,46 @@ class FlorenceOCR:
54
  num_beams=3,
55
  )
56
  generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
57
- parsed_answer_dirty = self.processor.post_process_generation(
58
  generated_text,
59
  task=task_prompt,
60
  image_size=(image.width, image.height)
61
  )
62
 
63
- inputs = self.tokenizer_clean(f"Insert spaces into this text to make all the words valid. This text contains scientific names of plants, locations, habitat, coordinate words: {parsed_answer_dirty[task_prompt]}", return_tensors="pt")
64
- inputs = {key: value.to(self.model_clean.device) for key, value in inputs.items()}
 
 
65
 
66
- outputs = self.model_clean.generate(**inputs, max_new_tokens=self.MAX_TOKENS)
67
- parsed_answer = self.tokenizer_clean.decode(outputs[0], skip_special_tokens=True)
68
- print(parsed_answer_dirty)
69
- print(parsed_answer)
70
 
71
  self.monitor.stop_inference_timer() # Starts tool timer too
72
  usage_report = self.monitor.stop_monitoring_report_usage()
73
 
74
- return parsed_answer, parsed_answer_dirty[task_prompt], parsed_answer_dirty, usage_report
75
 
76
 
77
  def main():
78
- img_path = '/home/brlab/Downloads/gem_2024_06_26__02-26-02/Cropped_Images/By_Class/label/1.jpg'
79
- # img = 'D:/D_Desktop/BR_1839468565_Ochnaceae_Campylospermum_reticulatum_label.jpg'
80
 
81
  image = Image.open(img_path)
82
 
83
- ocr = FlorenceOCR(logger = None)
84
- results_text, results, usage_report = ocr.ocr_florence(image, task_prompt='<OCR>', text_input=None)
 
85
  print(results_text)
86
 
87
  if __name__ == '__main__':
 
6
  from PIL import Image, ImageDraw, ImageFont
7
  import numpy as np
8
  import warnings
9
+ from transformers import AutoProcessor, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
10
+
11
+ try:
12
+ from vouchervision.utils_LLM import SystemLoadMonitor
13
+ except ImportError:
14
+ from utils_LLM import SystemLoadMonitor
15
+
16
 
17
  warnings.filterwarnings("ignore", category=UserWarning, message="TypedStorage is deprecated")
18
 
19
  class FlorenceOCR:
20
+ # def __init__(self, logger, model_id='microsoft/Florence-2-base'):
21
  def __init__(self, logger, model_id='microsoft/Florence-2-large'):
22
  self.MAX_TOKENS = 1024
23
  self.logger = logger
 
31
  # self.model_id_clean = "mistralai/Mistral-7B-v0.3"
32
  self.model_id_clean = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
33
  self.tokenizer_clean = AutoTokenizer.from_pretrained(self.model_id_clean)
34
+ # Configuring the BitsAndBytesConfig for quantization
35
+ quant_config = BitsAndBytesConfig(
36
+ load_in_4bit=True,
37
+ quant_method="bnb",
38
+ )
39
+ self.model_clean = AutoModelForCausalLM.from_pretrained(
40
+ self.model_id_clean,
41
+ quantization_config=quant_config,
42
+ low_cpu_mem_usage=True,)
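Side note: `quant_method` is normally set internally by transformers rather than passed to BitsAndBytesConfig. A more conventional 4-bit setup (a sketch, not part of this commit; the nf4/bfloat16 choices are assumptions) would specify the quantization type and compute dtype explicitly:

    import torch
    from transformers import BitsAndBytesConfig

    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",               # 4-bit NormalFloat quantization
        bnb_4bit_compute_dtype=torch.bfloat16,   # dtype used for matmuls on the dequantized weights
    )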
43
 
44
 
45
  def ocr_florence(self, image, task_prompt='<OCR>', text_input=None):
 
68
  num_beams=3,
69
  )
70
  generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
71
+ parsed_answer_dict = self.processor.post_process_generation(
72
  generated_text,
73
  task=task_prompt,
74
  image_size=(image.width, image.height)
75
  )
76
 
77
+ parsed_answer_text = parsed_answer_dict[task_prompt]
78
+
79
+ # Prepare input for the second model
80
+ inputs_clean = self.tokenizer_clean(
81
+ f"Insert spaces into this text to make all the words valid. This text contains scientific names of plants, locations, habitat, coordinate words: {parsed_answer_text}",
82
+ return_tensors="pt"
83
+ )
84
+ inputs_clean = {key: value.to(self.model_clean.device) for key, value in inputs_clean.items()}
85
+
86
+ outputs_clean = self.model_clean.generate(**inputs_clean, max_new_tokens=self.MAX_TOKENS)
87
+ text_with_spaces = self.tokenizer_clean.decode(outputs_clean[0], skip_special_tokens=True)
88
+
89
+ # Extract only the LLM response from the decoded text
90
+ response_start = text_with_spaces.find(parsed_answer_text)
91
+ if response_start != -1:
92
+ text_with_spaces = text_with_spaces[response_start + len(parsed_answer_text):].strip()
93
 
94
+ print(text_with_spaces)
 
 
 
95
 
96
  self.monitor.stop_inference_timer() # Starts tool timer too
97
  usage_report = self.monitor.stop_monitoring_report_usage()
98
 
99
+ return text_with_spaces, parsed_answer_text, parsed_answer_dict, usage_report
100
 
101
 
102
  def main():
103
+ # img_path = '/home/brlab/Downloads/gem_2024_06_26__02-26-02/Cropped_Images/By_Class/label/1.jpg'
104
+ img_path = 'D:/D_Desktop/BR_1839468565_Ochnaceae_Campylospermum_reticulatum_label.jpg'
105
 
106
  image = Image.open(img_path)
107
 
108
+ # ocr = FlorenceOCR(logger = None, model_id='microsoft/Florence-2-base')
109
+ ocr = FlorenceOCR(logger = None, model_id='microsoft/Florence-2-large')
110
+ results_text, results_all, results_dirty, usage_report = ocr.ocr_florence(image, task_prompt='<OCR>', text_input=None)
111
  print(results_text)
112
 
113
  if __name__ == '__main__':
vouchervision/OCR_GPT4oMini.py ADDED
@@ -0,0 +1,94 @@
 
 
1
+ import os, base64, requests, yaml
2
+ from PIL import Image
3
+ from openai import OpenAI
4
+
5
+ from general_utils import calculate_cost
6
+
7
+ # PROMPT = """Please perform OCR on this scientific image and extract the printed and handwritten text verbatim. Do not explain your answer, only return the verbatim text in this JSON dictionary format: {'printed_text': '', 'handwritten_text': ''}"""
8
+ PROMPT = """Please perform OCR on this scientific image and extract all of the words and text verbatim. Do not explain your answer, only return the verbatim text:"""
9
+
10
+ class GPT4oMiniOCR:
11
+ def __init__(self, api_key):
12
+ self.api_key = api_key
13
+ self.path_api_cost = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'api_cost', 'api_cost.yaml')
14
+
15
+
16
+ def encode_image(self, image_path):
17
+ with open(image_path, "rb") as image_file:
18
+ return base64.b64encode(image_file.read()).decode('utf-8')
19
+
20
+ def ocr_gpt4o(self, image_path, resolution="low", max_tokens=512):
21
+ # Getting the base64 string
22
+ base64_image = self.encode_image(image_path)
23
+
24
+ headers = {
25
+ "Content-Type": "application/json",
26
+ "Authorization": f"Bearer {self.api_key}"
27
+ }
28
+
29
+ payload = {
30
+ "model": "gpt-4o-mini",
31
+ "messages": [
32
+ {
33
+ "role": "user",
34
+ "content": [
35
+ {
36
+ "type": "text",
37
+ "text": PROMPT,
38
+ },
39
+ {
40
+ "type": "image_url",
41
+ "image_url": {
42
+ "url": f"data:image/jpeg;base64,{base64_image}",
43
+ "detail": resolution,
44
+ }
45
+ }
46
+ ]
47
+ }
48
+ ],
49
+ "max_tokens": max_tokens
50
+ }
51
+
52
+ response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
53
+ response_json = response.json()
54
+
55
+ if "choices" in response_json :
56
+ parsed_answer = response_json["choices"][0]["message"]["content"]
57
+ else:
58
+ parsed_answer = None
59
+
60
+ usage_report = response_json.get('usage', {})
61
+ tokens_in = usage_report["prompt_tokens"]
62
+ tokens_out = usage_report["completion_tokens"]
63
+
64
+ total_cost = calculate_cost('GPT_4o_mini_2024_07_18', self.path_api_cost, tokens_in, tokens_out)
65
+ cost_in, cost_out, total_cost, rates_in, rates_out = total_cost
66
+
67
+ return parsed_answer, cost_in, cost_out, total_cost, rates_in, rates_out, tokens_in, tokens_out
68
+
69
+
70
+
71
+
72
+ def main():
73
+ # img_path = '/home/brlab/Downloads/gem_2024_06_26__02-26-02/Cropped_Images/By_Class/label/1.jpg'
74
+ img_path = 'D:/D_Desktop/BR_1839468565_Ochnaceae_Campylospermum_reticulatum_label.jpg'
75
+
76
+ # $env:OPENAI_API_KEY="KEY"
77
+ API_KEY = "sk-proj-DxHlMH1H6jZzs8V12qbLT3BlbkFJIJnAVzt4kquOfhGURGW0"
78
+
79
+
80
+ ocr = GPT4oMiniOCR(API_KEY)
81
+
82
+ parsed_answer, cost_in, cost_out, total_cost, rates_in, rates_out, tokens_in, tokens_out = ocr.ocr_gpt4o(img_path, resolution="low", max_tokens=512)
83
+ print(f"Parsed Answer: {parsed_answer}")
84
+ print(f"Total Cost: {total_cost}")
85
+
86
+ parsed_answer, cost_in, cost_out, total_cost, rates_in, rates_out, tokens_in, tokens_out = ocr.ocr_gpt4o(img_path, resolution="high", max_tokens=512)
87
+ print(f"Parsed Answer: {parsed_answer}")
88
+ print(f"Total Cost: {total_cost}")
89
+
90
+
91
+
92
+
93
+ if __name__ == '__main__':
94
+ main()
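A minimal usage sketch for the new class, reading the key from the environment instead of hard-coding it (the image path is a placeholder):

    import os
    from OCR_GPT4oMini import GPT4oMiniOCR

    ocr = GPT4oMiniOCR(api_key=os.getenv("OPENAI_API_KEY"))
    text, cost_in, cost_out, total_cost, rate_in, rate_out, tok_in, tok_out = ocr.ocr_gpt4o(
        "path/to/label.jpg", resolution="high", max_tokens=512)
    print(text, total_cost)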
vouchervision/OCR_google_cloud_vision.py CHANGED
@@ -8,6 +8,7 @@ import colorsys
8
  from tqdm import tqdm
9
  from google.oauth2 import service_account
10
  from OCR_Florence_2 import FlorenceOCR
 
11
  ### LLaVA should only be installed if the user will actually use it.
12
  ### It requires the most recent pytorch/Python and can mess with older systems
13
 
@@ -56,6 +57,11 @@ class OCREngine:
56
 
57
  self.OCR_JSON_to_file = {}
58
 
 
 
 
 
 
59
  self.hand_cleaned_text = None
60
  self.hand_organized_text = None
61
  self.hand_bounds = None
@@ -84,6 +90,7 @@ class OCREngine:
84
  self.trOCR_characters = None
85
  self.set_client()
86
  self.init_florence()
 
87
  self.init_craft()
88
 
89
  self.multimodal_prompt = """I need you to transcribe all of the text in this image.
@@ -125,6 +132,10 @@ class OCREngine:
125
  if 'Florence-2' in self.OCR_option:
126
  self.Florence = FlorenceOCR(logger=self.logger, model_id=self.cfg['leafmachine']['project']['florence_model_path'])
127
 
 
 
 
 
128
  def init_llava(self):
129
  if 'LLaVA' in self.OCR_option:
130
  from vouchervision.OCR_llava import OCRllava
@@ -701,7 +712,7 @@ class OCREngine:
701
 
702
  if 'LLaVA' in self.OCR_option: # This option does not produce an OCR helper image
703
  if self.json_report:
704
- self.json_report.set_text(text_main=f'Working on LLaVA {self.Llava.model_path} transcription :construction:')
705
 
706
  image, json_output, direct_output, str_output, usage_report = self.Llava.transcribe_image(self.path, self.multimodal_prompt)
707
  self.logger.info(f"LLaVA Usage Report for Model {self.Llava.model_path}:\n{usage_report}")
@@ -716,7 +727,7 @@ class OCREngine:
716
 
717
  if 'Florence-2' in self.OCR_option: # This option does not produce an OCR helper image
718
  if self.json_report:
719
- self.json_report.set_text(text_main=f'Working on Florence-2 [{self.Florence.model_id}] transcription :construction:')
720
 
721
  self.logger.info(f"Florence-2 Usage Report for Model [{self.Florence.model_id}]")
722
  results_text, results_text_dirty, results, usage_report = self.Florence.ocr_florence(self.path, task_prompt='<OCR>', text_input=None)
@@ -728,6 +739,21 @@ class OCREngine:
728
  else:
729
  self.OCR = self.OCR + f"\nFlorence-2 OCR:\n{results_text}"
730
 
 
 
 
 
731
  if 'normal' in self.OCR_option or 'hand' in self.OCR_option:
732
  if 'normal' in self.OCR_option:
733
  if self.double_OCR:
@@ -824,48 +850,44 @@ class SafetyCheck():
824
  else:
825
  self.client = vision.ImageAnnotatorClient(credentials=self.get_google_credentials())
826
 
 
827
  def get_google_credentials(self):
828
  creds_json_str = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
829
  credentials = service_account.Credentials.from_service_account_info(json.loads(creds_json_str))
830
  return credentials
831
 
832
  def check_for_inappropriate_content(self, file_stream):
833
- try:
834
- LEVEL = 2
835
- # content = file_stream.read()
836
- file_stream.seek(0) # Reset file stream position to the beginning
837
- content = file_stream.read()
838
- image = vision.Image(content=content)
839
- response = self.client.safe_search_detection(image=image)
840
- safe = response.safe_search_annotation
841
-
842
- likelihood_name = (
843
- "UNKNOWN",
844
- "VERY_UNLIKELY",
845
- "UNLIKELY",
846
- "POSSIBLE",
847
- "LIKELY",
848
- "VERY_LIKELY",
849
- )
850
- print("Safe search:")
851
-
852
- print(f" adult*: {likelihood_name[safe.adult]}")
853
- print(f" medical*: {likelihood_name[safe.medical]}")
854
- print(f" spoofed: {likelihood_name[safe.spoof]}")
855
- print(f" violence*: {likelihood_name[safe.violence]}")
856
- print(f" racy: {likelihood_name[safe.racy]}")
857
-
858
- # Check the levels of adult, violence, racy, etc. content.
859
- if (safe.adult > LEVEL or
860
- safe.medical > LEVEL or
861
- # safe.spoof > LEVEL or
862
- safe.violence > LEVEL #or
863
- # safe.racy > LEVEL
864
- ):
865
- print("Found violation")
866
- return True # The image violates safe search guidelines.
867
-
868
- print("Found NO violation")
869
- return False # The image is considered safe.
870
- except:
871
- return False # The image is considered safe. TEMPOROARY FIX TODO
 
8
  from tqdm import tqdm
9
  from google.oauth2 import service_account
10
  from OCR_Florence_2 import FlorenceOCR
11
+ from OCR_GPT4oMini import GPT4oMiniOCR
12
  ### LLaVA should only be installed if the user will actually use it.
13
  ### It requires the most recent pytorch/Python and can mess with older systems
14
 
 
57
 
58
  self.OCR_JSON_to_file = {}
59
 
60
+ # for paid VLM (vision-language model) OCR such as GPT-4o-mini
61
+ self.cost = 0.0
62
+ self.tokens_in = 0
63
+ self.tokens_out = 0
64
+
65
  self.hand_cleaned_text = None
66
  self.hand_organized_text = None
67
  self.hand_bounds = None
 
90
  self.trOCR_characters = None
91
  self.set_client()
92
  self.init_florence()
93
+ self.init_gpt_4o_mini()
94
  self.init_craft()
95
 
96
  self.multimodal_prompt = """I need you to transcribe all of the text in this image.
 
132
  if 'Florence-2' in self.OCR_option:
133
  self.Florence = FlorenceOCR(logger=self.logger, model_id=self.cfg['leafmachine']['project']['florence_model_path'])
134
 
135
+ def init_gpt_4o_mini(self):
136
+ if 'GPT-4o-mini' in self.OCR_option:
137
+ self.GPTmini = GPT4oMiniOCR(api_key = os.getenv('OPENAI_API_KEY'))
138
+
139
  def init_llava(self):
140
  if 'LLaVA' in self.OCR_option:
141
  from vouchervision.OCR_llava import OCRllava
 
712
 
713
  if 'LLaVA' in self.OCR_option: # This option does not produce an OCR helper image
714
  if self.json_report:
715
+ self.json_report.set_text(text_main=f'Working on LLaVA {self.Llava.model_path} OCR :construction:')
716
 
717
  image, json_output, direct_output, str_output, usage_report = self.Llava.transcribe_image(self.path, self.multimodal_prompt)
718
  self.logger.info(f"LLaVA Usage Report for Model {self.Llava.model_path}:\n{usage_report}")
 
727
 
728
  if 'Florence-2' in self.OCR_option: # This option does not produce an OCR helper image
729
  if self.json_report:
730
+ self.json_report.set_text(text_main=f'Working on Florence-2 [{self.Florence.model_id}] OCR :construction:')
731
 
732
  self.logger.info(f"Florence-2 Usage Report for Model [{self.Florence.model_id}]")
733
  results_text, results_text_dirty, results, usage_report = self.Florence.ocr_florence(self.path, task_prompt='<OCR>', text_input=None)
 
739
  else:
740
  self.OCR = self.OCR + f"\nFlorence-2 OCR:\n{results_text}"
741
 
742
+ if 'GPT-4o-mini' in self.OCR_option: # This option does not produce an OCR helper image
743
+ if self.json_report:
744
+ self.json_report.set_text(text_main=f'Working on GPT-4o-mini OCR :construction:')
745
+
746
+ self.logger.info(f"GPT-4o-mini Usage Report")
747
+ results_text, cost_in, cost_out, total_cost, rates_in, rates_out, self.tokens_in, self.tokens_out = self.GPTmini.ocr_gpt4o(self.path, resolution=self.cfg['leafmachine']['project']['OCR_GPT_4o_mini_resolution'], max_tokens=512)
748
+ self.cost += total_cost
749
+
750
+ self.OCR_JSON_to_file['OCR_GPT_4o_mini'] = results_text
751
+
752
+ if self.double_OCR:
753
+ self.OCR = self.OCR + f"\nGPT-4o-mini OCR:\n{results_text}" + f"\nGPT-4o-mini OCR:\n{results_text}"
754
+ else:
755
+ self.OCR = self.OCR + f"\nGPT-4o-mini OCR:\n{results_text}"
756
+
757
  if 'normal' in self.OCR_option or 'hand' in self.OCR_option:
758
  if 'normal' in self.OCR_option:
759
  if self.double_OCR:
 
850
  else:
851
  self.client = vision.ImageAnnotatorClient(credentials=self.get_google_credentials())
852
 
853
+
854
  def get_google_credentials(self):
855
  creds_json_str = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
856
  credentials = service_account.Credentials.from_service_account_info(json.loads(creds_json_str))
857
  return credentials
858
 
859
  def check_for_inappropriate_content(self, file_stream):
860
+ LEVEL = 2
861
+ file_stream.seek(0) # reset the stream position before reading; the upload may already have been read
+ content = file_stream.read()
862
+ image = vision.Image(content=content)
863
+ response = self.client.safe_search_detection(image=image)
864
+ safe = response.safe_search_annotation
865
+
866
+ likelihood_name = (
867
+ "UNKNOWN",
868
+ "VERY_UNLIKELY",
869
+ "UNLIKELY",
870
+ "POSSIBLE",
871
+ "LIKELY",
872
+ "VERY_LIKELY",
873
+ )
874
+ print("Safe search:")
875
+
876
+ print(f" adult*: {likelihood_name[safe.adult]}")
877
+ print(f" medical*: {likelihood_name[safe.medical]}")
878
+ print(f" spoofed: {likelihood_name[safe.spoof]}")
879
+ print(f" violence*: {likelihood_name[safe.violence]}")
880
+ print(f" racy: {likelihood_name[safe.racy]}")
881
+
882
+ # Check the levels of adult, violence, racy, etc. content.
883
+ if (safe.adult > LEVEL or
884
+ safe.medical > LEVEL or
885
+ # safe.spoof > LEVEL or
886
+ safe.violence > LEVEL #or
887
+ # safe.racy > LEVEL
888
+ ):
889
+ print("Found violation")
890
+ return True # The image violates safe search guidelines.
891
+
892
+ print("Found NO violation")
893
+ return False # The image is considered safe.
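For clarity, LEVEL = 2 indexes "UNLIKELY" in the likelihood tuple, so a category triggers a violation only when Vision reports "POSSIBLE" (3) or higher for adult, medical, or violence. A small worked example of the comparison (a sketch, not part of the commit):

    likelihood_name = ("UNKNOWN", "VERY_UNLIKELY", "UNLIKELY", "POSSIBLE", "LIKELY", "VERY_LIKELY")
    LEVEL = 2                 # "UNLIKELY"
    adult = 3                 # suppose Vision reports "POSSIBLE" for the adult category
    print(adult > LEVEL)      # True -> check_for_inappropriate_content() returns True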
 
 
 
 
 
vouchervision/VoucherVision_Config_Builder.py CHANGED
@@ -42,6 +42,7 @@ def build_VV_config(loaded_cfg=None):
42
  OCR_option = 'hand'
43
  OCR_option_llava = 'llava-v1.6-mistral-7b' # "llava-v1.6-mistral-7b", "llava-v1.6-34b", "llava-v1.6-vicuna-13b", "llava-v1.6-vicuna-7b",
44
  OCR_option_llava_bit = 'full' # full or 4bit
 
45
  double_OCR = False
46
 
47
  tool_GEO = True
@@ -73,7 +74,7 @@ def build_VV_config(loaded_cfg=None):
73
  prefix_removal,suffix_removal,catalog_numerical_only,LLM_version_user,batch_size,num_workers,
74
  path_domain_knowledge,embeddings_database_name,use_LeafMachine2_collage_images,
75
  prompt_version, do_create_OCR_helper_image, do_use_trOCR, do_use_florence, trOCR_model_path, florence_model_path, OCR_option, OCR_option_llava,
76
- OCR_option_llava_bit, double_OCR, save_cropped_annotations,
77
  tool_GEO, tool_WFO, tool_wikipedia,
78
  check_for_illegal_filenames, skip_vertical, pdf_conversion_dpi, use_domain_knowledge=False)
79
  else:
@@ -95,6 +96,7 @@ def build_VV_config(loaded_cfg=None):
95
  OCR_option = loaded_cfg['leafmachine']['project']['OCR_option']
96
  OCR_option_llava = loaded_cfg['leafmachine']['project']['OCR_option_llava']
97
  OCR_option_llava_bit = loaded_cfg['leafmachine']['project']['OCR_option_llava_bit']
 
98
  double_OCR = loaded_cfg['leafmachine']['project']['double_OCR']
99
 
100
  tool_GEO = loaded_cfg['leafmachine']['project']['tool_GEO']
@@ -122,7 +124,7 @@ def build_VV_config(loaded_cfg=None):
122
  prefix_removal,suffix_removal,catalog_numerical_only,LLM_version_user,batch_size,num_workers,
123
  path_domain_knowledge,embeddings_database_name,use_LeafMachine2_collage_images,
124
  prompt_version, do_create_OCR_helper_image, do_use_trOCR, do_use_florence, trOCR_model_path, florence_model_path, OCR_option, OCR_option_llava,
125
- OCR_option_llava_bit, double_OCR, save_cropped_annotations,
126
  tool_GEO, tool_WFO, tool_wikipedia,
127
  check_for_illegal_filenames, skip_vertical, pdf_conversion_dpi, use_domain_knowledge=False)
128
 
@@ -131,7 +133,7 @@ def assemble_config(dir_home, run_name, dir_images_local,dir_output,
131
  prefix_removal,suffix_removal,catalog_numerical_only,LLM_version_user,batch_size,num_workers,
132
  path_domain_knowledge,embeddings_database_name,use_LeafMachine2_collage_images,
133
  prompt_version, do_create_OCR_helper_image_user, do_use_trOCR, do_use_florence, trOCR_model_path, florence_model_path, OCR_option, OCR_option_llava,
134
- OCR_option_llava_bit, double_OCR, save_cropped_annotations,
135
  tool_GEO, tool_WFO, tool_wikipedia,
136
  check_for_illegal_filenames, skip_vertical, pdf_conversion_dpi, use_domain_knowledge=False):
137
 
@@ -183,6 +185,7 @@ def assemble_config(dir_home, run_name, dir_images_local,dir_output,
183
  'OCR_option': OCR_option,
184
  'OCR_option_llava': OCR_option_llava,
185
  'OCR_option_llava_bit': OCR_option_llava_bit,
 
186
  'double_OCR': double_OCR,
187
  'pdf_conversion_dpi': pdf_conversion_dpi,
188
  'tool_GEO': tool_GEO,
 
42
  OCR_option = 'hand'
43
  OCR_option_llava = 'llava-v1.6-mistral-7b' # "llava-v1.6-mistral-7b", "llava-v1.6-34b", "llava-v1.6-vicuna-13b", "llava-v1.6-vicuna-7b",
44
  OCR_option_llava_bit = 'full' # full or 4bit
45
+ OCR_GPT_4o_mini_resolution = 'high'
46
  double_OCR = False
47
 
48
  tool_GEO = True
 
74
  prefix_removal,suffix_removal,catalog_numerical_only,LLM_version_user,batch_size,num_workers,
75
  path_domain_knowledge,embeddings_database_name,use_LeafMachine2_collage_images,
76
  prompt_version, do_create_OCR_helper_image, do_use_trOCR, do_use_florence, trOCR_model_path, florence_model_path, OCR_option, OCR_option_llava,
77
+ OCR_option_llava_bit, OCR_GPT_4o_mini_resolution, double_OCR, save_cropped_annotations,
78
  tool_GEO, tool_WFO, tool_wikipedia,
79
  check_for_illegal_filenames, skip_vertical, pdf_conversion_dpi, use_domain_knowledge=False)
80
  else:
 
96
  OCR_option = loaded_cfg['leafmachine']['project']['OCR_option']
97
  OCR_option_llava = loaded_cfg['leafmachine']['project']['OCR_option_llava']
98
  OCR_option_llava_bit = loaded_cfg['leafmachine']['project']['OCR_option_llava_bit']
99
+ OCR_GPT_4o_mini_resolution = loaded_cfg['leafmachine']['project']['OCR_GPT_4o_mini_resolution']
100
  double_OCR = loaded_cfg['leafmachine']['project']['double_OCR']
101
 
102
  tool_GEO = loaded_cfg['leafmachine']['project']['tool_GEO']
 
124
  prefix_removal,suffix_removal,catalog_numerical_only,LLM_version_user,batch_size,num_workers,
125
  path_domain_knowledge,embeddings_database_name,use_LeafMachine2_collage_images,
126
  prompt_version, do_create_OCR_helper_image, do_use_trOCR, do_use_florence, trOCR_model_path, florence_model_path, OCR_option, OCR_option_llava,
127
+ OCR_option_llava_bit, OCR_GPT_4o_mini_resolution, double_OCR, save_cropped_annotations,
128
  tool_GEO, tool_WFO, tool_wikipedia,
129
  check_for_illegal_filenames, skip_vertical, pdf_conversion_dpi, use_domain_knowledge=False)
130
 
 
133
  prefix_removal,suffix_removal,catalog_numerical_only,LLM_version_user,batch_size,num_workers,
134
  path_domain_knowledge,embeddings_database_name,use_LeafMachine2_collage_images,
135
  prompt_version, do_create_OCR_helper_image_user, do_use_trOCR, do_use_florence, trOCR_model_path, florence_model_path, OCR_option, OCR_option_llava,
136
+ OCR_option_llava_bit, OCR_GPT_4o_mini_resolution, double_OCR, save_cropped_annotations,
137
  tool_GEO, tool_WFO, tool_wikipedia,
138
  check_for_illegal_filenames, skip_vertical, pdf_conversion_dpi, use_domain_knowledge=False):
139
 
 
185
  'OCR_option': OCR_option,
186
  'OCR_option_llava': OCR_option_llava,
187
  'OCR_option_llava_bit': OCR_option_llava_bit,
188
+ 'OCR_GPT_4o_mini_resolution': OCR_GPT_4o_mini_resolution,
189
  'double_OCR': double_OCR,
190
  'pdf_conversion_dpi': pdf_conversion_dpi,
191
  'tool_GEO': tool_GEO,
vouchervision/general_utils.py CHANGED
@@ -10,7 +10,11 @@ import concurrent.futures
10
  from time import perf_counter
11
  import torch
12
 
13
- from vouchervision.model_maps import ModelMaps
 
 
 
 
14
 
15
  '''
16
  TIFF --> DNG
@@ -65,12 +69,12 @@ def add_to_expense_report(dir_home, data):
65
 
66
  # If the file does not exist, write the header first
67
  if not file_exists:
68
- writer.writerow(['run','date','api_version','total_cost', 'n_images', 'tokens_in', 'tokens_out', 'rate_in', 'rate_out', 'cost_in', 'cost_out',])
69
 
70
  # Write the data row
71
  writer.writerow(data)
72
 
73
- def save_token_info_as_csv(Dirs, LLM_version0, path_api_cost, total_tokens_in, total_tokens_out, n_images, dir_home, logger):
74
  if path_api_cost:
75
  LLM_version = ModelMaps.get_version_mapping_cost(LLM_version0)
76
 
@@ -78,16 +82,18 @@ def save_token_info_as_csv(Dirs, LLM_version0, path_api_cost, total_tokens_in, t
78
  csv_file_path = os.path.join(Dirs.path_cost, Dirs.run_name + '.csv')
79
 
80
  cost_in, cost_out, total_cost, rate_in, rate_out = calculate_cost(LLM_version, path_api_cost, total_tokens_in, total_tokens_out)
 
 
81
 
82
  # The data to be written to the CSV file
83
- data = [Dirs.run_name, get_datetime(),LLM_version, total_cost, n_images, total_tokens_in, total_tokens_out, rate_in, rate_out, cost_in, cost_out,]
84
 
85
  # Open the file in write mode
86
  with open(csv_file_path, mode='w', newline='') as file:
87
  writer = csv.writer(file)
88
 
89
  # Write the header
90
- writer.writerow(['run','date','api_version','total_cost', 'n_images', 'tokens_in', 'tokens_out', 'rate_in', 'rate_out', 'cost_in', 'cost_out',])
91
 
92
  # Write the data
93
  writer.writerow(data)
@@ -119,6 +125,11 @@ def summarize_expense_report(path_expense_report):
119
  cost_in_sum = 0
120
  cost_out_sum = 0
121
  n_images_sum = 0
 
 
 
 
 
122
  api_version_counts = Counter()
123
 
124
  # Try to read the CSV file into a DataFrame
@@ -128,7 +139,7 @@ def summarize_expense_report(path_expense_report):
128
  # Process each row in the DataFrame
129
  for index, row in df.iterrows():
130
  run_count += 1
131
- total_cost_sum += row['total_cost']
132
  tokens_in_sum += row['tokens_in']
133
  tokens_out_sum += row['tokens_out']
134
  rate_in_sum += row['rate_in']
@@ -136,6 +147,9 @@ def summarize_expense_report(path_expense_report):
136
  cost_in_sum += row['cost_in']
137
  cost_out_sum += row['cost_out']
138
  n_images_sum += row['n_images']
 
 
 
139
  api_version_counts[row['api_version']] += 1
140
 
141
  except FileNotFoundError:
@@ -163,6 +177,9 @@ def summarize_expense_report(path_expense_report):
163
  'rate_out_sum': rate_out_sum,
164
  'cost_in_sum': cost_in_sum,
165
  'cost_out_sum': cost_out_sum,
 
 
 
166
  'n_images_sum':n_images_sum,
167
  'api_version_percentages': api_version_percentages,
168
  'cost_per_image': cost_per_image_dict
 
10
  from time import perf_counter
11
  import torch
12
 
13
+ try:
14
+ from vouchervision.model_maps import ModelMaps
15
+ except ImportError:
16
+ from model_maps import ModelMaps
17
+
18
 
19
  '''
20
  TIFF --> DNG
 
69
 
70
  # If the file does not exist, write the header first
71
  if not file_exists:
72
+ writer.writerow(['run','date','api_version','total_cost', 'n_images', 'tokens_in', 'tokens_out', 'rate_in', 'rate_out', 'cost_in', 'cost_out','ocr_cost','ocr_tokens_in', 'ocr_tokens_out',])
73
 
74
  # Write the data row
75
  writer.writerow(data)
76
 
77
+ def save_token_info_as_csv(Dirs, LLM_version0, path_api_cost, total_tokens_in, total_tokens_out, OCR_cost, OCR_tokens_in, OCR_tokens_out, n_images, dir_home, logger):
78
  if path_api_cost:
79
  LLM_version = ModelMaps.get_version_mapping_cost(LLM_version0)
80
 
 
82
  csv_file_path = os.path.join(Dirs.path_cost, Dirs.run_name + '.csv')
83
 
84
  cost_in, cost_out, total_cost, rate_in, rate_out = calculate_cost(LLM_version, path_api_cost, total_tokens_in, total_tokens_out)
85
+
86
+ total_cost += OCR_cost
87
 
88
  # The data to be written to the CSV file
89
+ data = [Dirs.run_name, get_datetime(),LLM_version, total_cost, n_images, total_tokens_in, total_tokens_out, rate_in, rate_out, cost_in, cost_out,OCR_cost, OCR_tokens_in, OCR_tokens_out,]
90
 
91
  # Open the file in write mode
92
  with open(csv_file_path, mode='w', newline='') as file:
93
  writer = csv.writer(file)
94
 
95
  # Write the header
96
+ writer.writerow(['run','date','api_version','total_cost', 'n_images', 'tokens_in', 'tokens_out', 'rate_in', 'rate_out', 'cost_in', 'cost_out','ocr_cost','ocr_tokens_in', 'ocr_tokens_out'])
97
 
98
  # Write the data
99
  writer.writerow(data)
 
125
  cost_in_sum = 0
126
  cost_out_sum = 0
127
  n_images_sum = 0
128
+ # ,'ocr_cost','ocr_tokens_in', 'ocr_tokens_out'
129
+ ocr_cost_sum = 0
130
+ ocr_tokens_in_sum = 0
131
+ ocr_tokens_out_sum = 0
132
+
133
  api_version_counts = Counter()
134
 
135
  # Try to read the CSV file into a DataFrame
 
139
  # Process each row in the DataFrame
140
  for index, row in df.iterrows():
141
  run_count += 1
142
+ total_cost_sum += row['total_cost'] + row['ocr_cost']
143
  tokens_in_sum += row['tokens_in']
144
  tokens_out_sum += row['tokens_out']
145
  rate_in_sum += row['rate_in']
 
147
  cost_in_sum += row['cost_in']
148
  cost_out_sum += row['cost_out']
149
  n_images_sum += row['n_images']
150
+ ocr_cost_sum += row['ocr_cost']
151
+ ocr_tokens_in_sum += row['ocr_tokens_in']
152
+ ocr_tokens_out_sum += row['ocr_tokens_out']
153
  api_version_counts[row['api_version']] += 1
154
 
155
  except FileNotFoundError:
 
177
  'rate_out_sum': rate_out_sum,
178
  'cost_in_sum': cost_in_sum,
179
  'cost_out_sum': cost_out_sum,
180
+ 'ocr_cost_sum': ocr_cost_sum,
181
+ 'ocr_tokens_in_sum': ocr_tokens_in_sum,
182
+ 'ocr_tokens_out_sum': ocr_tokens_out_sum,
183
  'n_images_sum':n_images_sum,
184
  'api_version_percentages': api_version_percentages,
185
  'cost_per_image': cost_per_image_dict
vouchervision/model_maps.py CHANGED
@@ -40,23 +40,27 @@ class ModelMaps:
40
  'phyloforfun/mistral-7b-instruct-v2-bnb-4bit__HLT_MICH_Angiospermae_SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05': '#bababa', # Gray
41
  }
42
 
43
- MODELS_OPENAI = ["GPT 4",
44
- "GPT 4 32k",
45
- "GPT 4o 2024-05-13", #GPT_4o_2024_05_13
46
- "GPT 4o mini 2024-07-18",
47
- "GPT 4 Turbo 2024-04-09",#GPT_4_TURBO_2024_04_09
48
- "GPT 4 Turbo 0125-preview",
49
- "GPT 4 Turbo 1106-preview",
50
- "GPT 3.5 Turbo",
51
- "GPT 3.5 Instruct",
52
-
53
- "Azure GPT 4",
 
 
 
 
54
  # "Azure GPT 4 32k",
55
  # "Azure GPT 4 Turbo 0125-preview",
56
  # "Azure GPT 4 Turbo 1106-preview",
57
  # "Azure GPT 3.5 Turbo",
58
  # "Azure GPT 3.5 Instruct",
59
- ]
60
 
61
  MODELS_GOOGLE = [
62
  # "PaLM 2 text-bison@001",
@@ -79,7 +83,14 @@ class ModelMaps:
79
  "LOCAL CPU Mistral 7B Instruct v0.2 GGUF",
80
  'phyloforfun/mistral-7b-instruct-v2-bnb-4bit__HLT_MICH_Angiospermae_SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05']
81
 
82
- MODELS_GUI_DEFAULT = "Azure GPT 4" # "GPT 4 Turbo 1106-preview"
 
 
83
 
84
  version_mapping_cost = {
85
  'GPT 4 32k': 'GPT_4_32K',
@@ -316,7 +327,16 @@ class ModelMaps:
316
 
317
  @classmethod
318
  def get_models_gui_list(cls):
319
- return cls.MODELS_LOCAL + cls.MODELS_GOOGLE + cls.MODELS_OPENAI + cls.MODELS_MISTRAL
 
 
320
 
321
  @classmethod
322
  def get_version_mapping_cost(cls, key):
 
40
  'phyloforfun/mistral-7b-instruct-v2-bnb-4bit__HLT_MICH_Angiospermae_SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05': '#bababa', # Gray
41
  }
42
 
43
+ MODELS_OPENAI = [
44
+ "GPT 4o 2024-05-13", #GPT_4o_2024_05_13
45
+ "GPT 4o mini 2024-07-18",
46
+ "GPT 4 Turbo 2024-04-09",#GPT_4_TURBO_2024_04_09
47
+ "GPT 4",
48
+ "GPT 4 32k",
49
+ "GPT 4 Turbo 0125-preview",
50
+ "GPT 4 Turbo 1106-preview",
51
+ "GPT 3.5 Turbo",
52
+ "GPT 3.5 Instruct",
53
+ ]
54
+
55
+
56
+ MODELS_OPENAI_AZURE = [
57
+ "Azure GPT 4",
58
  # "Azure GPT 4 32k",
59
  # "Azure GPT 4 Turbo 0125-preview",
60
  # "Azure GPT 4 Turbo 1106-preview",
61
  # "Azure GPT 3.5 Turbo",
62
  # "Azure GPT 3.5 Instruct",
63
+ ]
64
 
65
  MODELS_GOOGLE = [
66
  # "PaLM 2 text-bison@001",
 
83
  "LOCAL CPU Mistral 7B Instruct v0.2 GGUF",
84
  'phyloforfun/mistral-7b-instruct-v2-bnb-4bit__HLT_MICH_Angiospermae_SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05']
85
 
86
+ MODELS_GUI_DEFAULT = "Gemini 1.5 Flash" #"Azure GPT 4" # "GPT 4 Turbo 1106-preview"
87
+
88
+ MODEL_FAMILY = {
89
+ 'OpenAI': MODELS_OPENAI,
90
+ 'OpenAI Azure': MODELS_OPENAI_AZURE,
91
+ 'Google': MODELS_GOOGLE,
92
+ 'Mistral': MODELS_MISTRAL,
93
+ 'Local': MODELS_LOCAL}
94
 
95
  version_mapping_cost = {
96
  'GPT 4 32k': 'GPT_4_32K',
 
327
 
328
  @classmethod
329
  def get_models_gui_list(cls):
330
+ return cls.MODELS_LOCAL + cls.MODELS_GOOGLE + cls.MODELS_OPENAI + cls.MODELS_OPENAI_AZURE + cls.MODELS_MISTRAL
331
+
332
+ @classmethod
333
+ def get_models_gui_list_family(cls, family=None):
334
+ if family and family in cls.MODEL_FAMILY:
335
+ return cls.MODEL_FAMILY[family]
336
+ all_models = []
337
+ for family_models in cls.MODEL_FAMILY.values():
338
+ all_models.extend(family_models)
339
+ return all_models
340
 
341
  @classmethod
342
  def get_version_mapping_cost(cls, key):
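
The MODEL_FAMILY mapping plus get_models_gui_list_family() supports a two-step picker: choose a provider family first, then a model within it, with a fall-through to the full flattened list when no family is given. A standalone sketch of that lookup behavior follows; the ModelMapsDemo stub and its shortened model lists are illustrative, not part of this commit:

    class ModelMapsDemo:
        """Tiny stand-in for ModelMaps.MODEL_FAMILY / get_models_gui_list_family()."""
        MODEL_FAMILY = {
            'OpenAI': ["GPT 4o 2024-05-13", "GPT 4o mini 2024-07-18"],
            'OpenAI Azure': ["Azure GPT 4"],
            'Google': ["Gemini 1.5 Flash"],
        }

        @classmethod
        def get_models_gui_list_family(cls, family=None):
            # A recognized family returns only its models; anything else falls back to every model.
            if family and family in cls.MODEL_FAMILY:
                return cls.MODEL_FAMILY[family]
            all_models = []
            for family_models in cls.MODEL_FAMILY.values():
                all_models.extend(family_models)
            return all_models

    print(ModelMapsDemo.get_models_gui_list_family('Google'))   # ['Gemini 1.5 Flash']
    print(ModelMapsDemo.get_models_gui_list_family())           # every model across all families
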
vouchervision/utils_LLM.py CHANGED
@@ -8,11 +8,16 @@ import psutil
 import threading
 import torch
 from datetime import datetime
-from vouchervision.tool_taxonomy_WFO import validate_taxonomy_WFO, WFONameMatcher
-from vouchervision.tool_geolocate_HERE import validate_coordinates_here
-from vouchervision.tool_wikipedia import validate_wikipedia
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
+try:
+    from vouchervision.tool_taxonomy_WFO import validate_taxonomy_WFO, WFONameMatcher
+    from vouchervision.tool_geolocate_HERE import validate_coordinates_here
+    from vouchervision.tool_wikipedia import validate_wikipedia
+except:
+    from tool_taxonomy_WFO import validate_taxonomy_WFO, WFONameMatcher
+    from tool_geolocate_HERE import validate_coordinates_here
+    from tool_wikipedia import validate_wikipedia
 
 def run_tools(output, tool_WFO, tool_GEO, tool_wikipedia, json_file_path_wiki):
     # Define a function that will catch and return the results of your functions

@@ -179,15 +184,26 @@ class SystemLoadMonitor():
 
         }
 
-        self.logger.info(f"Inference Time: {round(self.inference_time,2)} seconds")
-        self.logger.info(f"Tool Time: {round(tool_time,2)} seconds")
-        self.logger.info(f"Max CPU Usage: {round(self.gpu_usage['max_cpu_usage'],2)}%")
-        self.logger.info(f"Max RAM Usage: {round(self.gpu_usage['max_ram_usage'],2)}GB")
+        if self.logger:
+            self.logger.info(f"Inference Time: {round(self.inference_time,2)} seconds")
+            self.logger.info(f"Tool Time: {round(tool_time,2)} seconds")
+            self.logger.info(f"Max CPU Usage: {round(self.gpu_usage['max_cpu_usage'],2)}%")
+            self.logger.info(f"Max RAM Usage: {round(self.gpu_usage['max_ram_usage'],2)}GB")
+        else:
+            print(f"Inference Time: {round(self.inference_time,2)} seconds")
+            print(f"Tool Time: {round(tool_time,2)} seconds")
+            print(f"Max CPU Usage: {round(self.gpu_usage['max_cpu_usage'],2)}%")
+            print(f"Max RAM Usage: {round(self.gpu_usage['max_ram_usage'],2)}GB")
+
         if self.has_GPU:
             report.update({'max_gpu_load': str(round(self.gpu_usage['max_load'] * 100, 2))})
             report.update({'max_gpu_vram_gb': str(round(self.gpu_usage['max_vram_usage'], 2))})
-            self.logger.info(f"Max GPU Load: {round(self.gpu_usage['max_load'] * 100, 2)}%")
-            self.logger.info(f"Max GPU Memory Usage: {round(self.gpu_usage['max_vram_usage'], 2)}GB")
+            if self.logger:
+                self.logger.info(f"Max GPU Load: {round(self.gpu_usage['max_load'] * 100, 2)}%")
+                self.logger.info(f"Max GPU Memory Usage: {round(self.gpu_usage['max_vram_usage'], 2)}GB")
+            else:
+                print(f"Max GPU Load: {round(self.gpu_usage['max_load'] * 100, 2)}%")
+                print(f"Max GPU Memory Usage: {round(self.gpu_usage['max_vram_usage'], 2)}GB")
         else:
             report.update({'max_gpu_load': '0'})
             report.update({'max_gpu_vram_gb': '0'})
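
Two small robustness changes are visible here: the tool imports now try the packaged vouchervision. path first and fall back to flat-module imports when the file is run directly from its folder, and the resource report only calls self.logger.info when a logger was actually supplied, printing otherwise. A runnable sketch of that second pattern; the emit helper and the sample metrics are illustrative, not code from the repository:

    import logging

    def emit(message, logger=None):
        # Prefer the logger when one is available, otherwise fall back to stdout.
        if logger:
            logger.info(message)
        else:
            print(message)

    logging.basicConfig(level=logging.INFO)
    log = logging.getLogger("system_load_demo")

    inference_time = 12.3456
    max_cpu_usage = 87.123
    emit(f"Inference Time: {round(inference_time, 2)} seconds", logger=log)  # routed through logging
    emit(f"Max CPU Usage: {round(max_cpu_usage, 2)}%")                       # no logger supplied: printed
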
vouchervision/utils_VoucherVision.py CHANGED
@@ -43,6 +43,10 @@ class VoucherVision():
         self.prompt_version = None
         self.is_hf = is_hf
 
+        self.OCR_cost = 0.0
+        self.OCR_tokens_in = 0
+        self.OCR_tokens_out = 0
+
         ### config_vals_for_permutation allows you to set the starting temp, top_k, top_p, seed....
         self.config_vals_for_permutation = config_vals_for_permutation
 

@@ -649,11 +653,19 @@ class VoucherVision():
     def perform_OCR_and_save_results(self, image_index, json_report, jpg_file_path_OCR_helper, txt_file_path_OCR, txt_file_path_OCR_bounds):
         self.logger.info(f'Working on {image_index + 1}/{len(self.img_paths)} --- Starting OCR')
         # self.OCR - None
+        self.OCR_cost = 0.0
+        self.OCR_tokens_in = 0
+        self.OCR_tokens_out = 0
 
         ### Process_image() runs the OCR for text, handwriting, trOCR AND creates the overlay image
         ocr_google = OCREngine(self.logger, json_report, self.dir_home, self.is_hf, self.path_to_crop, self.cfg, self.trOCR_model_version, self.trOCR_model, self.trOCR_processor, self.device)
         ocr_google.process_image(self.do_create_OCR_helper_image, self.logger)
         self.OCR = ocr_google.OCR
+
+        self.OCR_cost = ocr_google.cost
+        self.OCR_tokens_in = ocr_google.tokens_in
+        self.OCR_tokens_out = ocr_google.tokens_out
+
         self.logger.info(f"Complete OCR text for LLM prompt:\n\n{self.OCR}\n\n")
 
         self.write_json_to_file(txt_file_path_OCR, ocr_google.OCR_JSON_to_file)

@@ -774,7 +786,8 @@ class VoucherVision():
 
         self.update_progress_report_final(progress_report)
         final_JSON_response = self.parse_final_json_response(final_JSON_response)
-        return final_JSON_response, final_WFO_record, final_GEO_record, self.total_tokens_in, self.total_tokens_out
+
+        return final_JSON_response, final_WFO_record, final_GEO_record, self.total_tokens_in, self.total_tokens_out, self.OCR_cost, self.OCR_tokens_in, self.OCR_tokens_out
 
 
         ##################################################################################################################################

@@ -905,9 +918,9 @@ class VoucherVision():
                 if is_real_run:
                     progress_report.update_overall(f"Transcribing Labels")
 
-                final_json_response, final_WFO_record, final_GEO_record, total_tokens_in, total_tokens_out = self.send_to_LLM(self.is_azure, progress_report, json_report, self.model_name)
+                final_json_response, final_WFO_record, final_GEO_record, total_tokens_in, total_tokens_out, OCR_cost, OCR_tokens_in, OCR_tokens_out = self.send_to_LLM(self.is_azure, progress_report, json_report, self.model_name)
 
-                return final_json_response, final_WFO_record, final_GEO_record, total_tokens_in, total_tokens_out
+                return final_json_response, final_WFO_record, final_GEO_record, total_tokens_in, total_tokens_out, OCR_cost, OCR_tokens_in, OCR_tokens_out
 
             except Exception as e:
                 self.logger.error(f"LLM call failed in process_specimen_batch: {e}")
vouchervision/vouchervision_main.py CHANGED
@@ -65,9 +65,9 @@ def voucher_vision(cfg_file_path, dir_home, path_custom_prompts, cfg_test, progr
     # Process labels
     Voucher_Vision = VoucherVision(cfg, logger, dir_home, path_custom_prompts, Project, Dirs, is_hf)
     n_images = len(Voucher_Vision.img_paths)
-    last_JSON_response, final_WFO_record, final_GEO_record, total_tokens_in, total_tokens_out = Voucher_Vision.process_specimen_batch(progress_report, json_report, is_real_run)
+    last_JSON_response, final_WFO_record, final_GEO_record, total_tokens_in, total_tokens_out, OCR_cost, OCR_tokens_in, OCR_tokens_out = Voucher_Vision.process_specimen_batch(progress_report, json_report, is_real_run)
 
-    total_cost = save_token_info_as_csv(Dirs, cfg['leafmachine']['LLM_version'], path_api_cost, total_tokens_in, total_tokens_out, n_images, dir_home, logger)
+    total_cost = save_token_info_as_csv(Dirs, cfg['leafmachine']['LLM_version'], path_api_cost, total_tokens_in, total_tokens_out, OCR_cost, OCR_tokens_in, OCR_tokens_out, n_images, dir_home, logger)
 
     t_overall_s = perf_counter()
    logger.name = 'Run Complete! :)'
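
Because process_specimen_batch now returns eight values instead of five, every caller that unpacks the tuple has to change in lockstep, and save_token_info_as_csv gains three extra arguments in the middle of its signature. A minimal sketch of the widened hand-off; both stubs are pared down, invented for illustration, and only mirror the order of the token and cost arguments shown above:

    def process_specimen_batch_stub():
        # Widened return: JSON, WFO, GEO, LLM tokens in/out, then the new OCR cost and token counts.
        return {"demo": True}, None, None, 1500, 300, 0.0015, 1000, 50

    def save_token_info_as_csv_stub(LLM_version, total_tokens_in, total_tokens_out,
                                    OCR_cost, OCR_tokens_in, OCR_tokens_out, n_images):
        llm_cost = 0.0  # real per-token rate lookup omitted in this sketch
        return llm_cost + OCR_cost

    (last_JSON_response, final_WFO_record, final_GEO_record,
     total_tokens_in, total_tokens_out,
     OCR_cost, OCR_tokens_in, OCR_tokens_out) = process_specimen_batch_stub()

    total_cost = save_token_info_as_csv_stub("GPT 4o mini 2024-07-18",
                                             total_tokens_in, total_tokens_out,
                                             OCR_cost, OCR_tokens_in, OCR_tokens_out, n_images=1)
    print(total_cost)   # 0.0015
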