import openai
import os, json, sys, inspect, time, requests
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI, AzureChatOpenAI
from langchain.schema import HumanMessage
from general_utils import num_tokens_from_string

# Make the parent directory importable so that sibling modules
# (prompts, prompt_catalog) resolve when this file is run directly.
currentdir = os.path.dirname(os.path.abspath(
    inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.append(parentdir)

from prompts import PROMPT_UMICH_skeleton_all_asia, PROMPT_OCR_Organized, PROMPT_UMICH_skeleton_all_asia_GPT4, PROMPT_OCR_Organized_GPT4, PROMPT_JSON
from prompt_catalog import PromptCatalog

RETRY_DELAY = 61  # Seconds to wait before retrying a failed API call
MAX_RETRIES = 5   # Maximum number of retries


def azure_call(model, messages):
    """Invoke an Azure-hosted LangChain chat model with the given message list.

    Args:
        model: A LangChain chat-model instance (e.g. AzureChatOpenAI).
        messages: List of LangChain message objects to send.

    Returns:
        The model's response message object.
    """
    response = model(messages=messages)
    return response


def OCR_to_dict(is_azure, logger, MODEL, prompt, llm, prompt_version):
    """Transcribe OCR'd label text into a structured dictionary via an LLM.

    Retries up to MAX_RETRIES times on any exception, sleeping RETRY_DELAY
    seconds between attempts; the exception from the final attempt is
    re-raised.

    Args:
        is_azure: True to route calls through the Azure LangChain model.
        logger: Logger for progress and error messages.
        MODEL: OpenAI model name (used when not on Azure).
        prompt: Fully rendered prompt text containing the OCR data.
        llm: LangChain chat-model instance (used for Azure calls).
        prompt_version: Prompt-catalog version tag, forwarded to the parser.

    Returns:
        The parsed 'Dictionary' payload, the raw response text (direct-GPT
        path), or None when structured parsing fails.
    """
    for i in range(MAX_RETRIES):
        try:
            # Hard-coded toggle: the structured-output-parser path is always
            # taken; the direct-GPT branch below is kept for easy switching.
            do_use_SOP = True

            if do_use_SOP:
                logger.info(f'Waiting for {MODEL} API call --- Using StructuredOutputParser')
                response = structured_output_parser(is_azure, MODEL, llm, prompt, logger, prompt_version)
                if response is None:
                    return None
                else:
                    return response['Dictionary']
            else:
                ### Direct GPT ###
                logger.info(f'Waiting for {MODEL} API call')
                if not is_azure:
                    response = openai.ChatCompletion.create(
                        model=MODEL,
                        temperature=0,
                        messages=[
                            {"role": "system", "content": "You are a helpful assistant acting as a transcription expert and your job is to transcribe herbarium specimen labels based on OCR data and reformat it to meet Darwin Core Archive Standards into a Python dictionary based on certain rules."},
                            {"role": "user", "content": prompt},
                        ],
                        max_tokens=4096,
                    )
                    # Return the model's raw text response.
                    return response.choices[0].message['content']
                else:
                    msg = HumanMessage(content=prompt)
                    response = azure_call(llm, [msg])
                    return response.content

        except Exception as e:
            logger.error(f'{e}')
            if i < MAX_RETRIES - 1:  # No delay needed after the last try
                time.sleep(RETRY_DELAY)
            else:
                raise


def OCR_to_dict_16k(is_azure, logger, MODEL, prompt, llm, prompt_version):
    """Transcribe OCR'd label text using a 16k-context model with a function schema.

    Calls the OpenAI chat API with the FunctionSchema output contract and
    attempts to parse the reply as JSON; on any parse failure it falls back
    to the structured-output JSON fixer. Retries up to MAX_RETRIES times on
    exceptions, sleeping RETRY_DELAY seconds between attempts.

    Args:
        is_azure: Forwarded to the JSON-fixer fallback.
        logger: Logger for progress and error messages.
        MODEL: OpenAI model name.
        prompt: Fully rendered prompt text containing the OCR data.
        llm: LangChain chat-model instance, forwarded to the fallback.
        prompt_version: Prompt-catalog version tag, forwarded to the fallback.

    Returns:
        The parsed 'Dictionary' payload, or None when all parsing fails.
    """
    for i in range(MAX_RETRIES):
        try:
            fs = FunctionSchema()
            response = openai.ChatCompletion.create(
                model=MODEL,
                temperature=0,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant acting as a transcription expert and your job is to transcribe herbarium specimen labels based on OCR data and reformat it to meet Darwin Core Archive Standards into a Python dictionary based on certain rules."},
                    {"role": "user", "content": prompt},
                ],
                max_tokens=8000,
                function_call="none",
                functions=fs.format_C21_AA_V1()
            )

            # Pull the raw text out of the response; if the response has no
            # usable content, fall back to re-parsing the prompt itself.
            call_failed = False
            try:
                response_string = response.choices[0].message['content']
            except Exception:
                call_failed = True
                response_string = prompt

            if not call_failed:
                try:
                    # Try to parse the response into JSON
                    response_dict = json.loads(response_string)
                    return response_dict['Dictionary']
                except json.JSONDecodeError:
                    # Not valid JSON: hand it to the structured-output JSON fixer.
                    logger.info(f'Invalid JSON response, calling structured_output_parser_for_function_calls_fail function')
                    logger.info(f'Waiting for {MODEL} API call --- Using StructuredOutputParser --- JSON Fixer')
                    response_sop = structured_output_parser_for_function_calls_fail(is_azure, MODEL, response_string, logger, llm, prompt_version, is_helper=False)
                    if response_sop is None:
                        return None
                    else:
                        return response_sop['Dictionary']
            else:
                try:
                    logger.info(f'Call Failed. Attempting fallback JSON parse without guidance')
                    logger.info(f'Waiting for {MODEL} API call --- Using StructuredOutputParser --- JSON Fixer')
                    response_sop = structured_output_parser_for_function_calls_fail(is_azure, MODEL, response_string, logger, llm, prompt_version, is_helper=False)
                    if response_sop is None:
                        return None
                    else:
                        return response_sop['Dictionary']
                except Exception:
                    return None

        except Exception as e:
            # Log the actual error (previously this unconditionally claimed
            # a 401 regardless of what actually went wrong).
            logger.error(f'{e}')
            if i < MAX_RETRIES - 1:  # No delay needed after the last try
                time.sleep(RETRY_DELAY)
            else:
                # If it was a different error, re-raise it
                raise


def structured_output_parser(is_azure, MODEL, llm, prompt_template, logger, prompt_version, is_helper=False):
    """Run the prompt through an LLM and parse the reply with LangChain's StructuredOutputParser.

    Args:
        is_azure: True to use the Azure LangChain model, False for ChatOpenAI.
        MODEL: OpenAI model name (non-Azure path only).
        llm: LangChain chat-model instance (Azure path only).
        prompt_template: The rendered prompt/question text to send.
        logger: Logger for progress and error messages.
        prompt_version: Prompt-catalog version tag, forwarded to the JSON fixer.
        is_helper: Selects the helper response schema (Dictionary + Summary)
            instead of the default (SpeciesName + Dictionary).

    Returns:
        The parsed response dict, or None on failure.
    """
    if not is_helper:
        response_schemas = [
            ResponseSchema(name="SpeciesName", description="Taxonomic determination, genus_species"),
            ResponseSchema(name="Dictionary", description='Formatted JSON object'),
        ]
    elif is_helper:
        response_schemas = [
            ResponseSchema(name="Dictionary", description='Formatted JSON object'),
            ResponseSchema(name="Summary", description="A one sentence summary of the content"),
        ]

    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
    format_instructions = output_parser.get_format_instructions()

    prompt = ChatPromptTemplate(
        messages=[
            HumanMessagePromptTemplate.from_template("Parse the OCR text into the correct structured format.\n{format_instructions}\n{question}")
        ],
        input_variables=["question"],
        partial_variables={"format_instructions": format_instructions}
    )

    # Handle Azure vs OpenAI implementation
    if is_azure:
        _input = prompt.format_prompt(question=prompt_template)
        msg = HumanMessage(content=_input.to_string())
        output = azure_call(llm, [msg])
    else:
        chat_model = ChatOpenAI(temperature=0, model=MODEL)
        _input = prompt.format_prompt(question=prompt_template)
        output = chat_model(_input.to_messages())

    # Log token length if running with Gradio (best-effort; tokenizer may be absent)
    try:
        nt = num_tokens_from_string(_input.to_string(), "cl100k_base")
        logger.info(f'Prompt token length --- {nt}')
    except Exception:
        pass

    # Parse the output
    try:
        # Check if output is of type 'ai' and parse accordingly
        if output.type == 'ai':
            parsed_content = output.content
            logger.info(f'Formatted JSON\n{parsed_content}')
        else:
            # If not 'ai', log and set parsed_content to None or a default value
            logger.error('Output type is not "ai". Unable to parse.')
            return None

        # Clean up the parsed content: strip characters that commonly break
        # the downstream JSON parse.
        parsed_content = parsed_content.replace('\n', "").replace('\t', "").replace('|', "")

        # Attempt to parse the cleaned content
        try:
            refined_response = output_parser.parse(parsed_content)
            return refined_response
        except Exception as parse_error:
            # Handle parsing errors specifically: delegate to the JSON fixer.
            logger.error(f'Parsing Error: {parse_error}')
            return structured_output_parser_for_function_calls_fail(is_azure, MODEL, parsed_content, logger, llm, prompt_version, is_helper)

    except Exception as e:
        # Handle any other exceptions that might occur
        logger.error(f'Unexpected Error: {e}')
        return None


def structured_output_parser_for_function_calls_fail(is_azure, MODEL, failed_response, logger, llm, prompt_version, is_helper=False, try_ind=0):
    """Ask the LLM to repair a broken JSON response, recursing on repeated failure.

    Args:
        is_azure: True to use the Azure LangChain model, False for ChatOpenAI.
        MODEL: OpenAI model name (non-Azure path only).
        failed_response: The malformed JSON text to be repaired.
        logger: Logger for progress and error messages.
        llm: LangChain chat-model instance (Azure path only).
        prompt_version: Selects which redo prompt from the PromptCatalog to use.
        is_helper: Forwarded through recursive calls (schema selection upstream).
        try_ind: Recursion depth; gives up (returns None) after 5 attempts.

    Returns:
        The repaired response dict, or None after exhausting retries.
    """
    if try_ind > 5:
        return None

    Prompt = PromptCatalog()
    if prompt_version in ['prompt_v1_verbose', 'prompt_v1_verbose_noDomainKnowledge']:
        prompt_redo = Prompt.prompt_gpt_redo_v1(failed_response)
    elif prompt_version in ['prompt_v2_json_rules']:
        prompt_redo = Prompt.prompt_gpt_redo_v2(failed_response)
    else:
        prompt_redo = Prompt.prompt_v2_custom_redo(failed_response, is_palm=False)

    response_schemas = [
        ResponseSchema(name="Summary", description="A one sentence summary of the content"),
        ResponseSchema(name="Dictionary", description='Formatted JSON object')
    ]

    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
    format_instructions = output_parser.get_format_instructions()

    prompt = ChatPromptTemplate(
        messages=[
            HumanMessagePromptTemplate.from_template("The following text contains JSON formatted text, but there is an error that you need to correct.\n{format_instructions}\n{question}")
        ],
        input_variables=["question"],
        partial_variables={"format_instructions": format_instructions}
    )

    _input = prompt.format_prompt(question=prompt_redo)

    # Log token length if running with Gradio (best-effort; tokenizer may be absent)
    try:
        nt = num_tokens_from_string(_input.to_string(), "cl100k_base")
        logger.info(f'Prompt Redo token length --- {nt}')
    except Exception:
        pass

    if is_azure:
        msg = HumanMessage(content=_input.to_string())
        output = azure_call(llm, [msg])
    else:
        chat_model = ChatOpenAI(temperature=0, model=MODEL)
        output = chat_model(_input.to_messages())

    try:
        refined_response = output_parser.parse(output.content)
    except json.decoder.JSONDecodeError as e:
        # Feed the decoder error back to the model so it can target the fix.
        try_ind += 1
        error_message = str(e)
        redo_content = f'The error message is: {error_message}\nThe broken JSON object is: {output.content}'
        logger.info(f'[Failed JSON Object]\n{output.content}')
        refined_response = structured_output_parser_for_function_calls_fail(is_azure, MODEL, redo_content, logger, llm, prompt_version, is_helper, try_ind)
    except Exception:
        # Any other parser failure: retry with the raw output as-is.
        try_ind += 1
        logger.info(f'[Failed JSON Object]\n{output.content}')
        refined_response = structured_output_parser_for_function_calls_fail(is_azure, MODEL, output.content, logger, llm, prompt_version, is_helper, try_ind)

    return refined_response


class FunctionSchema:
    """Static OpenAI function-calling schemas for herbarium-label transcription output."""

    def __init__(self):
        pass

    def format_C21_AA_V1(self):
        """Return the function schema describing the primary Dictionary/SpeciesName output."""
        return [
            {
                "name": "format_C21_AA_V1",
                "description": "Format the given data into a specific dictionary",
                "parameters": {
                    "type": "object",
                    "properties": {},  # specify parameters here if your function requires any
                    "required": []  # list of required parameters
                },
                "output_type": "json",
                "output_schema": {
                    "type": "object",
                    "properties": {
                        "Dictionary": {
                            "type": "object",
                            "properties": {
                                "Catalog Number": {"type": "array", "items": {"type": "string"}},
                                "Genus": {"type": "array", "items": {"type": "string"}},
                                "Species": {"type": "array", "items": {"type": "string"}},
                                "subspecies": {"type": "array", "items": {"type": "string"}},
                                "variety": {"type": "array", "items": {"type": "string"}},
                                "forma": {"type": "array", "items": {"type": "string"}},
                                "Country": {"type": "array", "items": {"type": "string"}},
                                "State": {"type": "array", "items": {"type": "string"}},
                                "County": {"type": "array", "items": {"type": "string"}},
                                "Locality Name": {"type": "array", "items": {"type": "string"}},
                                "Min Elevation": {"type": "array", "items": {"type": "string"}},
                                "Max Elevation": {"type": "array", "items": {"type": "string"}},
                                "Elevation Units": {"type": "array", "items": {"type": "string"}},
                                "Verbatim Coordinates": {"type": "array", "items": {"type": "string"}},
                                "Datum": {"type": "array", "items": {"type": "string"}},
                                "Cultivated": {"type": "array", "items": {"type": "string"}},
                                "Habitat": {"type": "array", "items": {"type": "string"}},
                                "Collectors": {"type": "array", "items": {"type": "string"}},
                                "Collector Number": {"type": "array", "items": {"type": "string"}},
                                "Verbatim Date": {"type": "array", "items": {"type": "string"}},
                                "Date": {"type": "array", "items": {"type": "string"}},
                                "End Date": {"type": "array", "items": {"type": "string"}}
                            }
                        },
                        "SpeciesName": {
                            "type": "object",
                            "properties": {
                                "taxonomy": {"type": "array", "items": {"type": "string"}}
                            }
                        }
                    }
                }
            }
        ]

    def format_C21_AA_V1_helper(self):
        """Return the helper function schema with categorized (TAXONOMY/GEOGRAPHY/...) output."""
        return [
            {
                "name": "format_C21_AA_V1_helper",
                "description": "Helper function for format_C21_AA_V1 to further format the given data",
                "parameters": {
                    "type": "object",
                    "properties": {},  # specify parameters here if your function requires any
                    "required": []  # list of required parameters
                },
                "output_type": "json",
                "output_schema": {
                    "type": "object",
                    "properties": {
                        "Dictionary": {
                            "type": "object",
                            "properties": {
                                "TAXONOMY": {
                                    "type": "object",
                                    "properties": {
                                        "Order": {"type": "array", "items": {"type": "string"}},
                                        "Family": {"type": "array", "items": {"type": "string"}},
                                        "Genus": {"type": "array", "items": {"type": "string"}},
                                        "Species": {"type": "array", "items": {"type": "string"}},
                                        "Subspecies": {"type": "array", "items": {"type": "string"}},
                                        "Variety": {"type": "array", "items": {"type": "string"}},
                                        "Forma": {"type": "array", "items": {"type": "string"}},
                                    }
                                },
                                "GEOGRAPHY": {
                                    "type": "object",
                                    "properties": {
                                        "Country": {"type": "array", "items": {"type": "string"}},
                                        "State": {"type": "array", "items": {"type": "string"}},
                                        "Prefecture": {"type": "array", "items": {"type": "string"}},
                                        "Province": {"type": "array", "items": {"type": "string"}},
                                        "District": {"type": "array", "items": {"type": "string"}},
                                        "County": {"type": "array", "items": {"type": "string"}},
                                        "City": {"type": "array", "items": {"type": "string"}},
                                        "Administrative Division": {"type": "array", "items": {"type": "string"}},
                                    }
                                },
                                "LOCALITY": {
                                    "type": "object",
                                    "properties": {
                                        "Landscape": {"type": "array", "items": {"type": "string"}},
                                        "Nearby Places": {"type": "array", "items": {"type": "string"}},
                                    }
                                },
                                "COLLECTING": {
                                    "type": "object",
                                    "properties": {
                                        "Collector": {"type": "array", "items": {"type": "string"}},
                                        "Collector's Number": {"type": "array", "items": {"type": "string"}},
                                        "Verbatim Date": {"type": "array", "items": {"type": "string"}},
                                        "Formatted Date": {"type": "array", "items": {"type": "string"}},
                                        "Cultivation Status": {"type": "array", "items": {"type": "string"}},
                                        "Habitat Description": {"type": "array", "items": {"type": "string"}},
                                    }
                                },
                                "MISCELLANEOUS": {
                                    "type": "object",
                                    "properties": {
                                        "Additional Information": {"type": "array", "items": {"type": "string"}},
                                    }
                                }
                            }
                        },
                        "Summary": {
                            "type": "object",
                            "properties": {
                                "Content Summary": {"type": "array", "items": {"type": "string"}}
                            }
                        }
                    }
                }
            }
        ]