Spaces:

phyloforfun
/

VoucherVision

Running

File size: 21,018 Bytes

87c3140

import openai
import os, json, sys, inspect, time, requests
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI, AzureChatOpenAI
from langchain.schema import HumanMessage
from general_utils import num_tokens_from_string

# Resolve the directory containing this file, then append its parent directory
# to sys.path so that modules located one level up can be imported.
# NOTE(review): presumably this is what makes the `prompts` / `prompt_catalog`
# imports below resolvable -- confirm against the repository layout.
currentdir = os.path.dirname(os.path.abspath(
    inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.append(parentdir)

from prompts import PROMPT_UMICH_skeleton_all_asia, PROMPT_OCR_Organized, PROMPT_UMICH_skeleton_all_asia_GPT4, PROMPT_OCR_Organized_GPT4, PROMPT_JSON
from prompt_catalog import PromptCatalog

RETRY_DELAY = 61  # Seconds to sleep (time.sleep) between attempts after a failed API call
MAX_RETRIES = 5  # Total number of attempts per call (loops use range(MAX_RETRIES))


def azure_call(model, messages):
    """Call ``model`` with ``messages`` passed as a keyword argument and return the result unchanged."""
    return model(messages=messages)

def OCR_to_dict(is_azure, logger, MODEL, prompt, llm, prompt_version):
    """Send ``prompt`` to the LLM and return the transcribed result.

    The call is attempted up to ``MAX_RETRIES`` times.  Any exception is logged
    with ``logger.error``; all but the final attempt then sleep ``RETRY_DELAY``
    seconds and try again, while the final attempt re-raises.

    Args:
        is_azure: When true, the request is routed through ``llm`` (via
            ``azure_call``) instead of the OpenAI client.
        logger: Object providing ``info`` and ``error`` methods.
        MODEL: Model name, used for logging and for the OpenAI call.
        prompt: Prompt text sent to the model.
        llm: Chat model callable used on the Azure path.
        prompt_version: Passed through to ``structured_output_parser``.

    Returns:
        On the StructuredOutputParser path (currently always taken), the
        ``'Dictionary'`` entry of the parsed response, or ``None`` when the
        parser returned ``None``.  On the direct path, the raw message content.
    """
    # Hard-wired switch: the direct-GPT path below is kept but never selected.
    use_structured_parser = True
    final_attempt = MAX_RETRIES - 1

    for attempt in range(MAX_RETRIES):
        try:
            if use_structured_parser:
                logger.info(f'Waiting for {MODEL} API call --- Using StructuredOutputParser')
                parsed = structured_output_parser(is_azure, MODEL, llm, prompt, logger, prompt_version)
                return None if parsed is None else parsed['Dictionary']

            ### Direct GPT ###
            logger.info(f'Waiting for {MODEL} API call')
            if is_azure:
                reply = azure_call(llm, [HumanMessage(content=prompt)])
                return reply.content

            chat_messages = [
                {"role": "system", "content": "You are a helpful assistant acting as a transcription expert and your job is to transcribe herbarium specimen labels based on OCR data and reformat it to meet Darwin Core Archive Standards into a Python dictionary based on certain rules."},
                {"role": "user", "content": prompt},
            ]
            completion = openai.ChatCompletion.create(
                model=MODEL,
                temperature=0,
                messages=chat_messages,
                max_tokens=4096,
            )
            return completion.choices[0].message['content']
        except Exception as err:
            logger.error(f'{err}')
            if attempt >= final_attempt:
                # Out of attempts: surface the last error to the caller.
                raise
            time.sleep(RETRY_DELAY)

# def OCR_to_dict(logger, MODEL, prompt, OCR, BASE_URL, HEADERS):
#     for i in range(MAX_RETRIES):
#         try:
#             do_use_SOP = False

#             if do_use_SOP:
#                 logger.info(f'Waiting for {MODEL} API call --- Using StructuredOutputParser -- Content')
#                 response = structured_output_parser(MODEL, OCR, prompt, logger)
#                 if response is None:
#                     return None
#                 else:
#                     return response['Dictionary']

#             else:
#                 ### Direct GPT through Azure ###
#                 logger.info(f'Waiting for {MODEL} API call')
#                 response = azure_gpt_request(prompt, BASE_URL, HEADERS, model_name=MODEL)

#                 # Handle the response data. Note: You might need to adjust the following line based on the exact response format of the Azure API.
#                 content = response.get("choices", [{}])[0].get("message", {}).get("content", "")
#                 return content
#         except requests.exceptions.RequestException as e:  # Replace openai.error.APIError with requests exception.
#             # Handle HTTP exceptions. You can adjust this based on the Azure API's error responses.
#             if e.response.status_code == 502:
#                 logger.info(f'   ***    502 error was encountered, wait and try again   ***')
#                 if i < MAX_RETRIES - 1:
#                     time.sleep(RETRY_DELAY)
#             else:
#                 raise


def OCR_to_dict_16k(is_azure, logger, MODEL, prompt, llm, prompt_version):
    """Transcribe ``prompt`` with an OpenAI chat completion and return the parsed dictionary.

    The request is always sent through ``openai.ChatCompletion.create`` with the
    ``format_C21_AA_V1`` function schema attached (``function_call="none"``);
    ``is_azure`` and ``llm`` are only forwarded to the JSON-repair fallback.

    Flow per attempt:
      1. Read the message content from the response.
      2. If that succeeds, ``json.loads`` it and return its ``'Dictionary'`` entry.
         On ``json.JSONDecodeError`` the raw text is handed to
         ``structured_output_parser_for_function_calls_fail`` for repair.
      3. If the content cannot be read, the original ``prompt`` is handed to the
         same repair function; any error on this path yields ``None``.

    Any other exception is logged and the attempt is repeated after
    ``RETRY_DELAY`` seconds, up to ``MAX_RETRIES`` attempts; the final failure
    is re-raised.

    Args:
        is_azure: Forwarded to the repair fallback.
        logger: Object providing ``info`` and ``error`` methods.
        MODEL: Model name for the OpenAI call and for log messages.
        prompt: Prompt text sent as the user message.
        llm: Chat model forwarded to the repair fallback.
        prompt_version: Forwarded to the repair fallback.

    Returns:
        The ``'Dictionary'`` entry of the parsed/repaired response, or ``None``
        when the repair fallback produced nothing.
    """
    # The schema does not change between attempts, so build it once.
    functions = FunctionSchema().format_C21_AA_V1()

    for i in range(MAX_RETRIES):
        try:
            response = openai.ChatCompletion.create(
                model=MODEL,
                temperature=0,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant acting as a transcription expert and your job is to transcribe herbarium specimen labels based on OCR data and reformat it to meet Darwin Core Archive Standards into a Python dictionary based on certain rules."},
                    {"role": "user", "content": prompt},
                ],
                max_tokens=8000,
                function_call="none",
                functions=functions,
            )

            # Pull the text out of the response; if the shape is unexpected,
            # fall back to repairing from the original prompt instead.
            call_failed = False
            try:
                response_string = response.choices[0].message['content']
            except Exception:
                call_failed = True
                response_string = prompt

            if not call_failed:
                try:
                    response_dict = json.loads(response_string)
                    return response_dict['Dictionary']
                except json.JSONDecodeError:
                    # Not valid JSON: ask the model to repair it.  Errors raised
                    # by the repair call propagate to the retry handler below.
                    logger.info(f'Invalid JSON response, calling structured_output_parser_for_function_calls_fail function')
                    logger.info(f'Waiting for {MODEL} API call --- Using StructuredOutputParser --- JSON Fixer')
                    response_sop = structured_output_parser_for_function_calls_fail(is_azure, MODEL, response_string, logger, llm, prompt_version, is_helper=False)
                    return None if response_sop is None else response_sop['Dictionary']
            else:
                try:
                    logger.info(f'Call Failed. Attempting fallback JSON parse without guidance')
                    logger.info(f'Waiting for {MODEL} API call --- Using StructuredOutputParser --- JSON Fixer')
                    response_sop = structured_output_parser_for_function_calls_fail(is_azure, MODEL, response_string, logger, llm, prompt_version, is_helper=False)
                    return None if response_sop is None else response_sop['Dictionary']
                except Exception:
                    # Best-effort path: give up quietly rather than retrying.
                    return None
        except Exception as e:
            # Log the real cause (previously every failure was reported as a
            # "401 error" regardless of what actually went wrong).
            logger.error(f'{e}')
            if i < MAX_RETRIES - 1:  # No delay needed after the last try
                logger.info(f'   ***    API call failed, wait {RETRY_DELAY} seconds and try again   ***')
                time.sleep(RETRY_DELAY)
            else:
                # Out of attempts: re-raise the last error.
                raise
            
def structured_output_parser(is_azure, MODEL, llm, prompt_template, logger, prompt_version, is_helper=False):
    """Ask the chat model for a structured response and parse it with LangChain's StructuredOutputParser.

    Args:
        is_azure: When true, the request is sent through ``llm`` via
            ``azure_call``; otherwise a ``ChatOpenAI`` model is created for ``MODEL``.
        MODEL: Model name used for the non-Azure ``ChatOpenAI`` instance.
        llm: Chat model callable used on the Azure path.
        prompt_template: Text inserted as the ``question`` of the prompt.
        logger: Object providing ``info`` and ``error`` methods.
        prompt_version: Forwarded to the JSON-repair fallback.
        is_helper: Selects the response schema: ``Dictionary`` + ``Summary``
            when true, ``SpeciesName`` + ``Dictionary`` otherwise.

    Returns:
        The parsed response from ``output_parser.parse``; the result of
        ``structured_output_parser_for_function_calls_fail`` when parsing
        fails; or ``None`` when the output is not of type ``'ai'`` or an
        unexpected error occurs.
    """
    if is_helper:
        response_schemas = [
            ResponseSchema(name="Dictionary", description='Formatted JSON object'),
            ResponseSchema(name="Summary", description="A one sentence summary of the content"),
        ]
    else:
        response_schemas = [
            ResponseSchema(name="SpeciesName", description="Taxonomic determination, genus_species"),
            ResponseSchema(name="Dictionary", description='Formatted JSON object'),
        ]

    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    format_instructions = output_parser.get_format_instructions()

    prompt = ChatPromptTemplate(
        messages=[
            HumanMessagePromptTemplate.from_template("Parse the OCR text into the correct structured format.\n{format_instructions}\n{question}")
        ],
        input_variables=["question"],
        partial_variables={"format_instructions": format_instructions}
    )
    _input = prompt.format_prompt(question=prompt_template)

    # Handle Azure vs OpenAI implementation
    if is_azure:
        msg = HumanMessage(content=_input.to_string())
        output = azure_call(llm, [msg])
    else:
        chat_model = ChatOpenAI(temperature=0, model=MODEL)
        output = chat_model(_input.to_messages())

    # Log token length if running with Gradio
    try:
        nt = num_tokens_from_string(_input.to_string(), "cl100k_base")
        logger.info(f'Prompt token length --- {nt}')
    except Exception:
        # Token counting is informational only; never let it break the call.
        # (Exception, not a bare except, so Ctrl-C / SystemExit still propagate.)
        pass

    # Parse the output
    try:
        # Only messages of type 'ai' carry model content we can parse.
        if output.type == 'ai':
            parsed_content = output.content
            logger.info(f'Formatted JSON\n{parsed_content}')
        else:
            logger.error('Output type is not "ai". Unable to parse.')
            return None

        # Strip newlines, tabs and pipe characters before parsing.
        parsed_content = parsed_content.replace('\n', "").replace('\t', "").replace('|', "")

        try:
            refined_response = output_parser.parse(parsed_content)
            return refined_response
        except Exception as parse_error:
            # Parsing failed: hand the cleaned text to the JSON-repair helper.
            logger.error(f'Parsing Error: {parse_error}')
            return structured_output_parser_for_function_calls_fail(is_azure, MODEL, parsed_content, logger, llm, prompt_version, is_helper)

    except Exception as e:
        # Anything else (including errors raised by the repair helper).
        logger.error(f'Unexpected Error: {e}')
        return None

def structured_output_parser_for_function_calls_fail(is_azure, MODEL, failed_response, logger, llm, prompt_version, is_helper=False, try_ind=0):
    """Ask the chat model to repair a malformed JSON response, retrying recursively.

    A "redo" prompt is built from ``failed_response`` (the prompt variant is
    chosen by ``prompt_version``), sent to the model, and the reply is parsed
    with a ``Summary`` + ``Dictionary`` StructuredOutputParser.  If parsing
    fails, the function calls itself with the model's latest reply and an
    incremented ``try_ind``.

    Args:
        is_azure: When true, the request is sent through ``llm`` via
            ``azure_call``; otherwise a ``ChatOpenAI`` model is created for ``MODEL``.
        MODEL: Model name used for the non-Azure ``ChatOpenAI`` instance.
        failed_response: The text that previously failed to parse.
        logger: Object providing ``info`` and ``error`` methods.
        llm: Chat model callable used on the Azure path.
        prompt_version: Selects which redo prompt from ``PromptCatalog`` is used.
        is_helper: Not used here other than being passed to recursive calls.
        try_ind: Recursion depth; the function gives up once it exceeds 5.

    Returns:
        The parsed response, or ``None`` once ``try_ind`` exceeds 5.
    """
    if try_ind > 5:
        return None

    # prompt_redo = PROMPT_JSON('helper' if is_helper else 'dict', failed_response)
    Prompt = PromptCatalog()
    if prompt_version in ['prompt_v1_verbose', 'prompt_v1_verbose_noDomainKnowledge']:
        prompt_redo = Prompt.prompt_gpt_redo_v1(failed_response)
    elif prompt_version in ['prompt_v2_json_rules']:
        prompt_redo = Prompt.prompt_gpt_redo_v2(failed_response)
    else:
        prompt_redo = Prompt.prompt_v2_custom_redo(failed_response, is_palm=False)

    response_schemas = [
        ResponseSchema(name="Summary", description="A one sentence summary of the content"),
        ResponseSchema(name="Dictionary", description='Formatted JSON object')
    ]

    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
    format_instructions = output_parser.get_format_instructions()

    prompt = ChatPromptTemplate(
        messages=[
            HumanMessagePromptTemplate.from_template("The following text contains JSON formatted text, but there is an error that you need to correct.\n{format_instructions}\n{question}")
        ],
        input_variables=["question"],
        partial_variables={"format_instructions": format_instructions}
    )

    _input = prompt.format_prompt(question=prompt_redo)

    # Log token length if running with Gradio
    try:
        nt = num_tokens_from_string(_input.to_string(), "cl100k_base")
        logger.info(f'Prompt Redo token length --- {nt}')
    except Exception:
        # Token counting is informational only; never let it break the call.
        pass

    if is_azure:
        msg = HumanMessage(content=_input.to_string())
        output = azure_call(llm, [msg])
    else:
        chat_model = ChatOpenAI(temperature=0, model=MODEL)
        output = chat_model(_input.to_messages())

    try:
        refined_response = output_parser.parse(output.content)
    except json.decoder.JSONDecodeError as e:
        # Give the model the decoder's error message alongside the broken JSON.
        error_message = str(e)
        redo_content = f'The error messsage is: {error_message}\nThe broken JSON object is: {output.content}'
        logger.info(f'[Failed JSON Object]\n{output.content}')
        refined_response = structured_output_parser_for_function_calls_fail(is_azure, MODEL, redo_content, logger, llm, prompt_version, is_helper, try_ind + 1)
    except Exception as e:
        # Any other parse failure: retry with the raw reply.  Catching
        # Exception (not a bare except) lets KeyboardInterrupt / SystemExit
        # stop the repair loop instead of triggering further API calls.
        logger.error(f'Parsing Error: {e}')
        logger.info(f'[Failed JSON Object]\n{output.content}')
        refined_response = structured_output_parser_for_function_calls_fail(is_azure, MODEL, output.content, logger, llm, prompt_version, is_helper, try_ind + 1)

    return refined_response




class FunctionSchema:
    """Factory for the function-schema lists passed as ``functions`` to the chat completion call.

    Every leaf field in both schemas is an array of strings, so the schemas are
    assembled from ordered tuples of field names rather than spelled out by hand.
    Each call returns freshly built, independent dict/list objects.
    """

    def __init__(self):
        pass

    @staticmethod
    def _string_array():
        """Return a new JSON-schema fragment describing an array of strings."""
        return {"type": "array", "items": {"type": "string"}}

    @classmethod
    def _object_of_string_arrays(cls, field_names):
        """Return an object schema whose properties (in order) are string arrays."""
        return {
            "type": "object",
            "properties": {field: cls._string_array() for field in field_names},
        }

    @staticmethod
    def _function_spec(name, description, output_properties):
        """Wrap ``output_properties`` in the single-element function description list."""
        return [
            {
                "name": name,
                "description": description,
                "parameters": {
                    "type": "object",
                    "properties": {},  # specify parameters here if your function requires any
                    "required": [],  # list of required parameters
                },
                "output_type": "json",
                "output_schema": {
                    "type": "object",
                    "properties": output_properties,
                },
            }
        ]

    def format_C21_AA_V1(self):
        """Return the flat schema: a ``Dictionary`` of label fields plus ``SpeciesName``."""
        label_fields = (
            "Catalog Number",
            "Genus",
            "Species",
            "subspecies",
            "variety",
            "forma",
            "Country",
            "State",
            "County",
            "Locality Name",
            "Min Elevation",
            "Max Elevation",
            "Elevation Units",
            "Verbatim Coordinates",
            "Datum",
            "Cultivated",
            "Habitat",
            "Collectors",
            "Collector Number",
            "Verbatim Date",
            "Date",
            "End Date",
        )
        return self._function_spec(
            "format_C21_AA_V1",
            "Format the given data into a specific dictionary",
            {
                "Dictionary": self._object_of_string_arrays(label_fields),
                "SpeciesName": self._object_of_string_arrays(("taxonomy",)),
            },
        )

    def format_C21_AA_V1_helper(self):
        """Return the sectioned schema: a ``Dictionary`` grouped by category plus ``Summary``."""
        sections = (
            ("TAXONOMY", (
                "Order",
                "Family",
                "Genus",
                "Species",
                "Subspecies",
                "Variety",
                "Forma",
            )),
            ("GEOGRAPHY", (
                "Country",
                "State",
                "Prefecture",
                "Province",
                "District",
                "County",
                "City",
                "Administrative Division",
            )),
            ("LOCALITY", (
                "Landscape",
                "Nearby Places",
            )),
            ("COLLECTING", (
                "Collector",
                "Collector's Number",
                "Verbatim Date",
                "Formatted Date",
                "Cultivation Status",
                "Habitat Description",
            )),
            ("MISCELLANEOUS", (
                "Additional Information",
            )),
        )
        grouped_dictionary = {
            "type": "object",
            "properties": {
                section: self._object_of_string_arrays(fields)
                for section, fields in sections
            },
        }
        return self._function_spec(
            "format_C21_AA_V1_helper",
            "Helper function for format_C21_AA_V1 to further format the given data",
            {
                "Dictionary": grouped_dictionary,
                "Summary": self._object_of_string_arrays(("Content Summary",)),
            },
        )