Spaces:

phyloforfun
/

VoucherVision

Running

App Files Files Community

VoucherVision / vouchervision /LLM_chatGPT_3_5.py

phyloforfun

Add application file

87c3140 9 months ago

raw

history blame

No virus

21 kB

	import openai
	import os, json, sys, inspect, time, requests
	from langchain.output_parsers import StructuredOutputParser, ResponseSchema
	from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
	from langchain.llms import OpenAI
	from langchain.chat_models import ChatOpenAI, AzureChatOpenAI
	from langchain.schema import HumanMessage
	from general_utils import num_tokens_from_string

	# Resolve the directory containing this file, then append that directory's
	# parent to sys.path so modules located in the parent become importable.
	# NOTE(review): presumably needed by the project-local imports that follow
	# (`prompts`, `prompt_catalog`) — confirm where those modules live.
	currentdir = os.path.dirname(os.path.abspath(
	inspect.getfile(inspect.currentframe())))
	parentdir = os.path.dirname(currentdir)
	sys.path.append(parentdir)

	from prompts import PROMPT_UMICH_skeleton_all_asia, PROMPT_OCR_Organized, PROMPT_UMICH_skeleton_all_asia_GPT4, PROMPT_OCR_Organized_GPT4, PROMPT_JSON
	from prompt_catalog import PromptCatalog

	RETRY_DELAY = 61 # Seconds slept between failed attempts (61, i.e. just over one minute)
	MAX_RETRIES = 5 # Total number of attempts made by the retry loops (they iterate range(MAX_RETRIES))


	def azure_call(model, messages):
	    """Call ``model`` with ``messages`` passed by keyword and return the result.

	    Args:
	        model: Any callable accepting a ``messages`` keyword argument.
	        messages: The value forwarded unchanged as ``messages=``.

	    Returns:
	        Whatever ``model`` returns.
	    """
	    return model(messages=messages)

	def OCR_to_dict(is_azure, logger, MODEL, prompt, llm, prompt_version):
	    """Run the transcription prompt through the LLM, retrying on any exception.

	    Up to ``MAX_RETRIES`` attempts are made. After a failed attempt the error
	    is logged and the function sleeps ``RETRY_DELAY`` seconds; the exception
	    from the final attempt is re-raised.

	    Args:
	        is_azure: Selects the Azure code path (``llm`` via ``azure_call``)
	            instead of the OpenAI one.
	        logger: Object exposing ``info`` and ``error``.
	        MODEL: Model name used in log messages and for the OpenAI calls.
	        prompt: Prompt text sent to the model.
	        llm: Model object used on the Azure path.
	        prompt_version: Forwarded to ``structured_output_parser``.

	    Returns:
	        ``None`` when ``structured_output_parser`` returns ``None``, otherwise
	        the ``'Dictionary'`` entry of its result.
	    """
	    # Hard-wired switch: while it is True the direct-completion path further
	    # down is never executed.
	    use_structured_parser = True
	    final_attempt = MAX_RETRIES - 1

	    for attempt in range(MAX_RETRIES):
	        try:
	            if use_structured_parser:
	                logger.info(f'Waiting for {MODEL} API call --- Using StructuredOutputParser')
	                parsed = structured_output_parser(is_azure, MODEL, llm, prompt, logger, prompt_version)
	                return None if parsed is None else parsed['Dictionary']

	            ### Direct GPT ###
	            logger.info(f'Waiting for {MODEL} API call')
	            if is_azure:
	                reply = azure_call(llm, [HumanMessage(content=prompt)])
	                return reply.content

	            completion = openai.ChatCompletion.create(
	                model=MODEL,
	                temperature=0,
	                messages=[
	                    {"role": "system", "content": "You are a helpful assistant acting as a transcription expert and your job is to transcribe herbarium specimen labels based on OCR data and reformat it to meet Darwin Core Archive Standards into a Python dictionary based on certain rules."},
	                    {"role": "user", "content": prompt},
	                ],
	                max_tokens=4096,
	            )
	            # Hand back the raw text of the model's reply.
	            return completion.choices[0].message['content']
	        except Exception as err:
	            logger.error(f'{err}')
	            if attempt >= final_attempt:
	                # Out of attempts: surface the last error to the caller.
	                raise
	            time.sleep(RETRY_DELAY)

	# def OCR_to_dict(logger, MODEL, prompt, OCR, BASE_URL, HEADERS):
	# for i in range(MAX_RETRIES):
	# try:
	# do_use_SOP = False

	# if do_use_SOP:
	# logger.info(f'Waiting for {MODEL} API call --- Using StructuredOutputParser -- Content')
	# response = structured_output_parser(MODEL, OCR, prompt, logger)
	# if response is None:
	# return None
	# else:
	# return response['Dictionary']

	# else:
	# ### Direct GPT through Azure ###
	# logger.info(f'Waiting for {MODEL} API call')
	# response = azure_gpt_request(prompt, BASE_URL, HEADERS, model_name=MODEL)

	# # Handle the response data. Note: You might need to adjust the following line based on the exact response format of the Azure API.
	# content = response.get("choices", [{}])[0].get("message", {}).get("content", "")
	# return content
	# except requests.exceptions.RequestException as e: # Replace openai.error.APIError with requests exception.
	# # Handle HTTP exceptions. You can adjust this based on the Azure API's error responses.
	# if e.response.status_code == 502:
	# logger.info(f' * 502 error was encountered, wait and try again *')
	# if i < MAX_RETRIES - 1:
	# time.sleep(RETRY_DELAY)
	# else:
	# raise


	def OCR_to_dict_16k(is_azure, logger, MODEL, prompt, llm, prompt_version):
	    """Request a transcription via ``openai.ChatCompletion`` and return its ``'Dictionary'``.

	    The completion is requested with the ``format_C21_AA_V1`` function schema
	    attached (``function_call="none"``). The reply text is parsed as JSON; if
	    it is not valid JSON, or the reply text cannot be read from the response,
	    ``structured_output_parser_for_function_calls_fail`` is asked to repair it.

	    Up to ``MAX_RETRIES`` attempts are made, sleeping ``RETRY_DELAY`` seconds
	    after each failed attempt; the exception from the final attempt is
	    re-raised.

	    Args:
	        is_azure: Only forwarded to the JSON fixer; the main request always
	            goes through ``openai.ChatCompletion.create``.
	        logger: Object exposing ``info`` and ``error``.
	        MODEL: Model name for the completion request.
	        prompt: Prompt text sent as the user message.
	        llm: Forwarded to the JSON fixer.
	        prompt_version: Forwarded to the JSON fixer.

	    Returns:
	        The ``'Dictionary'`` entry of the parsed (or repaired) reply, or
	        ``None`` when the fixer returns ``None`` or the fallback repair fails.
	    """
	    # The function schema does not change between attempts, so build it once.
	    functions = FunctionSchema().format_C21_AA_V1()

	    def _repair(text):
	        # Ask the StructuredOutputParser-based fixer for a usable dictionary.
	        logger.info(f'Waiting for {MODEL} API call --- Using StructuredOutputParser --- JSON Fixer')
	        response_sop = structured_output_parser_for_function_calls_fail(is_azure, MODEL, text, logger, llm, prompt_version, is_helper=False)
	        return None if response_sop is None else response_sop['Dictionary']

	    for i in range(MAX_RETRIES):
	        try:
	            response = openai.ChatCompletion.create(
	                model=MODEL,
	                temperature=0,
	                messages=[
	                    {"role": "system", "content": "You are a helpful assistant acting as a transcription expert and your job is to transcribe herbarium specimen labels based on OCR data and reformat it to meet Darwin Core Archive Standards into a Python dictionary based on certain rules."},
	                    {"role": "user", "content": prompt},
	                ],
	                max_tokens=8000,
	                function_call="none",
	                functions=functions,
	            )

	            # Pull the reply text out of the response; if that is impossible,
	            # fall back to sending the original prompt to the fixer.
	            try:
	                response_string = response.choices[0].message['content']
	                call_failed = False
	            except Exception:
	                call_failed = True
	                response_string = prompt

	            if call_failed:
	                try:
	                    logger.info(f'Call Failed. Attempting fallback JSON parse without guidance')
	                    return _repair(response_string)
	                except Exception as fallback_error:
	                    # Best-effort path: report the failure instead of hiding it.
	                    logger.error(f'Fallback JSON parse failed: {fallback_error}')
	                    return None

	            try:
	                response_dict = json.loads(response_string)
	            except json.JSONDecodeError:
	                logger.info(f'Invalid JSON response, calling structured_output_parser_for_function_calls_fail function')
	                return _repair(response_string)
	            # A missing 'Dictionary' key propagates to the retry handler below.
	            return response_dict['Dictionary']
	        except Exception as e:
	            # Log the real error (the previous message always claimed a 401).
	            logger.error(f' * API call failed: {e} -- wait and try again *')
	            if i < MAX_RETRIES - 1:  # No delay needed after the last try
	                time.sleep(RETRY_DELAY)
	            else:
	                raise

	def structured_output_parser(is_azure, MODEL, llm, prompt_template, logger, prompt_version, is_helper=False):
	    """Send the prompt to the chat model and parse the reply with a ``StructuredOutputParser``.

	    Args:
	        is_azure: When true, ``llm`` is invoked through ``azure_call``;
	            otherwise a ``ChatOpenAI`` model is created for ``MODEL``.
	        MODEL: Model name used for ``ChatOpenAI`` and forwarded to the fixer.
	        llm: Model object used on the Azure path and forwarded to the fixer.
	        prompt_template: Text substituted for the ``{question}`` placeholder.
	        logger: Object exposing ``info`` and ``error``.
	        prompt_version: Forwarded to the fixer.
	        is_helper: Selects the response schemas: ``Dictionary`` + ``Summary``
	            when true, ``SpeciesName`` + ``Dictionary`` otherwise.

	    Returns:
	        The result of ``output_parser.parse`` on the cleaned reply; if parsing
	        raises, the result of
	        ``structured_output_parser_for_function_calls_fail``; ``None`` when the
	        output type is not ``'ai'`` or an unexpected error occurs.
	    """
	    if is_helper:
	        response_schemas = [
	            ResponseSchema(name="Dictionary", description='Formatted JSON object'),
	            ResponseSchema(name="Summary", description="A one sentence summary of the content"),
	        ]
	    else:
	        response_schemas = [
	            ResponseSchema(name="SpeciesName", description="Taxonomic determination, genus_species"),
	            ResponseSchema(name="Dictionary", description='Formatted JSON object'),
	        ]

	    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
	    format_instructions = output_parser.get_format_instructions()

	    prompt = ChatPromptTemplate(
	        messages=[
	            HumanMessagePromptTemplate.from_template("Parse the OCR text into the correct structured format.\n{format_instructions}\n{question}")
	        ],
	        input_variables=["question"],
	        partial_variables={"format_instructions": format_instructions}
	    )
	    _input = prompt.format_prompt(question=prompt_template)

	    # Handle Azure vs OpenAI implementation
	    if is_azure:
	        msg = HumanMessage(content=_input.to_string())
	        output = azure_call(llm, [msg])
	    else:
	        chat_model = ChatOpenAI(temperature=0, model=MODEL)
	        output = chat_model(_input.to_messages())

	    # Log token length if running with Gradio. Best effort only: a failure
	    # to count or log tokens must not abort the parse.
	    try:
	        nt = num_tokens_from_string(_input.to_string(), "cl100k_base")
	        logger.info(f'Prompt token length --- {nt}')
	    except Exception:
	        pass

	    try:
	        if output.type != 'ai':
	            logger.error('Output type is not "ai". Unable to parse.')
	            return None

	        parsed_content = output.content
	        logger.info(f'Formatted JSON\n{parsed_content}')

	        # Strip newlines, tabs and backslash-pipe sequences before parsing.
	        # '\\|' is the same two characters the former '\|' literal produced,
	        # written without the invalid escape sequence.
	        parsed_content = parsed_content.replace('\n', "").replace('\t', "").replace('\\|', "")

	        try:
	            return output_parser.parse(parsed_content)
	        except Exception as parse_error:
	            # Parsing failed: hand the cleaned text to the JSON fixer.
	            logger.error(f'Parsing Error: {parse_error}')
	            return structured_output_parser_for_function_calls_fail(is_azure, MODEL, parsed_content, logger, llm, prompt_version, is_helper)

	    except Exception as e:
	        # Anything else (including a failure inside the fixer) yields None.
	        logger.error(f'Unexpected Error: {e}')
	        return None

	def structured_output_parser_for_function_calls_fail(is_azure, MODEL, failed_response, logger, llm, prompt_version, is_helper=False, try_ind=0):
	    """Ask the chat model to repair a malformed JSON reply, recursing on failure.

	    A "redo" prompt is built from ``failed_response`` using the
	    ``PromptCatalog`` method selected by ``prompt_version``, sent to the
	    model, and the reply is parsed with a ``StructuredOutputParser``
	    (``Summary`` + ``Dictionary`` schemas). If parsing fails, the function
	    calls itself with the failed reply and ``try_ind + 1``.

	    Args:
	        is_azure: When true, ``llm`` is invoked through ``azure_call``;
	            otherwise a ``ChatOpenAI`` model is created for ``MODEL``.
	        MODEL: Model name used for ``ChatOpenAI``.
	        failed_response: Text containing the broken JSON to repair.
	        logger: Object exposing ``info``.
	        llm: Model object used on the Azure path.
	        prompt_version: Selects which redo prompt is generated.
	        is_helper: Not used here other than being passed to recursive calls.
	        try_ind: Recursion counter; once it exceeds 5 the function gives up.

	    Returns:
	        The parsed reply, or ``None`` once ``try_ind`` exceeds 5.
	    """
	    if try_ind > 5:
	        return None

	    catalog = PromptCatalog()
	    if prompt_version in ['prompt_v1_verbose', 'prompt_v1_verbose_noDomainKnowledge']:
	        prompt_redo = catalog.prompt_gpt_redo_v1(failed_response)
	    elif prompt_version in ['prompt_v2_json_rules']:
	        prompt_redo = catalog.prompt_gpt_redo_v2(failed_response)
	    else:
	        prompt_redo = catalog.prompt_v2_custom_redo(failed_response, is_palm=False)

	    response_schemas = [
	        ResponseSchema(name="Summary", description="A one sentence summary of the content"),
	        ResponseSchema(name="Dictionary", description='Formatted JSON object')
	    ]

	    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
	    format_instructions = output_parser.get_format_instructions()

	    prompt = ChatPromptTemplate(
	        messages=[
	            HumanMessagePromptTemplate.from_template("The following text contains JSON formatted text, but there is an error that you need to correct.\n{format_instructions}\n{question}")
	        ],
	        input_variables=["question"],
	        partial_variables={"format_instructions": format_instructions}
	    )

	    _input = prompt.format_prompt(question=prompt_redo)

	    # Log token length if running with Gradio. Best effort only: a failure
	    # to count or log tokens must not abort the repair.
	    try:
	        nt = num_tokens_from_string(_input.to_string(), "cl100k_base")
	        logger.info(f'Prompt Redo token length --- {nt}')
	    except Exception:
	        pass

	    if is_azure:
	        msg = HumanMessage(content=_input.to_string())
	        output = azure_call(llm, [msg])
	    else:
	        chat_model = ChatOpenAI(temperature=0, model=MODEL)
	        output = chat_model(_input.to_messages())

	    try:
	        return output_parser.parse(output.content)
	    except json.decoder.JSONDecodeError as e:
	        # Include the decoder's message so the next attempt can target it.
	        error_message = str(e)
	        redo_content = f'The error messsage is: {error_message}\nThe broken JSON object is: {output.content}'
	        logger.info(f'[Failed JSON Object]\n{output.content}')
	        return structured_output_parser_for_function_calls_fail(is_azure, MODEL, redo_content, logger, llm, prompt_version, is_helper, try_ind + 1)
	    except Exception:
	        # Any other parse failure: retry with the raw reply. Catching
	        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
	        # propagate instead of triggering another API call.
	        logger.info(f'[Failed JSON Object]\n{output.content}')
	        return structured_output_parser_for_function_calls_fail(is_azure, MODEL, output.content, logger, llm, prompt_version, is_helper, try_ind + 1)




	class FunctionSchema:
	    """Builds the function-schema lists passed as ``functions=`` to the chat API.

	    Each public method returns a new one-element list holding a dict with the
	    keys ``name``, ``description``, ``parameters``, ``output_type`` and
	    ``output_schema``. Every leaf field is described as an array of strings.
	    """

	    @staticmethod
	    def _string_arrays(field_names):
	        """Map each field name to its own array-of-strings schema, in order."""
	        return {
	            field: {"type": "array", "items": {"type": "string"}}
	            for field in field_names
	        }

	    @staticmethod
	    def _object_of(properties):
	        """Wrap ``properties`` in an object-typed schema node."""
	        return {"type": "object", "properties": properties}

	    @staticmethod
	    def _function_entry(name, description, output_properties):
	        """Assemble the single-entry function list shared by both formats."""
	        return [
	            {
	                "name": name,
	                "description": description,
	                "parameters": {
	                    "type": "object",
	                    "properties": {},  # the function itself takes no parameters
	                    "required": [],  # hence nothing is required
	                },
	                "output_type": "json",
	                "output_schema": {
	                    "type": "object",
	                    "properties": output_properties,
	                },
	            }
	        ]

	    def format_C21_AA_V1(self):
	        """Return the schema with a flat ``Dictionary`` plus ``SpeciesName``."""
	        dictionary_fields = (
	            "Catalog Number",
	            "Genus",
	            "Species",
	            "subspecies",
	            "variety",
	            "forma",
	            "Country",
	            "State",
	            "County",
	            "Locality Name",
	            "Min Elevation",
	            "Max Elevation",
	            "Elevation Units",
	            "Verbatim Coordinates",
	            "Datum",
	            "Cultivated",
	            "Habitat",
	            "Collectors",
	            "Collector Number",
	            "Verbatim Date",
	            "Date",
	            "End Date",
	        )
	        output_properties = {
	            "Dictionary": self._object_of(self._string_arrays(dictionary_fields)),
	            "SpeciesName": self._object_of(self._string_arrays(("taxonomy",))),
	        }
	        return self._function_entry(
	            "format_C21_AA_V1",
	            "Format the given data into a specific dictionary",
	            output_properties,
	        )

	    def format_C21_AA_V1_helper(self):
	        """Return the schema with a sectioned ``Dictionary`` plus ``Summary``."""
	        sections = (
	            ("TAXONOMY", (
	                "Order",
	                "Family",
	                "Genus",
	                "Species",
	                "Subspecies",
	                "Variety",
	                "Forma",
	            )),
	            ("GEOGRAPHY", (
	                "Country",
	                "State",
	                "Prefecture",
	                "Province",
	                "District",
	                "County",
	                "City",
	                "Administrative Division",
	            )),
	            ("LOCALITY", (
	                "Landscape",
	                "Nearby Places",
	            )),
	            ("COLLECTING", (
	                "Collector",
	                "Collector's Number",
	                "Verbatim Date",
	                "Formatted Date",
	                "Cultivation Status",
	                "Habitat Description",
	            )),
	            ("MISCELLANEOUS", (
	                "Additional Information",
	            )),
	        )
	        dictionary_properties = {}
	        for section_name, field_names in sections:
	            dictionary_properties[section_name] = self._object_of(self._string_arrays(field_names))

	        output_properties = {
	            "Dictionary": self._object_of(dictionary_properties),
	            "Summary": self._object_of(self._string_arrays(("Content Summary",))),
	        }
	        return self._function_entry(
	            "format_C21_AA_V1_helper",
	            "Helper function for format_C21_AA_V1 to further format the given data",
	            output_properties,
	        )