# Source: entity_extraction/helpers/entity_extraction_helpers.py
# Commit ac9589b ("Update helpers/entity_extraction_helpers.py") by adit94.
from constants import DOCUMENT_COLLECTION
from openai_constants import ENTITY_EXTRACTION_PROMPT, ENTITY_EXTRACTION_FUNCTION, GPT35_PRICING
def extract_all_documents(openai_instance, chunks):
    """Run entity extraction on every chunk and merge the results.

    Each chunk is sent to ``openai_instance.generate_response`` together with
    ``ENTITY_EXTRACTION_PROMPT`` and ``ENTITY_EXTRACTION_FUNCTION``. The
    ``'function_output'`` mapping of every response is merged into a single
    dict whose values are always lists, and token usage is summed over all
    chunks.

    Args:
        openai_instance: Object exposing
            ``generate_response(prompt, chunk, function)`` which returns a
            mapping with ``'function_output'`` and ``'usage'`` entries.
        chunks: Sized iterable of document chunks (``len()`` is called on it).

    Returns:
        A ``(all_entities, all_usage)`` tuple.

        ``all_entities`` maps each key seen in any ``function_output`` to the
        list of all values collected for it (list values are concatenated,
        other values are appended).

        ``all_usage`` holds ``'prompt_tokens'``, ``'completion_tokens'``,
        ``'output_pricing'`` and ``'input_pricing'``; each pricing entry is
        ``tokens / 1000`` multiplied by the matching ``GPT35_PRICING`` rate.
    """
    all_entities = {}
    total_prompt_tokens = 0
    total_completion_tokens = 0

    print(f"Number of chunks to process :: {len(chunks)}")
    for chunk_idx, chunk in enumerate(chunks):
        print(f"Sending request to OpenAI for {chunk_idx}")
        openai_entities_out = openai_instance.generate_response(
            ENTITY_EXTRACTION_PROMPT, chunk, ENTITY_EXTRACTION_FUNCTION
        )
        print("OpenAI out received")
        function_output = openai_entities_out['function_output']
        print(function_output)

        for key, val in function_output.items():
            print(key, val)
            # setdefault always starts from a list owned by this function, so
            # merging later chunks never mutates a list that lives inside an
            # earlier response object.
            merged = all_entities.setdefault(key, [])
            if isinstance(val, list):
                merged.extend(val)
            else:
                merged.append(val)

        # Usage counters are optional in the response; count only what is there.
        usage = openai_entities_out['usage']
        if 'prompt_tokens' in usage:
            total_prompt_tokens += usage['prompt_tokens']
        if 'completion_tokens' in usage:
            total_completion_tokens += usage['completion_tokens']

    all_usage = {
        'prompt_tokens': total_prompt_tokens,
        'completion_tokens': total_completion_tokens,
        # Completion tokens are billed at the output rate and prompt tokens at
        # the input rate (the two rates were previously swapped).
        'output_pricing': total_completion_tokens / 1000 * GPT35_PRICING['output'],
        'input_pricing': total_prompt_tokens / 1000 * GPT35_PRICING['input'],
    }
    return all_entities, all_usage
def process_insurance_document(pii_instance, mongo_instance, openai_instance, ocr_instance,
                               document_path, document_id):
    """OCR a document, mask its PII, and extract entities from the masked text.

    Steps, in order:
      1. ``ocr_instance.llama_parse_ocr(document_path)`` produces the text.
         If it raises, the failure is printed and the remaining steps run on
         an empty string.
      2. ``pii_instance.identify`` / ``add_mask`` / ``anonymize`` produce the
         masked entities and the masked text.
      3. ``ocr_instance.chunk_document`` splits the masked text and
         ``extract_all_documents`` runs the OpenAI extraction on the chunks.

    ``mongo_instance`` and ``document_id`` are accepted but not used at the
    moment: the S3 upload and database status updates are not implemented /
    disabled (see the TODO notes in the body). The token usage returned by
    ``extract_all_documents`` is discarded.

    Returns:
        dict with keys ``"entities"`` (merged extraction output),
        ``"masked_text"`` and ``"masked_entities"``.
    """
    print("---- \nInside Process insurance document function")

    # TODO: upload the original file to S3 and record its URL.

    # --- OCR ---
    try:
        raw_text = ocr_instance.llama_parse_ocr(document_path)
        print("OCR complete")
    except Exception as ocr_error:
        # Best effort: keep going with empty text so the caller still gets a
        # well-formed result.
        raw_text = ""
        print(f"OCR Failed. {ocr_error}")

    # TODO: upload the OCR output to S3, then persist ocr_status,
    # document_s3_url, ocr_document_s3_url and process_status for
    # {'document_id': document_id} in DOCUMENT_COLLECTION via mongo_instance.
    # NOTE(review): the message below is printed although no update happens.
    print("OCR status updated in db")

    # --- PII detection and masking ---
    detected_pii = pii_instance.identify(raw_text)
    print(f"pii entiites are :: {detected_pii}")

    masked_pii = pii_instance.add_mask(detected_pii)
    print(f"\npii_entities after adding mask :: {masked_pii}")

    masked_text = pii_instance.anonymize(masked_pii, raw_text)
    print(f"\nPII anonumized text is :: {masked_text}")
    print("\nPII complete")

    # --- OpenAI entity extraction on the masked text ---
    # Errors raised here propagate to the caller (no try/except around it).
    extracted_entities, _usage = extract_all_documents(
        openai_instance,
        ocr_instance.chunk_document(masked_text),
    )

    print(f"openai_entities are :: {extracted_entities}")
    print("Request to OpenAI complete")
    print("----------- \nProcessing complete\n ")

    # TODO: unmask PII inside the extracted entities, then persist
    # entity_extraction_status, entities and process_status for
    # {'document_id': document_id} in DOCUMENT_COLLECTION via mongo_instance.

    return {
        "entities": extracted_entities,
        "masked_text": masked_text,
        "masked_entities": masked_pii,
    }