Spaces:
Sleeping
Sleeping
from constants import DOCUMENT_COLLECTION | |
from openai_constants import ENTITY_EXTRACTION_PROMPT, ENTITY_EXTRACTION_FUNCTION, GPT35_PRICING | |
def extract_all_documents(openai_instance, chunks):
    """Run entity extraction on every chunk and merge the per-chunk results.

    Each chunk is passed to ``openai_instance.generate_response`` together
    with ``ENTITY_EXTRACTION_PROMPT`` and ``ENTITY_EXTRACTION_FUNCTION``.
    The ``'function_output'`` mapping of every response is folded into one
    dictionary whose values are always lists: list values are concatenated,
    any other value is appended as a single item.

    Args:
        openai_instance: Object exposing
            ``generate_response(prompt, chunk, function)``. The response is
            indexed with ``'function_output'`` (a mapping) and ``'usage'``
            (a mapping that may hold ``'prompt_tokens'`` and
            ``'completion_tokens'``).
        chunks: Sized iterable of document chunks.

    Returns:
        A ``(all_entities, all_usage)`` tuple. ``all_usage`` holds the token
        totals plus ``'input_pricing'`` / ``'output_pricing'``. With no
        chunks the totals and prices are all zero.
    """
    all_entities = {}
    total_prompt_tokens = 0
    total_completion_tokens = 0

    print(f"Number of chunks to process :: {len(chunks)}")
    for chunk_idx, chunk in enumerate(chunks):
        print(f"Sending request to OpenAI for {chunk_idx}")
        openai_entities_out = openai_instance.generate_response(
            ENTITY_EXTRACTION_PROMPT, chunk, ENTITY_EXTRACTION_FUNCTION
        )
        print("OpenAI out received")
        print(openai_entities_out['function_output'])

        for key, val in openai_entities_out['function_output'].items():
            print(key, val)
            # setdefault always starts from a fresh list, so the accumulator
            # never aliases (and later mutates) a list owned by the response.
            merged = all_entities.setdefault(key, [])
            if isinstance(val, list):
                merged.extend(val)
            else:
                merged.append(val)

        # Either counter may be absent from the usage mapping; count it as 0.
        usage = openai_entities_out['usage']
        total_prompt_tokens += usage.get('prompt_tokens', 0)
        total_completion_tokens += usage.get('completion_tokens', 0)

    # Prompt tokens are billed at the input rate and completion tokens at the
    # output rate. The division by 1000 assumes GPT35_PRICING is quoted per
    # 1K tokens -- confirm against openai_constants.
    all_usage = {
        'prompt_tokens': total_prompt_tokens,
        'completion_tokens': total_completion_tokens,
        'output_pricing': total_completion_tokens / 1000 * GPT35_PRICING['output'],
        'input_pricing': total_prompt_tokens / 1000 * GPT35_PRICING['input'],
    }
    return all_entities, all_usage
def process_insurance_document(pii_instance, mongo_instance, openai_instance, ocr_instance,
                               document_path, document_id):
    """OCR a document, mask its PII, and extract entities from the masked text.

    Steps, in order: ``ocr_instance.llama_parse_ocr`` -> ``pii_instance``
    ``identify`` / ``add_mask`` / ``anonymize`` ->
    ``ocr_instance.chunk_document`` -> ``extract_all_documents``.

    An OCR failure is caught and logged; the rest of the pipeline then runs
    on an empty string. Failures in any later step propagate to the caller.

    Args:
        pii_instance: Object exposing ``identify(text)``,
            ``add_mask(entities)`` and ``anonymize(entities, text)``.
        mongo_instance: Currently unused; only the disabled DB updates
            reference it.
        openai_instance: Forwarded to ``extract_all_documents``.
        ocr_instance: Object exposing ``llama_parse_ocr(path)`` and
            ``chunk_document(text)``.
        document_path: Passed to the OCR call.
        document_id: Currently unused; only the disabled DB updates
            reference it.

    Returns:
        Dict with keys ``"entities"`` (merged extraction output),
        ``"masked_text"`` (anonymized document text) and
        ``"masked_entities"`` (the PII entities after ``add_mask``).
    """
    print("---- \nInside Process insurance document function")

    # TODO: save file to S3. The URL and status values below are consumed
    # only by the disabled DB updates further down; keep them in sync.
    document_s3_url = ""

    # OCR (best effort: a failure is recorded and processing continues).
    try:
        # document_text = ocr_instance.extract_text_from_document(document_path)
        document_text = ocr_instance.llama_parse_ocr(document_path)
        ocr_status = "Completed"
        process_status = "OCR Completed"
        print("OCR complete")
    except Exception as ex:
        document_text = ""
        # Keep the status a plain string, like the success value, so it can
        # be stored by the DB update once that is re-enabled.
        ocr_status = str(ex)
        process_status = f"OCR Failed. {ex}"
        print(process_status)

    # TODO: save OCR file to S3, add document S3 url.
    ocr_document_s3_url = ""

    # TODO: update ocr_status in db.
    # mongo_instance.update(DOCUMENT_COLLECTION,
    #                       {'document_id': document_id},
    #                       {'set': {'ocr_status': ocr_status, 'document_s3_url': document_s3_url,
    #                                'ocr_document_s3_url': ocr_document_s3_url,
    #                                'process_status': process_status}})
    # NOTE(review): this message is printed although the update above is
    # disabled.
    print("OCR status updated in db")

    # PII entity extraction and masking.
    # NOTE(review): the prints below write the detected PII entities to
    # stdout -- confirm that is acceptable for this deployment.
    pii_entities = pii_instance.identify(document_text)
    print(f"pii entiites are :: {pii_entities}")
    pii_entities = pii_instance.add_mask(pii_entities)
    print(f"\npii_entities after adding mask :: {pii_entities}")
    masked_text = pii_instance.anonymize(pii_entities, document_text)
    print(f"\nPII anonumized text is :: {masked_text}")
    print("\nPII complete")

    # OpenAI extraction runs on the masked text only.
    # NOTE(review): a try/except that recorded a "Document term extraction
    # failed" status used to wrap this call and is disabled; errors from the
    # extraction currently propagate to the caller.
    chunks = ocr_instance.chunk_document(masked_text)
    openai_entities, all_usage = extract_all_documents(openai_instance, chunks)
    entity_extraction_status = 'Completed'
    process_status = 'Document term extraction completed'

    print(f"openai_entities are :: {openai_entities}")
    print("Request to OpenAI complete")
    print("----------- \nProcessing complete\n ")

    # TODO: unmask PII entities in openai entities.
    # TODO: update entity extraction status in db.
    # mongo_instance.update(DOCUMENT_COLLECTION,
    #                       {'document_id': document_id},
    #                       {'set': {'entity_extraction_status': entity_extraction_status,
    #                                'entities': openai_entities,
    #                                'process_status': process_status}})
    # print(f"Entities updated in DB")

    out = {
        "entities": openai_entities,
        "masked_text": masked_text,
        "masked_entities": pii_entities,
    }
    return out