import base64
import mimetypes

import pandas as pd
from dotenv import load_dotenv
from img2table.document import Image
from img2table.ocr import DocTR
from langchain.schema.messages import AIMessage, HumanMessage, SystemMessage
from langchain_experimental.agents import create_pandas_dataframe_agent
from langchain_openai import ChatOpenAI
|
|
|
# Read key/value pairs from a local .env file into the process environment
# before any API client is constructed.
# NOTE(review): presumably this is what supplies the OpenAI API key used by
# the ChatOpenAI clients in this script - confirm against the .env file.
load_dotenv()
|
|
|
def encode_image(image_path) -> str:
    """Return the contents of a file as base64-encoded text.

    Args:
        image_path: Path of the file to read; anything ``open()`` accepts.

    Returns:
        The file's raw bytes encoded with standard base64, as a ``str``.
        An empty file yields an empty string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Binary mode: the bytes must reach the encoder untouched by any
    # newline or text-encoding translation.
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")
|
|
|
# Image of the table to analyse; reused further down for OCR table extraction.
path = '../NutriGenMe-Testing/ukmss-1.png'

# NOTE(review): "gpt-4-vision-preview" is a preview model name - confirm it is
# still served by the API before relying on this script.
vision = ChatOpenAI(model="gpt-4-vision-preview", max_tokens=4096)

img = encode_image(path)

# Build the data-URL MIME type from the file name instead of hard-coding
# "image/jpeg" (the input here is a PNG). Fall back to the previous value
# when the extension is unknown.
mime_type = mimetypes.guess_type(path)[0] or "image/jpeg"

msg = vision.invoke(
    [
        # The persona/instruction belongs in a system message; sending it as
        # an AIMessage presents it to the model as its own earlier reply.
        SystemMessage(content="You are an experienced doctor specializing in genomics and want to identify names of genes, SNPs, and their related diseases based on the tables given."),
        HumanMessage(
            content=[
                {
                    "type": "text",
                    "text": 'You will be provided with the image of a table. Extract all genes / locus names with its respective rsID / SNP and potential diseases in curly brackets like this: {"Genes" : "", "SNPs" : "", "Diseases" : ""}.',
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:{mime_type};base64,{img}",
                        # NOTE(review): "low" detail may make small table text
                        # hard to read - consider "high" if results are poor.
                        "detail": "low",
                    },
                },
            ]
        ),
    ]
)

print(msg.content)
|
|
|
|
|
|
|
# Detect tables in the same image with img2table, using docTR as OCR backend.
image = Image(path)
ocr = DocTR()

extracted_tables = image.extract_tables(
    ocr=ocr,
    implicit_rows=True,
    borderless_tables=True,
    min_confidence=0,
)

# Indexing [0] on an empty result would raise a bare IndexError; stop with an
# explicit message instead.
if not extracted_tables:
    raise SystemExit(f"No tables were detected in {path!r}")

# Stack every detected table into one frame with a fresh 0..n-1 index.
# A single concat avoids re-copying the accumulated frame once per table.
df = pd.concat([table.df for table in extracted_tables], ignore_index=True)

print(df)
|
|
|
# Chat model that drives the DataFrame agent; temperature=0 keeps its answers
# as repeatable as possible.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

# Agent that answers questions about the OCR-extracted table held in `df`;
# verbose=True makes it log its intermediate steps.
# NOTE(review): newer langchain_experimental releases may require
# allow_dangerous_code=True here - confirm against the installed version.
agent = create_pandas_dataframe_agent(llm, df, verbose=True)

question = "Is this table contain Gene names?"
agent_output = agent.invoke(question)
print(agent_output)