File size: 2,050 Bytes
fb4710e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import base64
import mimetypes

import pandas as pd
from dotenv import load_dotenv
from img2table.document import Image
from img2table.ocr import DocTR
from langchain.schema.messages import AIMessage, HumanMessage, SystemMessage
from langchain_experimental.agents import create_pandas_dataframe_agent
from langchain_openai import ChatOpenAI

load_dotenv()

def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is read in binary mode, base64-encoded, and the resulting
    bytes are decoded as UTF-8 so the value can be embedded in a string
    (for example a ``data:`` URL).
    """
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

# Image containing the table to analyse; the OCR step later in this script
# reads the same path.
path = '../NutriGenMe-Testing/ukmss-1.png'

# Vision-capable chat model asked to read the table directly from the image.
vision = ChatOpenAI(model="gpt-4-vision-preview", max_tokens=4096)

img = encode_image(path)

# Label the data URL with the MIME type that matches the file (PNG for the
# path above) instead of a hard-coded JPEG. Fall back to JPEG, the previous
# behaviour, when the extension is not recognised.
mime_type = mimetypes.guess_type(path)[0] or "image/jpeg"

msg = vision.invoke(
    [
        # The persona is an instruction to the model, so it is sent as a
        # system message rather than as an earlier assistant (AI) turn.
        SystemMessage(content="You are an experienced doctor specializing in genomics and want to identify names of genes, SNPs, and their related diseases based on the tables given."),
        HumanMessage(
            content=[
                {
                    "type": "text",
                    "text": 'You will be provided with the image of a table. Extract all genes / locus names with its respective rsID / SNP and potential diseases in curly brackets like this: {"Genes" : "", "SNPs" : "", "Diseases" : ""}.'
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:{mime_type};base64,{img}",
                        # NOTE(review): "low" detail presumably downsamples the
                        # image; confirm small table text stays legible.
                        "detail": "low"
                    },
                },
            ]
        ),
    ]
)

# Show the model's raw answer.
print(msg.content)

# exit()

# Detect tables in the same image with img2table, using DocTR as OCR backend.
image = Image(path)
ocr = DocTR()

# min_confidence=0 is the lowest threshold passed here, so presumably no OCR
# result is discarded for low confidence -- confirm against img2table docs.
extracted_tables = image.extract_tables(
    ocr=ocr,
    implicit_rows=True,
    borderless_tables=True,
    min_confidence=0,
)

# Stop with a clear message instead of an IndexError when nothing was found.
if not extracted_tables:
    raise SystemExit(f"No tables could be extracted from {path!r}")

# Stack every detected table into one frame with a fresh 0..n-1 index. A
# single concat avoids re-copying the accumulated frame once per table.
df = pd.concat([table.df for table in extracted_tables], ignore_index=True)

print(df)

# Chat model (temperature 0) that drives the DataFrame agent.
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
)

# Agent that answers natural-language questions about `df`; verbose=True is
# passed so the agent's intermediate output is shown.
agent = create_pandas_dataframe_agent(
    llm,
    df,
    verbose=True,
)

# Ask whether the extracted table holds gene names and print the raw result.
question = "Is this table contain Gene names?"
agent_output = agent.invoke(question)
print(agent_output)