import chromadb
from chromadb.utils import embedding_functions


def create_client():
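    """Create the persistent Chroma collection with a SentenceTransformer
    embedding function. Note: create_collection raises if the collection
    already exists; get_or_create_collection could be used instead to make
    this idempotent."""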
    client = chromadb.PersistentClient(path="./chromadb_linux/")
    MODEL_NAME: str = "mixedbread-ai/mxbai-embed-large-v1"  # ~ 0.5 gb
    COLLECTION_NAME: str = "schemer2"
    EMBEDDING_FUNC = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=MODEL_NAME
    )
    schemer = client.create_collection(
        name=COLLECTION_NAME,
        embedding_function=EMBEDDING_FUNC,
    )
    return schemer

def get_client():
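    """Open an existing persistent Chroma collection (note the collection
    name here is "scheme", not the "schemer2" created in create_client)."""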
    client = chromadb.PersistentClient(path="./chromadb_linux/")
    MODEL_NAME: str = "mixedbread-ai/mxbai-embed-large-v1"  # ~ 0.5 gb
    COLLECTION_NAME: str = "scheme"
    EMBEDDING_FUNC = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=MODEL_NAME
    )
    schemer = client.get_collection(
        name=COLLECTION_NAME,
        embedding_function=EMBEDDING_FUNC,
    )
    return schemer


def update_collection(index: int, text: dict, client: chromadb.Collection):
    # The id combines the page's content description with its page index.
    client.add(documents=[text["text"]], metadatas=[{"source": "pdf"}], ids=[text["content"] + str(index)])


def encode_image(image) -> str:
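    """Serialize a PIL image to JPEG and return it as a base64 string."""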
    import io
    import base64
    
    byte_arr = io.BytesIO()
    image.save(byte_arr, format="JPEG")
    encoded_image = base64.b64encode(byte_arr.getvalue()).decode("utf-8")
    return encoded_image


async def image_to_text(image) -> object:
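    """Send a base64-encoded page image to the vision model and parse the
    JSON transcription it returns. The synchronous OpenAI client is used
    here, so the call blocks the event loop despite the async signature;
    AsyncOpenAI could be swapped in if real concurrency is needed."""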
    from openai import OpenAI
    import json

    client = OpenAI()

    response = client.chat.completions.create(
        model="gpt-4-turbo",
        response_format={"type": "json_object"},
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Transcribe the contents of this image and return a JSON object that contains the text. It must be structured in the following manner: two entries with the following keys: 'content' and 'text'. Content will be a line describing what the content of text will be, and text will be a simple transcription of the image"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64;,{image}",
                            "detail": "high",
                        },
                    },
                ],
            }
        ],
    )
    return json.loads(response.choices[0].message.content)


async def start_troggin_off(directory: str, client):
    # Recursively walk the directory tree: convert each PDF into page images,
    # transcribe every page, and add the result to the collection.
    import os
    from pdf2image import convert_from_path

    for path in os.listdir(directory):
        full_path = os.path.join(directory, path)
        if os.path.isdir(full_path):
            await start_troggin_off(full_path, client)  # recursive call

        if full_path.endswith(".pdf"):
            images = convert_from_path(full_path)

            for i, image in enumerate(images):
                encoded_image = encode_image(image)
                text = await image_to_text(encoded_image)
                update_collection(i, text, client)

if __name__ == "__main__":
    import asyncio
    client = create_client()
    # client = None
    asyncio.run(start_troggin_off("data/", client))