"""Transcribe PDF pages with an OpenAI vision model and index them in Chroma.

Run as a script, this walks ``data/`` recursively, renders every PDF page to
an image, asks the model for a JSON transcription of the page and stores the
transcribed text in a persistent Chroma collection.
"""

import asyncio
import base64
import io
import json
import os

import chromadb
from chromadb.utils import embedding_functions

# Settings shared by create_client() and get_client().
_CHROMA_PATH = "./chromadb_linux/"
_MODEL_NAME = "mixedbread-ai/mxbai-embed-large-v1"  # ~ 0.5 gb


def _embedding_function():
    """Build the SentenceTransformer embedding function for ``_MODEL_NAME``."""
    return embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=_MODEL_NAME
    )


def create_client():
    """Create the ``schemer2`` collection in the persistent store and return it.

    Despite the name, the returned object is the collection, not the
    ``PersistentClient``.

    NOTE(review): ``create_collection`` presumably fails when the collection
    already exists (e.g. on a second run) -- confirm against the chromadb docs.
    """
    client = chromadb.PersistentClient(path=_CHROMA_PATH)
    return client.create_collection(
        name="schemer2",
        embedding_function=_embedding_function(),
    )


def get_client():
    """Fetch the existing ``scheme`` collection from the persistent store.

    Despite the name, the returned object is the collection, not the
    ``PersistentClient``.

    NOTE(review): this reads ``scheme`` while create_client() writes
    ``schemer2`` -- confirm that the two names are meant to differ.
    """
    client = chromadb.PersistentClient(path=_CHROMA_PATH)
    return client.get_collection(
        name="scheme",
        embedding_function=_embedding_function(),
    )


def update_collection(iter: int, text: dict, client: chromadb.Collection) -> None:
    """Add one transcribed page to the collection.

    Args:
        iter: Page index; appended to the description to form the record id.
        text: Mapping with a ``"content"`` key (short description) and a
            ``"text"`` key (the transcription), as produced by image_to_text().
        client: Collection that receives the document.

    Raises:
        KeyError: If ``text`` lacks the ``"content"`` or ``"text"`` key.
    """
    # NOTE(review): the id is description + page index only, so two pages with
    # the same description and index (e.g. from different PDFs) share an id.
    record_id = text["content"] + str(iter)
    client.add(
        documents=[text["text"]],
        metadatas=[{"source": "pdf"}],
        ids=[record_id],
    )


def encode_image(image) -> str:
    """Return ``image`` serialised as JPEG and base64-encoded.

    Args:
        image: Object exposing ``save(fp, format=...)``; presumably a PIL
            image as returned by pdf2image -- TODO confirm.

    Returns:
        The base64 text of the JPEG bytes.
    """
    with io.BytesIO() as buffer:
        image.save(buffer, format="JPEG")
        return base64.b64encode(buffer.getvalue()).decode("utf-8")


async def image_to_text(image) -> dict:
    """Ask the OpenAI vision model to transcribe a base64-encoded JPEG.

    Args:
        image: Base64 text of a JPEG, as returned by encode_image().

    Returns:
        The model's reply parsed from JSON. The prompt requests the keys
        ``"content"`` and ``"text"``, but the reply is not validated here.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    # Imported lazily so the module can be loaded without the openai package.
    from openai import OpenAI

    prompt = (
        "Transcribe the contents of this image and return a JSON object that contains the text. "
        "It must be structured in the following manner: two entries with the following keys: 'content' and 'text'. "
        "Content will be a line describing what the content of text will be, and text will be a simple transcription of the image"
    )

    # NOTE: this is the synchronous client, so the call blocks the event loop
    # until the request completes.
    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-4-turbo",
        response_format={"type": "json_object"},
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            # A data URL is "data:<mime>;base64,<payload>"; the
                            # previous "base64;," form was malformed.
                            "url": f"data:image/jpeg;base64,{image}",
                            "detail": "high",
                        },
                    },
                ],
            }
        ],
    )
    return json.loads(response.choices[0].message.content)


async def start_troggin_off(dir: str, client):
    """Recursively transcribe every PDF under ``dir`` into the collection.

    Args:
        dir: Directory to walk.
        client: Collection passed through to update_collection().
    """
    # Imported lazily so the module can be loaded without pdf2image.
    from pdf2image import convert_from_path

    for entry in os.listdir(dir):
        full_path = os.path.join(dir, entry)

        if os.path.isdir(full_path):
            await start_troggin_off(full_path, client)  # recursive call
            # A directory is never a PDF itself, even if its name ends in
            # ".pdf", so do not fall through to the conversion below.
            continue

        # Case-insensitive so that "FILE.PDF" is picked up as well.
        if not full_path.lower().endswith(".pdf"):
            continue

        for i, image in enumerate(convert_from_path(full_path)):
            encoded_image = encode_image(image)
            text = await image_to_text(encoded_image)
            update_collection(i, text, client)


if __name__ == "__main__":
    client = create_client()
    asyncio.run(start_troggin_off("data/", client))