LordFarquaad42 committed on
Commit
1612f56
1 Parent(s): a25aac3

hacker mode

Browse files
Files changed (2) hide show
  1. add_data.py +35 -21
  2. app.py +10 -2
add_data.py CHANGED
@@ -2,6 +2,19 @@ import chromadb
2
  from chromadb.utils import embedding_functions
3
 
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  def get_client():
6
  client = chromadb.PersistentClient(path="./chromadb_linux/")
7
  MODEL_NAME: str = "mixedbread-ai/mxbai-embed-large-v1" # ~ 0.5 gb
@@ -20,11 +33,14 @@ def update_collection(iter: int, text: object, client: chromadb.Collection):
20
  client.add(documents=[text["text"]], metadatas=[{"source": "pdf"}], ids=[text["content"] + str(iter)])
21
 
22
 
23
- def encode_image(img_path: str):
 
24
  import base64
25
-
26
- with open(img_path, "rb") as image_file:
27
- return base64.b64encode(image_file.read()).decode("utf-8")
 
 
28
 
29
 
30
  async def image_to_text(image) -> object:
@@ -55,28 +71,26 @@ async def image_to_text(image) -> object:
55
  return json.loads(response.choices[0].message.content)
56
 
57
 
58
- async def start_troggin_off(dir: str):
 
59
  import os
60
  from pdf2image import convert_from_path
61
 
62
- client = get_client()
63
-
64
- for folder in os.listdir(dir):
65
- folder_path = os.path.join(dir, folder)
66
- if os.path.isdir(folder_path):
67
- for file in os.listdir(folder_path):
68
- if file.endswith(".pdf"):
69
- print("Processing", file)
70
- pdf_path = os.path.join(folder_path, file)
71
- images = convert_from_path(pdf_path)
72
 
73
- for i, image in enumerate(images):
74
- image.save(f"out{i}.jpg", "JPEG")
75
- encoded_image = encode_image(f"out{i}.jpg")
76
- text = await image_to_text(encoded_image)
77
- update_collection(i, text, client)
78
 
 
 
 
 
79
 
80
  if __name__ == "__main__":
81
  import asyncio
82
- asyncio.run(start_troggin_off("data/Class Notes/"))
 
 
 
2
  from chromadb.utils import embedding_functions
3
 
4
 
5
def create_client():
    """Open the persistent Chroma store and return the ``schemer2`` collection.

    The collection is bound to the ``mixedbread-ai/mxbai-embed-large-v1``
    sentence-transformer embedding function.  It is created on first use so
    that ingestion also works against an empty ``./chromadb_linux/`` store;
    a plain ``get_collection`` raises when the collection does not exist yet.

    Returns:
        The Chroma collection object.  Note that, despite the function's
        name, this is the collection and not the ``PersistentClient`` itself.
    """
    model_name = "mixedbread-ai/mxbai-embed-large-v1"  # ~ 0.5 gb
    collection_name = "schemer2"

    client = chromadb.PersistentClient(path="./chromadb_linux/")
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=model_name
    )
    # get_or_create keeps the existing collection (and its data) when present
    # and only creates an empty one when it is missing.
    return client.get_or_create_collection(
        name=collection_name,
        embedding_function=embedding_func,
    )
17
+
18
  def get_client():
19
  client = chromadb.PersistentClient(path="./chromadb_linux/")
20
  MODEL_NAME: str = "mixedbread-ai/mxbai-embed-large-v1" # ~ 0.5 gb
 
33
  client.add(documents=[text["text"]], metadatas=[{"source": "pdf"}], ids=[text["content"] + str(iter)])
34
 
35
 
36
def encode_image(image) -> str:
    """Return *image* as a base64-encoded JPEG string.

    The image is serialised entirely in memory (no temporary file on disk).

    Args:
        image: A PIL-style image object exposing ``mode``, ``save(fp, format=...)``
            and ``convert(mode)``.

    Returns:
        The JPEG bytes of the image, base64-encoded and decoded as UTF-8 text.
    """
    import io
    import base64

    # JPEG has no alpha channel or palette; saving such an image raises, so
    # flatten those modes to RGB first.  Other modes are passed through as-is.
    if getattr(image, "mode", "RGB") in ("RGBA", "LA", "P", "PA"):
        image = image.convert("RGB")

    buffer = io.BytesIO()
    image.save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
44
 
45
 
46
  async def image_to_text(image) -> object:
 
71
  return json.loads(response.choices[0].message.content)
72
 
73
 
74
async def start_troggin_off(dir: str, client):
    """Recursively ingest every PDF found under *dir* into *client*.

    Each sub-directory is walked recursively.  Every PDF is rendered to one
    image per page; each page image is base64-encoded, sent through
    ``image_to_text`` and the result is stored with ``update_collection``.

    Args:
        dir: Root directory to scan.
        client: Collection object handed through to ``update_collection``.

    Note:
        The index passed to ``update_collection`` is the page number within
        the current PDF, so it restarts at 0 for every file.
    """
    import os

    for name in os.listdir(dir):
        path = os.path.join(dir, name)

        if os.path.isdir(path):
            # Directories are only descended into, never treated as a PDF,
            # even if their name happens to end in ".pdf".
            await start_troggin_off(path, client)  # recursive call
        elif name.lower().endswith(".pdf"):
            # Imported lazily so that trees without PDFs do not require
            # pdf2image (and its poppler backend) to be installed.
            from pdf2image import convert_from_path

            for i, image in enumerate(convert_from_path(path)):
                encoded_image = encode_image(image)
                text = await image_to_text(encoded_image)
                update_collection(i, text, client)
91
 
92
if __name__ == "__main__":
    import asyncio

    # Script entry point: ingest everything under data/ into the collection.
    asyncio.run(start_troggin_off("data/", create_client()))
app.py CHANGED
@@ -2,8 +2,10 @@ import streamlit as st
2
  from openai import OpenAI
3
  from params import params
4
  from database import get_client
 
5
 
6
- CLIENT = get_client()
 
7
  APP_NAME: str = "Groove-GPT"
8
  history = []
9
  st.set_page_config(layout="wide")
@@ -11,6 +13,12 @@ st.set_page_config(layout="wide")
11
  # INFO
12
  st.title(APP_NAME)
13
 
 
 
 
 
 
 
14
  l_col, r_col = st.columns((3, 1))
15
 
16
  # param column
@@ -41,7 +49,7 @@ with l_col:
41
  )
42
  documents = results["documents"]
43
  response = openai_client.chat.completions.create(
44
- model="gpt-3.5-turbo",
45
  messages=[
46
  {
47
  "role": "system",
 
2
  from openai import OpenAI
3
  from params import params
4
  from database import get_client
5
+ from add_data import start_troggin_off, create_client
6
 
7
+ # CLIENT = get_client()
8
+ CLIENT = None
9
  APP_NAME: str = "Groove-GPT"
10
  history = []
11
  st.set_page_config(layout="wide")
 
13
  # INFO
14
  st.title(APP_NAME)
15
 
16
+
17
# Manual trigger for (re)building the embedding collection from ./data.
start_embedding = st.button("Hacker man")
if start_embedding:
    import asyncio

    CLIENT = create_client()
    # start_troggin_off is a coroutine function: calling it only creates a
    # coroutine object.  It has to be driven by an event loop, otherwise no
    # ingestion happens at all (Python just warns "coroutine was never awaited").
    asyncio.run(start_troggin_off("./data", CLIENT))
21
+
22
  l_col, r_col = st.columns((3, 1))
23
 
24
  # param column
 
49
  )
50
  documents = results["documents"]
51
  response = openai_client.chat.completions.create(
52
+ model=gpt_type,
53
  messages=[
54
  {
55
  "role": "system",