Carlos Salgado committed
Commit 685ce53
Parents (2): 61dfc10, c49ca87

Merge pull request #32 from eliawaefler/ingest

Files changed (4):
  1. .gitignore +2 -2
  2. backend/generate_metadata.py +99 -34
  3. flake.nix +1 -0
  4. ingest.py +7 -0
.gitignore CHANGED
@@ -1,7 +1,7 @@
 .envrc
 .direnv/
-*.lock
 .env
 .venv
 .ipynb_checkpoints
-
+flake.nix
+*__pycache__*
backend/generate_metadata.py CHANGED
@@ -1,43 +1,108 @@
 import os
+
+import argparse
 import json
 import openai
+
 from dotenv import load_dotenv
+from langchain_community.document_loaders import TextLoader
+from langchain_community.document_loaders import UnstructuredPDFLoader
+from langchain_community.embeddings.fake import FakeEmbeddings
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+from langchain_community.vectorstores import Vectara
 
 from schema import Metadata, BimDiscipline
 
 load_dotenv()
 
-with open('school_plumbing.txt', 'r') as f:
-#with open('schulgebäudes.txt', 'r') as f:
-    context = f.readlines()
-
-# Create client
-client = openai.OpenAI(
-    base_url="https://api.together.xyz/v1",
-    api_key=os.environ["TOGETHER_API_KEY"],
-)
-
-# Call the LLM with the JSON schema
-chat_completion = client.chat.completions.create(
-    model="mistralai/Mixtral-8x7B-Instruct-v0.1",
-    response_format={"type": "json_object", "schema": Metadata.model_json_schema()},
-    messages=[
-        {
-            "role": "system",
-            "content": f"You are a helpful assistant that understands BIM documents and engineering disciplines. Your answer should be in JSON format and only include the title, a brief one-sentence summary, and the discipline the document belongs to distinguishing between {[d.value for d in BimDiscipline]} based on the given document."
-        },
-        {
-            "role": "user",
-            "content": f"Analyze the provided document, which could be in either German or English. Extract the title, summarize it briefly in one sentence, and infer the discipline. Document:\n{context}"
-        }
-    ],
-)
-
-created_user = json.loads(chat_completion.choices[0].message.content)
-print(json.dumps(created_user, indent=2))
-
-{
-    "title": "Plumbing System for a Typical School Building",
-    "summary": "This document details the plumbing system of a school building, including potable water supply, fixtures and appliances, drainage waste and vent systems, and stormwater management, adhering to ADA compliance, low flow rates, water conservation standards, and local codes and regulations.\n",
-    "discipline": "S - Sanit\u00e4r"
-}
+vectara_customer_id = os.environ['VECTARA_CUSTOMER_ID']
+vectara_corpus_id = os.environ['VECTARA_CORPUS_ID']
+vectara_api_key = os.environ['VECTARA_API_KEY']
+
+vectorstore = Vectara(vectara_customer_id=vectara_customer_id,
+                      vectara_corpus_id=vectara_corpus_id,
+                      vectara_api_key=vectara_api_key)
+
+
+def ingest(file_path):
+    extension = file_path.split('.')[-1]
+    ext = extension.lower()
+    if ext == 'pdf':
+        loader = UnstructuredPDFLoader(file_path)
+    elif ext == 'txt':
+        loader = TextLoader(file_path)
+    else:
+        raise NotImplementedError('Only .txt or .pdf files are supported')
+
+    # transform locally
+    documents = loader.load()
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0,
+                                                   separators=[
+                                                       "\n\n",
+                                                       "\n",
+                                                       " ",
+                                                       ",",
+                                                       "\uff0c",  # Fullwidth comma
+                                                       "\u3001",  # Ideographic comma
+                                                       "\uff0e",  # Fullwidth full stop
+                                                       # "\u200B",  # Zero-width space (Asian languages)
+                                                       # "\u3002",  # Ideographic full stop (Asian languages)
+                                                       "",
+                                                   ])
+    docs = text_splitter.split_documents(documents)
+    #print(docs)
+
+    return docs
+
+
+    # vectara = Vectara.from_documents(docs, embedding=FakeEmbeddings(size=768))
+    # retriever = vectara.as_retriever()
+
+    # return retriever
+
+
+def extract_metadata(docs):
+    # plain text
+    context = "".join(
+        [doc.page_content.replace('\n\n','').replace('..','') for doc in docs])
+
+    # Create client
+    client = openai.OpenAI(
+        base_url="https://api.together.xyz/v1",
+        api_key=os.environ["TOGETHER_API_KEY"],
+    )
+
+    # Call the LLM with the JSON schema
+    chat_completion = client.chat.completions.create(
+        model="mistralai/Mixtral-8x7B-Instruct-v0.1",
+        response_format={"type": "json_object", "schema": Metadata.model_json_schema()},
+        messages=[
+            {
+                "role": "system",
+                "content": f"You are a helpful assistant that understands BIM documents and engineering disciplines. Your answer should be in JSON format and only include the title, a brief one-sentence summary, and the discipline the document belongs to, distinguishing between {[d.value for d in BimDiscipline]} based on the given document."
+            },
+            {
+                "role": "user",
+                "content": f"Analyze the provided document, which could be in either German or English. Extract the title, summarize it briefly in one sentence, and infer the discipline. Document:\n{context}"
+            }
+        ]
+    )
+
+    created_user = json.loads(chat_completion.choices[0].message.content)
+    return created_user
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Generate metadata for a BIM document")
+    parser.add_argument("document", metavar="FILEPATH", type=str,
+                        help="Path to the BIM document")
+
+    args = parser.parse_args()
+
+    if not os.path.exists(args.document) or not os.path.isfile(args.document):
+        print("File '{}' not found or not accessible.".format(args.document))
+        sys.exit(-1)
+
+    docs = ingest(args.document)
+    metadata = extract_metadata(docs)
+    print(json.dumps(metadata, indent=2))
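
For reference, a minimal usage sketch of the new module, driven programmatically instead of through the CLI. It assumes backend/ is importable as a package, that the environment variables the module reads at import time (VECTARA_CUSTOMER_ID, VECTARA_CORPUS_ID, VECTARA_API_KEY, TOGETHER_API_KEY) are set, and that the sample file name is only illustrative. Note that the __main__ block calls sys.exit(-1), so an import sys would still be needed for that error path.

    # Illustrative sketch, not part of the commit.
    import json

    from backend.generate_metadata import ingest, extract_metadata

    # Any .txt or .pdf path works; "school_plumbing.txt" is the sample the old script used.
    docs = ingest("school_plumbing.txt")   # load the file and split it into chunks
    metadata = extract_metadata(docs)      # Mixtral call returning a dict matching the Metadata schema
    print(json.dumps(metadata, indent=2))

The Vectara vectorstore is instantiated but not yet used; the commented-out Vectara.from_documents / as_retriever lines in ingest() hint at the indexing step that would follow.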
flake.nix ADDED
@@ -0,0 +1 @@
+/home/salgadev/code/dev-flakes/templates/langchain-rag/flake.nix
ingest.py ADDED
@@ -0,0 +1,7 @@
+from langchain_community.document_loaders import UnstructuredPDFLoader
+
+def ingest_pdf(path):
+    loader = UnstructuredPDFLoader()
+    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+
+    return data
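
As committed, ingest_pdf is a stub: CharacterTextSplitter is used without being imported, the loader is constructed without the path argument, and the returned data is never defined. A minimal runnable sketch of the apparent intent, reusing the splitter package already imported in backend/generate_metadata.py and keeping the committed chunking parameters; anything else here is an assumption:

    from langchain_community.document_loaders import UnstructuredPDFLoader
    from langchain_text_splitters import CharacterTextSplitter


    def ingest_pdf(path):
        # Load the PDF, then split it into 1000-character chunks with no overlap.
        documents = UnstructuredPDFLoader(path).load()
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        return text_splitter.split_documents(documents)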