Spaces:

AIhackathons
/

docverifyrag

Runtime error

App Files Files Community

Carlos Salgado committed on Apr 15, 2024

Commit

d17ba2d

1 Parent(s): 9689d80

ignore flakes, add ingest

Browse files

Files changed (3) hide show

.gitignore +2 -0
generate_metadata.py +43 -34
ingest.py +7 -0

.gitignore CHANGED Viewed

@@ -5,3 +5,5 @@
 .venv
 .ipynb_checkpoints

 .venv
 .ipynb_checkpoints
+flake.lock
+flake.nix

generate_metadata.py CHANGED Viewed

@@ -2,42 +2,51 @@ import os
 import json
 import openai
 from dotenv import load_dotenv
 from schema import Metadata, BimDiscipline
 load_dotenv()
-with open('school_plumbing.txt', 'r') as f:
-#with open('schulgebäudes.txt', 'r') as f:
-    context = f.readlines()
-# Create client
-client = openai.OpenAI(
-    base_url="https://api.together.xyz/v1",
-    api_key=os.environ["TOGETHER_API_KEY"],
-)
-# Call the LLM with the JSON schema
-chat_completion = client.chat.completions.create(
-    model="mistralai/Mixtral-8x7B-Instruct-v0.1",
-    response_format={"type": "json_object", "schema": Metadata.model_json_schema()},
-    messages=[
-    {
-        "role": "system",
-        "content": f"You are a helpful assistant that understands BIM documents and engineering disciplines. Your answer should be in JSON format and only include the title, a brief one-sentence summary, and the discipline the document belongs to distinguishing between {[d.value for d in BimDiscipline]} based on the given document."
-    },
-    {
-        "role": "user",
-        "content": f"Analyze the provided document, which could be in either German or English. Extract the title, summarize it briefly in one sentence, and infer the discipline. Document:\n{context}"
-    }
-],
-)
-created_user = json.loads(chat_completion.choices[0].message.content)
-print(json.dumps(created_user, indent=2))
-{
-  "title": "Plumbing System for a Typical School Building",
-  "summary": "This document details the plumbing system of a school building, including potable water supply, fixtures and appliances, drainage waste and vent systems, and stormwater management, adhering to ADA compliance, low flow rates, water conservation standards, and local codes and regulations.\n",
-  "discipline": "S - Sanit\u00e4r"
-}

 import json
 import openai
 from dotenv import load_dotenv
+import argparse
 from schema import Metadata, BimDiscipline
 load_dotenv()
+def extract_metadata(filename):
+    with open(filename, 'r') as f:
+        context = f.readlines()
+    # Create client
+    client = openai.OpenAI(
+        base_url="https://api.together.xyz/v1",
+        api_key=os.environ["TOGETHER_API_KEY"],
+    )
+    # Call the LLM with the JSON schema
+    chat_completion = client.chat.completions.create(
+        model="mistralai/Mixtral-8x7B-Instruct-v0.1",
+        response_format={"type": "json_object", "schema": Metadata.model_json_schema()},
+        messages=[
+            {
+                "role": "system",
+                "content": f"You are a helpful assistant that understands BIM documents and engineering disciplines. Your answer should be in JSON format and only include the title, a brief one-sentence summary, and the discipline the document belongs to, distinguishing between {[d.value for d in BimDiscipline]} based on the given document."
+            },
+            {
+                "role": "user",
+                "content": f"Analyze the provided document, which could be in either German or English. Extract the title, summarize it briefly in one sentence, and infer the discipline. Document:\n{' '.join(context)}"
+            }
+        ]
+    )
+    created_user = json.loads(chat_completion.choices[0].message.content)
+    return created_user
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Generate metadata for a BIM document")
+    parser.add_argument("document", metavar="FILEPATH", type=str,
+                        help="Path to the BIM document")
+    args = parser.parse_args()
+    if not os.path.exists(args.document) or not os.path.isfile(args.document):
+        print("File '{}' not found or not accessible.".format(args.document))
+        sys.exit(-1)
+    metadata = extract_metadata(args.document)
+    print(json.dumps(metadata, indent=2))

ingest.py ADDED Viewed

	@@ -0,0 +1,7 @@

+from langchain_community.document_loaders import UnstructuredPDFLoader
+def ingest_pdf(path):
+    loader = UnstructuredPDFLoader()
+    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+    return data