"""Generate metadata for a BIM document.

Run as a script with the path to a .txt or .pdf file; the document is loaded,
split into chunks, and sent to an LLM that returns a title, a one-sentence
summary and the engineering discipline as JSON.

Environment variables read by this module: VECTARA_CUSTOMER_ID,
VECTARA_CORPUS_ID, VECTARA_API_KEY and TOGETHER_API_KEY.
"""
import os
import argparse
import json
import openai
import sys
from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.embeddings.fake import FakeEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_community.vectorstores import Vectara

from backend.schema import Metadata, BimDiscipline

# Load variables from a .env file into the environment; this must run before
# the os.environ lookups below.
load_dotenv()

# Vectara credentials. os.environ[...] raises KeyError at import time if any
# of these variables is unset.
vectara_customer_id = os.environ['VECTARA_CUSTOMER_ID']
vectara_corpus_id = os.environ['VECTARA_CORPUS_ID']
vectara_api_key = os.environ['VECTARA_API_KEY']

# Module-level Vectara vector store, constructed as a side effect of importing
# this module.
# NOTE(review): not referenced by the code visible in this file — confirm it is
# used by importers before relying on or removing it.
vectorstore = Vectara(vectara_customer_id=vectara_customer_id,
                      vectara_corpus_id=vectara_corpus_id,
                      vectara_api_key=vectara_api_key)

# Prompt prefix sent to the LLM. extract_metadata() appends the document text
# followed by a closing double quote, so this template must keep ending with
# the opening `context="`.
# This is a plain (non-f) string: the former
# `{[d.value for d in BimDiscipline]}` placeholder was never interpolated and
# reached the model verbatim, so the sentence now refers to the literal list.
# NOTE(review): the list presumably mirrors backend.schema.BimDiscipline —
# keep the two in sync.
prompt_template = """
BimDiscipline = ['plumbing', 'network', 'heating', 'electrical', 'ventilation', 'architecture']

You are a helpful assistant that understands BIM documents and engineering disciplines. Your answer should be in JSON format and only include the title, a brief one-sentence summary, and the discipline the document belongs to, distinguishing between the BimDiscipline values listed above based on the given document.

Analyze the provided document, which could be in either German or English. Extract the title, summarize it briefly in one sentence, and infer the discipline. Document:
context="
"""


def ingest(file_path):
    """Load a .txt or .pdf document and split it into text chunks.

    Args:
        file_path: Path to the document. The extension is matched
            case-insensitively.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``
        (chunk_size=1000, no overlap).

    Raises:
        NotImplementedError: If the file extension is neither .txt nor .pdf.
    """
    # splitext inspects only the final path component and requires a real
    # ".ext" suffix, so a dot-less file literally named "pdf" or a dot inside
    # a directory name is not mistaken for an extension.
    ext = os.path.splitext(file_path)[1].lower()
    if ext == '.pdf':
        loader = UnstructuredPDFLoader(file_path)
    elif ext == '.txt':
        loader = TextLoader(file_path)
    else:
        raise NotImplementedError('Only .txt or .pdf files are supported')

    # transform locally
    documents = loader.load()

    # Separators are tried in order, from paragraph breaks down to single
    # characters; the trailing "" lets the splitter cut anywhere as a last resort.
    separators = [
        "\n\n",
        "\n",
        " ",
        ",",
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "",
    ]
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
        separators=separators,
    )
    return text_splitter.split_documents(documents)



def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence, if it has one."""
    text = text.strip()
    if text.startswith("```"):
        # Drop the opening fence line (``` or ```json), then cut at the last
        # closing fence; without a closing fence the remainder is kept as is.
        _, _, body = text.partition("\n")
        text = body.rsplit("```", 1)[0]
    return text.strip()


def extract_metadata(docs):
    """Ask the LLM for the title, summary and discipline of a document.

    Args:
        docs: Iterable of chunks exposing a ``page_content`` string, such as
            the output of ``ingest``.

    Returns:
        The decoded JSON reply. It is expected to be a dict, but the model
        output is not validated here.

    Raises:
        KeyError: If TOGETHER_API_KEY is not set in the environment.
        json.JSONDecodeError: If the reply is empty or is not valid JSON.
    """
    # Flatten the chunks into one plain-text string, dropping blank-line gaps
    # and ".." runs.
    context = "".join(
        doc.page_content.replace('\n\n', '').replace('..', '') for doc in docs)

    # The template ends with an opening quote; close it after the document text.
    prompt = f'{prompt_template}{context}"'

    # Together exposes an OpenAI-compatible endpoint, hence the OpenAI client.
    client = openai.OpenAI(
        base_url="https://api.together.xyz/v1",
        api_key=os.environ["TOGETHER_API_KEY"],
    )

    # NOTE(review): no response_format / JSON schema is passed, so JSON output
    # relies on the prompt alone — consider the provider's JSON mode.
    chat_completion = client.chat.completions.create(
        model="mistralai/Mixtral-8x7B-Instruct-v0.1",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant that responds in JSON format",
            },
            {
                "role": "user",
                "content": prompt,
            },
        ],
    )

    # content may be None; map that to "" so the failure is a JSONDecodeError
    # like any other unparseable reply, and tolerate a Markdown code fence
    # around the JSON.
    raw = chat_completion.choices[0].message.content or ""
    return json.loads(_strip_code_fence(raw))


if __name__ == "__main__":
    # CLI entry point: ingest the given document and print the extracted metadata.
    parser = argparse.ArgumentParser(description="Generate metadata for a BIM document")
    parser.add_argument("document", metavar="FILEPATH", type=str,
                        help="Path to the BIM document")

    args = parser.parse_args()

    # isfile() is already False for a missing path, so no separate exists()
    # check is needed.
    if not os.path.isfile(args.document):
        # Diagnostics go to stderr so stdout carries only the metadata.
        print("File '{}' not found or not accessible.".format(args.document),
              file=sys.stderr)
        # Exit status is kept unchanged for existing callers.
        sys.exit(-1)

    docs = ingest(args.document)
    metadata = extract_metadata(docs)
    print(metadata)