Spaces:
Sleeping
Sleeping
File size: 3,745 Bytes
be0ac49 cc9e69a be0ac49 cc9e69a be0ac49 cc9e69a 793ea5f cc9e69a be0ac49 cc9e69a 793ea5f cc9e69a 793ea5f cc9e69a 793ea5f cc9e69a 793ea5f cc9e69a 793ea5f cc9e69a 793ea5f cc9e69a 793ea5f d17ba2d 793ea5f d17ba2d 793ea5f d17ba2d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
import os
import argparse
import json
import openai
from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.embeddings.fake import FakeEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Vectara
from schema import Metadata, BimDiscipline
# load_dotenv() runs first so that variables it provides are already in the
# process environment when the credentials below are read.
load_dotenv()

# Vectara credentials. Indexing os.environ directly means a missing variable
# raises KeyError as soon as this module is imported.
vectara_customer_id = os.environ['VECTARA_CUSTOMER_ID']
vectara_corpus_id = os.environ['VECTARA_CORPUS_ID']
vectara_api_key = os.environ['VECTARA_API_KEY']

# Module-level Vectara vector store built from the credentials above.
vectorstore = Vectara(
    vectara_customer_id=vectara_customer_id,
    vectara_corpus_id=vectara_corpus_id,
    vectara_api_key=vectara_api_key,
)
def ingest(file_path, chunk_size=1000, chunk_overlap=0):
    """Load a ``.txt`` or ``.pdf`` file and split it into text chunks.

    Args:
        file_path: Path of the document to read. The extension check is
            case-insensitive.
        chunk_size: Maximum chunk size handed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks handed to the
            text splitter.

    Returns:
        The chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents``.

    Raises:
        NotImplementedError: If the file extension is neither ``.txt`` nor
            ``.pdf``.
    """
    # os.path.splitext inspects only the final path component, so an
    # extension-less file that happens to be called "pdf" or "txt" is no
    # longer mistaken for a supported type (splitting on "." did that), and
    # path-like objects pass the extension check as well.
    ext = os.path.splitext(file_path)[1].lower()
    if ext == '.pdf':
        loader = UnstructuredPDFLoader(file_path)
    elif ext == '.txt':
        loader = TextLoader(file_path)
    else:
        raise NotImplementedError('Only .txt or .pdf files are supported')

    # Transform locally: load the raw document, then split it.
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=[
            "\n\n",
            "\n",
            " ",
            ",",
            "\uff0c",  # Fullwidth comma
            "\u3001",  # Ideographic comma
            "\uff0e",  # Fullwidth full stop
            # "\u200B",  # Zero-width space (Asian languages)
            # "\u3002",  # Ideographic full stop (Asian languages)
            "",
        ],
    )
    # NOTE: the chunks are returned as-is; uploading them to Vectara and
    # returning a retriever is not done here.
    return text_splitter.split_documents(documents)
def extract_metadata(docs):
    """Ask an LLM for the title, a one-sentence summary and the discipline.

    The ``page_content`` of every item in ``docs`` is stripped of ``\\n\\n``
    and ``..`` sequences and concatenated into one plain-text context. That
    context is sent through the OpenAI client (pointed at the Together
    endpoint) with the JSON schema of ``Metadata`` as the response format.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string, such
            as the chunks returned by ``ingest``.

    Returns:
        The first choice's message content, decoded from JSON.

    Raises:
        KeyError: If ``TOGETHER_API_KEY`` is missing from the environment.
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    # Flatten the chunks into plain text.
    pieces = []
    for chunk in docs:
        cleaned = chunk.page_content.replace('\n\n', '').replace('..', '')
        pieces.append(cleaned)
    context = "".join(pieces)

    # OpenAI-compatible client talking to Together.
    client = openai.OpenAI(
        base_url="https://api.together.xyz/v1",
        api_key=os.environ["TOGETHER_API_KEY"],
    )

    # Constrain the reply to the Metadata JSON schema.
    response_format = {"type": "json_object", "schema": Metadata.model_json_schema()}

    system_message = {
        "role": "system",
        "content": f"You are a helpful assistant that understands BIM documents and engineering disciplines. Your answer should be in JSON format and only include the title, a brief one-sentence summary, and the discipline the document belongs to, distinguishing between {[d.value for d in BimDiscipline]} based on the given document.",
    }
    user_message = {
        "role": "user",
        "content": f"Analyze the provided document, which could be in either German or English. Extract the title, summarize it briefly in one sentence, and infer the discipline. Document:\n{context}",
    }

    chat_completion = client.chat.completions.create(
        model="mistralai/Mixtral-8x7B-Instruct-v0.1",
        response_format=response_format,
        messages=[system_message, user_message],
    )

    reply_text = chat_completion.choices[0].message.content
    return json.loads(reply_text)
def main(argv=None):
    """Command-line entry point: print metadata for one BIM document as JSON.

    Args:
        argv: Argument list to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``.

    Exits with status ``-1`` after printing a message to stderr when the
    given path is not an existing regular file.
    """
    # Imported here because this file has no module-level ``import sys``;
    # without it the error path below raised NameError instead of exiting.
    import sys

    parser = argparse.ArgumentParser(description="Generate metadata for a BIM document")
    parser.add_argument("document", metavar="FILEPATH", type=str,
                        help="Path to the BIM document")
    args = parser.parse_args(argv)

    # isfile() is already False for paths that do not exist, so a separate
    # exists() check adds nothing.
    if not os.path.isfile(args.document):
        # Diagnostics go to stderr so stdout carries only the JSON result.
        print("File '{}' not found or not accessible.".format(args.document),
              file=sys.stderr)
        sys.exit(-1)

    docs = ingest(args.document)
    metadata = extract_metadata(docs)
    print(json.dumps(metadata, indent=2))


if __name__ == "__main__":
    main()