Carlos Salgado committed on
Commit
cc9e69a
1 Parent(s): d665e88

add vectara ingest functionality

Browse files
Files changed (2) hide show
  1. backend/generate_metadata.py +35 -1
  2. backend/ingest.py +0 -7
backend/generate_metadata.py CHANGED
@@ -1,13 +1,47 @@
1
  import os
 
 
2
  import json
3
  import openai
 
4
  from dotenv import load_dotenv
5
- import argparse
 
 
 
6
 
7
  from schema import Metadata, BimDiscipline
8
 
9
  load_dotenv()
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  def extract_metadata(filename):
12
  with open(filename, 'r') as f:
13
  context = f.readlines()
 
1
  import os
2
+
3
+ import argparse
4
  import json
5
  import openai
6
+
7
  from dotenv import load_dotenv
8
+ from langchain_community.document_loaders import TextLoader
9
+ from langchain_community.document_loaders import UnstructuredPDFLoader
10
+ from langchain_community.embeddings.fake import FakeEmbeddings
11
+ from langchain_community.vectorstores import Vectara
12
 
13
  from schema import Metadata, BimDiscipline
14
 
15
  load_dotenv()
16
 
17
+ vectara_customer_id = os.environ['VECTARA_CUSTOMER_ID']
18
+ vectara_corpus_id = os.environ['VECTARA_CORPUS_ID']
19
+ vectara_api_key = os.environ['VECTARA_API_KEY']
20
+
21
+ vectorstore = Vectara(vectara_customer_id=vectara_customer_id,
22
+ vectara_corpus_id=vectara_corpus_id,
23
+ vectara_api_key=vectara_api_key)
24
+
25
+
26
def ingest(file_path):
    """Load a PDF or text file, split it into chunks, and index them in Vectara.

    Args:
        file_path: Path of the document to ingest. The loader is selected
            from the text after the last ``.`` (case-insensitive): ``pdf``
            or ``txt``.

    Returns:
        The retriever produced by ``as_retriever()`` on the Vectara vector
        store built from the document chunks.

    Raises:
        ValueError: If the file extension is neither ``pdf`` nor ``txt``.
    """
    # The previous version read `filepath`, a name that does not exist in
    # this scope, so every call failed with NameError.
    ext = file_path.rsplit('.', 1)[-1].lower()

    if ext == 'pdf':
        loader = UnstructuredPDFLoader(file_path)
    elif ext == 'txt':
        loader = TextLoader(file_path)
    else:
        # Fail loudly instead of reaching `loader.load()` with `loader`
        # unbound (UnboundLocalError) for unsupported file types.
        raise ValueError(
            f"Unsupported file extension {ext!r} for {file_path!r}; "
            "expected 'pdf' or 'txt'"
        )

    # transform locally
    documents = loader.load()
    # NOTE(review): CharacterTextSplitter is not imported in the visible part
    # of this file -- confirm it is in scope, otherwise this line raises
    # NameError.
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)

    # NOTE(review): FakeEmbeddings is presumably a placeholder because the
    # embedding is not computed locally -- confirm against the Vectara
    # integration docs.
    vectara = Vectara.from_documents(docs, embedding=FakeEmbeddings(size=768))
    return vectara.as_retriever()
43
+
44
+
45
  def extract_metadata(filename):
46
  with open(filename, 'r') as f:
47
  context = f.readlines()
backend/ingest.py DELETED
@@ -1,7 +0,0 @@
1
- from langchain_community.document_loaders import UnstructuredPDFLoader
2
-
3
- def ingest_pdf(path):
4
- loader = UnstructuredPDFLoader()
5
- text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
6
-
7
- return data