axjh03 committed on
Commit
af4aa80
1 Parent(s): 658f5f3

vector stores

Browse files
Files changed (2) hide show
  1. Dockerfile +4 -1
  2. ingest.py +6 -14
Dockerfile CHANGED
@@ -31,8 +31,11 @@ RUN mkdir -p /app/.files && \
31
  # Copy the application code into the container
32
  COPY --chown=user . /app
33
 
 
 
 
34
  # Run the application
35
  CMD /bin/bash -c "source env/bin/activate && \
36
  python3 downloadLLM.py && \
37
  python3 ingest.py && \
38
- chainlit run main.py --host 0.0.0.0 --port 7680"
 
31
  # Copy the application code into the container
32
  COPY --chown=user . /app
33
 
34
+ # Switch to the non-root user
35
+ USER user
36
+
37
  # Run the application
38
  CMD /bin/bash -c "source env/bin/activate && \
39
  python3 downloadLLM.py && \
40
  python3 ingest.py && \
41
+ gunicorn -b 0.0.0.0:7860 main:app"
ingest.py CHANGED
@@ -1,27 +1,19 @@
1
  from langchain.text_splitter import RecursiveCharacterTextSplitter
2
- from langchain.document_loaders import PyPDFLoader, DirectoryLoader # could have done any unstructured text loader like ppt and xlsx
3
-
4
-
5
- from langchain.embeddings import HuggingFaceBgeEmbeddings # we can replace huggingface with facetransformers
6
-
7
  from langchain.vectorstores import FAISS
8
 
9
- DATA_PATH = "$HOME/data/"
10
- DB_FAISS_PATH = "$HOME/vectorstores/db_faiss"
11
 
12
- #create vector database
13
  def create_vector_db():
14
- # WE can change .pdf with any other unstructured text format
15
- loader = DirectoryLoader(DATA_PATH, glob="*.pdf", loader_cls = PyPDFLoader)
16
  documents = loader.load()
17
 
18
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
19
-
20
  texts = text_splitter.split_documents(documents)
21
 
22
- embeddings = HuggingFaceBgeEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={"device": "cpu"}) # change to GPU if you want
23
-
24
- # cuda is not supported in my MAC M1! SADLY.
25
 
26
  db = FAISS.from_documents(texts, embeddings)
27
  db.save_local(DB_FAISS_PATH)
 
1
  from langchain.text_splitter import RecursiveCharacterTextSplitter
2
+ from langchain.document_loaders import PyPDFLoader, DirectoryLoader
3
+ from langchain.embeddings import HuggingFaceBgeEmbeddings
 
 
 
4
  from langchain.vectorstores import FAISS
5
 
6
+ DATA_PATH = "/home/user/data"
7
+ DB_FAISS_PATH = "/home/user/vectorstores/db_faiss"
8
 
 
9
  def create_vector_db():
10
+ loader = DirectoryLoader(DATA_PATH, glob="*.pdf", loader_cls=PyPDFLoader)
 
11
  documents = loader.load()
12
 
13
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
 
14
  texts = text_splitter.split_documents(documents)
15
 
16
+ embeddings = HuggingFaceBgeEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={"device": "cpu"})
 
 
17
 
18
  db = FAISS.from_documents(texts, embeddings)
19
  db.save_local(DB_FAISS_PATH)