dsmueller committed
Commit 7206754
Parent: 0887c4f

Removed committed files, added git copies, modified poetry
Dockerfile CHANGED
@@ -6,6 +6,11 @@ FROM python:3.11.5-bookworm
  RUN useradd -m -u 1000 user
  USER user
 
+ # Clone aerospace-chatbot github repository
+ RUN apt-get update && apt-get install -y git
+ WORKDIR /app
+ RUN git clone -b rag_study https://github.com/dan-s-mueller/aerospace_chatbot.git .
+
  # Set home to the user's home directory
  ENV HOME=/home/user \
      PATH=/home/user/.local/bin:$PATH
@@ -18,7 +23,7 @@ WORKDIR $HOME
  RUN pip3 install poetry==1.7.1
 
  # Copy poetry files
- COPY --chown=user pyproject.toml poetry.lock* $HOME
+ COPY --chown=user /app/aerospace_chatbot/pyproject.toml /app/aerospace_chatbot/poetry.lock* $HOME
 
  # Disable virtual environment creation by Poetry,
  # as the Docker container itself is an isolated environment
@@ -34,7 +39,9 @@ ENV PATH="$HOME/.venv/bin:$PATH"
  RUN poetry install --no-dev
 
  # Copy the rest of your application code
- COPY --chown=user . $HOME
+ COPY --chown=user /app/aerospace_chatbot/src $HOME/src
+ COPY --chown=user /app/aerospace_chatbot/data $HOME/data
+ COPY --chown=user /app/aerospace_chatbot/config $HOME/config
 
  # Expose the port Streamlit runs on
  EXPOSE 8501
@@ -43,10 +50,11 @@ EXPOSE 8501
  HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
 
  # Update working directory to be consistent with where Start.py is
- WORKDIR $HOME/scripts
+ WORKDIR $HOME/src
 
  # An ENTRYPOINT allows you to configure a container that will run as an executable. Here, it also contains the entire streamlit run command for your app, so you don't have to call it from the command line.
  ENTRYPOINT ["streamlit", "run", "Start.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
+ # To run remotely
  # docker run -it -p 7860:7860 --platform=linux/amd64 \
  #    registry.hf.space/ai-aerospace-aerospace-chatbots:latest
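To build and smoke-test this image locally, the standard Docker commands should work (the aerospace-chatbots tag here is illustrative, not something the repo defines):

    docker build -t aerospace-chatbots .
    docker run -it -p 8501:8501 aerospace-chatbots

Streamlit then serves on http://localhost:8501, the same port named by the EXPOSE and HEALTHCHECK lines, while the commented docker run above targets the prebuilt Hugging Face Space image on port 7860 instead.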
config/config.json DELETED
@@ -1,44 +0,0 @@
- {
-     "databases": [
-         {
-             "name": "Pinecone",
-             "embedding_models": ["Openai", "Voyage"]
-         },
-         {
-             "name": "ChromaDB",
-             "embedding_models": ["Openai"]
-         },
-         {
-             "name": "RAGatouille",
-             "hf_rag_models": [
-                 "colbert-ir/colbertv2.0"
-             ]
-         }
-     ],
-     "llms": [
-         {
-             "name": "OpenAI",
-             "models": [
-                 "gpt-3.5-turbo-1106",
-                 "gpt-3.5-turbo-instruct",
-                 "gpt-4",
-                 "gpt-4-32k",
-                 "gpt-4-1106-preview"
-             ]
-         },
-         {
-             "name": "Hugging Face",
-             "models": [
-                 "mistralai/Mixtral-8x7B-Instruct-v0.1",
-                 "ai-aerospace/autotrain-ams_v0.1_100_Mistral-7B-Instruct-v0.1",
-                 "meta-llama/Llama-2-7b-chat-hf"
-             ]
-         }
-     ],
-     "rag_types": [
-         "Standard",
-         "Parent-Child",
-         "Hypothetical Questions",
-         "Summaries"
-     ]
- }
config/index_data.json DELETED
@@ -1,13 +0,0 @@
- {
-     "Pinecone": {
-         "Openai": "pinecone-openai-ams",
-         "Voyage": "pinecone-voyage-ams"
-     },
-     "ChromaDB": {
-         "Openai": "chromadb-openai-ams",
-         "Voyage": "chromadb-voyage-ams"
-     },
-     "RAGatouille": {
-         "colbert-ir/colbertv2.0": "RAGatouille-colbertv2.0-ams"
-     }
- }
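Taken together, config.json declared the selectable databases, embedding models, LLMs, and RAG types, while index_data.json mapped each database/embedding pair to a concrete index name. A minimal sketch of how the pair was consumed, mirroring load_sidebar in scripts/setup.py further below (the file paths and the Pinecone/Openai choice are illustrative):

    import json

    # Load the option definitions and the index-name lookup table
    with open('../config/config.json', 'r') as f:
        config = json.load(f)
    with open('../config/index_data.json', 'r') as f:
        index_data = json.load(f)

    # Key the database definitions by name, as load_sidebar does
    databases = {db['name']: db for db in config['databases']}

    # Resolve the concrete index for a database/embedding pair,
    # e.g. ('Pinecone', 'Openai') -> 'pinecone-openai-ams'
    index_name = index_data['Pinecone']['Openai']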
data/AMS/AMS_1996.pdf DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:3626fd4a0769b8a73a12ee79a1bec7c264c541a5bf90df6f6c13c1ff00011b24
- size 152158068

data/AMS/AMS_1997.pdf DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:34442bbc794415ea8d778ebd57e1dd368e20c5e6f65aff35fa008af54dbb900a
- size 22719877

data/AMS/AMS_1998.pdf DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:1523ca03cd1254b81cd0cb285182b7ac40208cba7932972ca00e0942e43f3539
- size 122280718

data/AMS/AMS_1999.pdf DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:1c631364761565d749e6bafb0ab1e84611e773ccdb640ab08f6b32b1fcc49e1e
- size 27850919

data/AMS/AMS_2000.pdf DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:ddf89c5cd9ddbd225e77198b19274535d4f003fdc20b5823239f51ad48230549
- size 24061146

data/AMS/AMS_2001.pdf DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c63b2bba5a892759a7298097ee2388f353cc974285a73bfd8635d48af9f7d945
- size 23264984

data/AMS/AMS_2002.pdf DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:1b8b60c30ea9843face46e021a80bd1072901596b8e0f98a63601b31ecac2076
- size 41615570

data/AMS/AMS_2004.pdf DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:986a7f046ba336d35d9d0db974931940543d612dad2c9bb6d5976d778777b659
- size 28914300

data/AMS/AMS_2006.pdf DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:af4fb8e67c1ebf7b51fddd947d531d68ab05ff187fe915528811676ae0083d55
- size 61039456

data/AMS/AMS_2008.pdf DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:3d74dcd8ef68ae324f9246a35e1ccf538c4fd676d8b1ae733191c8ad6a055c90
- size 31961158

data/AMS/AMS_2010.pdf DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:30d9ad0b75d0d41c75926dd97361f1548b79920df61d8d7486978d4b69a00ef6
- size 30161812

data/AMS/AMS_2012.pdf DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:e26a981f74c9d0c3526ad5152c83ad9fabde8f197f69cb24a0fd1d4004c1f026
- size 31088140

data/AMS/AMS_2014.pdf DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:73dea6c8c45d0103404e3e3bd764e6efcd0f5bf5f45d505ce98e6c07528d9322
- size 35199422

data/AMS/AMS_2016.pdf DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:f7d8a0e558abd59b94abcbe013f263755f3c525eaf73702662293a3d8b5e2ec5
- size 35244294

data/AMS/AMS_2018.pdf DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:7b929f0c6d71116e23d4f52011e82eda07280aabb177300e37419ca38b047c60
- size 30251124

data/AMS/AMS_2020.pdf DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:cb6aaaa2cb700bc7d460a1f222756e6a795b629780087a477acd9713982fc0b9
- size 45426669

data/AMS/AMS_2022.pdf DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:ccc90819f501fca9445d415c1ceca8d3991300f8e08724cf7043f1a103aa4231
- size 17636761
data/AMS/README.txt DELETED
@@ -1,18 +0,0 @@
- Documents are not uploaded to git. The following documents were uploaded to the Pinecone database AMS:
- AMS_1996, https://ntrs.nasa.gov/citations/19960025595
- AMS_1997, https://ntrs.nasa.gov/citations/19970021613
- AMS_1998, https://ntrs.nasa.gov/citations/19980193156
- AMS_1999, https://ntrs.nasa.gov/citations/19990053852
- AMS_2000, https://ntrs.nasa.gov/citations/20000048380
- AMS_2001, https://ntrs.nasa.gov/citations/20010071164
- AMS_2002, https://ntrs.nasa.gov/citations/20020050182
- AMS_2004, https://ntrs.nasa.gov/citations/20040084272
- AMS_2006, https://ntrs.nasa.gov/citations/20060028221
- AMS_2008, https://ntrs.nasa.gov/citations/20080023060
- AMS_2010, https://ntrs.nasa.gov/citations/20100021914
- AMS_2012, https://ntrs.nasa.gov/citations/20130008824
- AMS_2014, https://ntrs.nasa.gov/citations/20140008875
- AMS_2016, https://ntrs.nasa.gov/citations/20160004038
- AMS_2018, https://ntrs.nasa.gov/citations/20180002828
- AMS_2020, https://ntrs.nasa.gov/citations/20205009766
- AMS_2022, https://ntrs.nasa.gov/citations/20220006415
data/AMS/ams_data-400-0-50.json DELETED
The diff for this file is too large to render. See raw diff

data/AMS/ams_data-400-0-all.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:923f4efbb6bcfa932cad87520177cb65bcf4b3df7fbc7446285df7ef070fa3eb
- size 36094453

data/AMS/ams_data-400-0.jsonl DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:ef248f60645d1def4d3624351c90cbb5d91554d0a8bfd35615514f4a71a20159
- size 18183603

data/AMS/ams_data-5000-0.jsonl DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:0472930c89ad2c13f997789b070049c99640c6ddcd114cc635110409854435b5
- size 17283048
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -5,12 +5,11 @@ description = ""
  authors = ["dsmueller <dsm@danmueller.pro>"]
 
  [tool.poetry.dependencies]
- python = "^3.11"
+ python = ">=3.11,<3.13"
  python-dotenv = "^1.0.0"
  ipykernel = "^6.28.0"
  ipywidgets = "^8.1.1"
  langchainhub = "^0.1.14"
- pinecone-client = "^2.2.4"
  tiktoken = "^0.5.2"
  watchdog = "^3.0.0"
  chromadb = "^0.4.22"
@@ -25,8 +24,9 @@ langchain-openai = "^0.0.2.post1"
  sentence-transformers = "^2.2.2"
  ragatouille = "^0.0.4b2"
  nbformat = "^5.9.2"
- ragxplorer = {git = "https://github.com/dsmueller3760/RAGxplorer.git", rev = "load_db"}
  pydantic = "^2.6.0"
+ RAGxplorer = { git = "https://github.com/dan-s-mueller/RAGxplorer.git", branch = "load_options" }
+ pinecone-client = "^3.0.2"
 
 
  [build-system]
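Because pyproject.toml changed, the poetry.lock diff above is presumably the regenerated lock file; with Poetry 1.7 (the version pinned in the Dockerfile) the usual sequence is:

    poetry lock
    poetry install --no-dev

Two changes stand out: pinecone-client moves from the 2.x line to ^3.0.2, a major-version jump with a different client API, and RAGxplorer is now pulled from the load_options branch of the dan-s-mueller fork rather than the load_db branch of the dsmueller3760 fork.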
scripts/Start.py DELETED
@@ -1,41 +0,0 @@
- import streamlit as st
- import os
-
- # Set up page
- st.set_page_config(
-     page_title="Aerospace Chatbot: AMS",
- )
- st.title("Aerospace Chatbot Homepage")
- st.markdown("Code base: https://github.com/dsmueller3760/aerospace_chatbot/tree/rag_study")
- st.markdown('---')
- st.markdown("""
- This space contains chatbots and tools for exploring data in the Aerospace Mechanisms Symposia, using all available papers published since 2000.
- """)
- st.subheader("Running Locally")
- '''
- It is recommended to run this Streamlit app locally for improved performance. The hosted Hugging Face version is a proof of concept.
- You must have poetry installed locally to manage dependencies. To run locally, clone the repository and run the following commands.
-
-     poetry config virtualenvs.in-project true
-     poetry install
-     source .venv/bin/activate
-     cd ./scripts
-     streamlit run Start.py
- '''
-
- st.subheader("Aerospace Mechanisms Symposia (AMS)")
- '''
- This chatbot retrieves from all Aerospace Mechanism Symposia documents in the following location: https://github.com/dsmueller3760/aerospace_chatbot/tree/main/data/AMS
- * Available models: https://platform.openai.com/docs/models
- * Model parameters: https://platform.openai.com/docs/api-reference/chat/create
- * Pinecone: https://docs.pinecone.io/docs/projects#api-keys
- * OpenAI API: https://platform.openai.com/api-keys
- '''
-
- st.subheader("API Key Links")
- '''
- * OpenAI: https://platform.openai.com/api-keys
- * Pinecone: https://www.pinecone.io
- * Hugging Face: https://huggingface.co/settings/tokens
- * Voyage: https://dash.voyageai.com/api-keys
- '''
scripts/data_import.py DELETED
@@ -1,278 +0,0 @@
- import os
- import re
- import logging
- import shutil
- import string
-
- import pinecone
- import chromadb
-
- import json, jsonlines
- from tqdm import tqdm
-
- from langchain_community.vectorstores import Pinecone
- from langchain_community.vectorstores import Chroma
-
- from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
-
- from langchain_openai import OpenAIEmbeddings
- from langchain_community.embeddings import VoyageEmbeddings
-
- from langchain_community.document_loaders import PyPDFLoader
- from langchain_core.documents import Document as langchain_Document
-
- from ragatouille import RAGPretrainedModel
-
- from dotenv import load_dotenv,find_dotenv
- load_dotenv(find_dotenv(),override=True)
-
- # Set secrets from environment file
- OPENAI_API_KEY=os.getenv('OPENAI_API_KEY')
- VOYAGE_API_KEY=os.getenv('VOYAGE_API_KEY')
- PINECONE_API_KEY=os.getenv('PINECONE_API_KEY')
- HUGGINGFACEHUB_API_TOKEN=os.getenv('HUGGINGFACEHUB_API_TOKEN')
-
- def chunk_docs(docs,
-                chunk_method='tiktoken_recursive',
-                file=None,
-                chunk_size=500,
-                chunk_overlap=0,
-                use_json=False):
-     docs_out=[]
-     if file:
-         logging.info('Jsonl file to be used: '+file)
-     if use_json and os.path.exists(file):
-         logging.info('Jsonl file found, using this instead of parsing docs.')
-         with open(file, "r") as file_in:
-             file_data = [json.loads(line) for line in file_in]
-         # Process the file data and put it into the same format as docs_out
-         for line in file_data:
-             doc_temp = langchain_Document(page_content=line['page_content'],
-                                           source=line['metadata']['source'],
-                                           page=line['metadata']['page'],
-                                           metadata=line['metadata'])
-             if has_meaningful_content(doc_temp):
-                 docs_out.append(doc_temp)
-         logging.info('Parsed: '+file)
-         logging.info('Number of entries: '+str(len(docs_out)))
-         logging.info('Sample entries:')
-         logging.info(str(docs_out[0]))
-         logging.info(str(docs_out[-1]))
-     else:
-         logging.info('No jsonl found. Reading and parsing docs.')
-         logging.info('Chunk size (tokens): '+str(chunk_size))
-         logging.info('Chunk overlap (tokens): '+str(chunk_overlap))
-         for doc in tqdm(docs,desc='Reading and parsing docs'):
-             logging.info('Parsing: '+doc)
-             loader = PyPDFLoader(doc)
-             data = loader.load_and_split()
-
-             if chunk_method=='tiktoken_recursive':
-                 text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-             else:
-                 raise NotImplementedError
-             pages = text_splitter.split_documents(data)
-
-             # Tidy up text by removing unnecessary characters
-             for page in pages:
-                 page.metadata['source']=os.path.basename(page.metadata['source'])   # Strip path
-                 page.metadata['page']=int(page.metadata['page'])+1   # Pages are 0 based, update
-                 page.page_content=re.sub(r"(\w+)-\n(\w+)", r"\1\2", page.page_content)   # Merge hyphenated words
-                 page.page_content = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", page.page_content.strip())   # Fix newlines in the middle of sentences
-                 page.page_content = re.sub(r"\n\s*\n", "\n\n", page.page_content)   # Remove multiple newlines
-                 # Add metadata to the end of the page content, some RAG models don't have metadata.
-                 page.page_content += str(page.metadata)
-                 doc_temp=langchain_Document(page_content=page.page_content,
-                                             source=page.metadata['source'],
-                                             page=page.metadata['page'],
-                                             metadata=page.metadata)
-                 if has_meaningful_content(page):
-                     docs_out.append(doc_temp)
-             logging.info('Parsed: '+doc)
-         logging.info('Sample entries:')
-         logging.info(str(docs_out[0]))
-         logging.info(str(docs_out[-1]))
-     if file:
-         # Write to a jsonl file, save it.
-         logging.info('Writing to jsonl file: '+file)
-         with jsonlines.open(file, mode='w') as writer:
-             for doc in docs_out:
-                 writer.write(doc.dict())
-         logging.info('Written: '+file)
-     return docs_out
- def load_docs(index_type,
-               docs,
-               query_model,
-               index_name=None,
-               chunk_method='tiktoken_recursive',
-               chunk_size=500,
-               chunk_overlap=0,
-               clear=False,
-               use_json=False,
-               file=None,
-               batch_size=50,
-               local_db_path='../db'):
-     """
-     Loads PDF documents. If index_name is blank, it will return a list of the data (texts). If it is the name of a vector store index, it will return the vector_store.
-     """
-     # Chunk docs
-     docs_out=chunk_docs(docs,
-                         chunk_method=chunk_method,
-                         file=file,
-                         chunk_size=chunk_size,
-                         chunk_overlap=chunk_overlap,
-                         use_json=use_json)
-     # Initialize client
-     if index_name:
-         if index_type=="Pinecone":
-             # Import and initialize Pinecone client
-             pinecone.init(
-                 api_key=PINECONE_API_KEY
-             )
-             # Find the existing index, clear for new start
-             if clear:
-                 try:
-                     pinecone.describe_index(index_name)
-                 except:
-                     raise Exception(f"Cannot clear index {index_name} because it does not exist.")
-                 index=pinecone.Index(index_name)
-                 index.delete(delete_all=True)   # Clear the index first, then upload
-                 logging.info('Cleared database '+index_name)
-             # Upsert docs
-             try:
-                 pinecone.describe_index(index_name)
-             except:
-                 logging.info(f"Index {index_name} does not exist. Creating new index.")
-                 logging.info('Size of embedding used: '+str(embedding_size(query_model)))   # TODO: set this to be backed out of the embedding size
-                 pinecone.create_index(index_name,dimension=embedding_size(query_model))
-                 logging.info(f"Index {index_name} created. Adding {len(docs_out)} entries to index.")
-             else:
-                 logging.info(f"Index {index_name} exists. Adding {len(docs_out)} entries to index.")
-             index = pinecone.Index(index_name)
-             vectorstore = Pinecone(index, query_model, "page_content")   # Set the vector store to calculate embeddings on page_content
-             vectorstore = batch_upsert(index_type,
-                                        vectorstore,
-                                        docs_out,
-                                        batch_size=batch_size)
-         elif index_type=="ChromaDB":
-             # Upsert docs. Defaults to putting this in the local_db_path directory
-             logging.info(f"Creating new index {index_name}.")
-             persistent_client = chromadb.PersistentClient(path=local_db_path+'/chromadb')
-             vectorstore = Chroma(client=persistent_client,
-                                  collection_name=index_name,
-                                  embedding_function=query_model)
-             logging.info(f"Index {index_name} created. Adding {len(docs_out)} entries to index.")
-             vectorstore = batch_upsert(index_type,
-                                        vectorstore,
-                                        docs_out,
-                                        batch_size=batch_size)
-             logging.info(f"Documents upserted to {index_name}.")
-             # Test query
-             test_query = vectorstore.similarity_search('What are examples of aerospace adhesives to avoid?')
-             logging.info('Test query: '+str(test_query))
-             if not test_query:
-                 raise ValueError("Chroma vector database is not configured properly. Test query failed.")
-         elif index_type=="RAGatouille":
-             logging.info(f'Setting up RAGatouille model {query_model}')
-             vectorstore = RAGPretrainedModel.from_pretrained(query_model)
-             logging.info('RAGatouille model set: '+str(vectorstore))
-
-             # Create an index from the vectorstore.
-             docs_out_colbert = [doc.page_content for doc in docs_out]
-             if chunk_size>500:
-                 raise ValueError("RAGatouille cannot handle chunks larger than 500 tokens. Reduce token count.")
-             vectorstore.index(
-                 collection=docs_out_colbert,
-                 index_name=index_name,
-                 max_document_length=chunk_size,
-                 overwrite_index=True,
-                 split_documents=True,
-             )
-             logging.info(f"Index created: {vectorstore}")
-
-             # Move the directory to the db folder
-             logging.info(f"Moving RAGatouille index to {local_db_path}")
-             ragatouille_path = os.path.join(local_db_path, '.ragatouille')
-             if os.path.exists(ragatouille_path):
-                 shutil.rmtree(ragatouille_path)
-                 logging.info(f"RAGatouille index deleted from {ragatouille_path}")
-             shutil.move('./.ragatouille', local_db_path)
-             logging.info(f"RAGatouille index created in {local_db_path}: "+str(vectorstore))
-
-     # Return vectorstore or docs
-     if index_name:
-         return vectorstore
-     else:
-         return docs_out
- def delete_index(index_type,index_name,
-                  local_db_path='../db'):
-     """
-     Deletes an existing Pinecone index with the given index_name.
-     """
-     if index_type=="Pinecone":
-         # Import and initialize Pinecone client
-         pinecone.init(
-             api_key=PINECONE_API_KEY
-         )
-         try:
-             pinecone.describe_index(index_name)
-             logging.info(f"Index {index_name} exists.")
-         except:
-             raise Exception(f"Index {index_name} does not exist, cannot delete.")
-         else:
-             pinecone.delete_index(index_name)
-             logging.info(f"Index {index_name} deleted.")
-     elif index_type=="ChromaDB":
-         # Delete existing collection
-         logging.info(f"Deleting index {index_name}.")
-         persistent_client = chromadb.PersistentClient(path=local_db_path+'/chromadb')
-         persistent_client.delete_collection(name=index_name)
-         logging.info("Index deleted.")
-     elif index_type=="RAGatouille":
-         raise NotImplementedError
- def batch_upsert(index_type,vectorstore,docs_out,batch_size=50):
-     # Batch insert the chunks into the vector store
-     for i in range(0, len(docs_out), batch_size):
-         chunk_batch = docs_out[i:i + batch_size]
-         if index_type=="Pinecone":
-             vectorstore.add_documents(chunk_batch)
-         elif index_type=="ChromaDB":
-             vectorstore.add_documents(chunk_batch)   # Happens to be same for chroma/pinecone, leaving if statement just in case
-     return vectorstore
- def has_meaningful_content(page):
-     """
-     Test whether the page is at least 5 words and more than 30% alphanumeric characters.
-     """
-     text=page.page_content
-     num_words = len(text.split())
-     alphanumeric_pct = sum(c.isalnum() for c in text) / len(text)
-     if num_words < 5 or alphanumeric_pct < 0.3:
-         return False
-     else:
-         return True
- def embedding_size(embedding_model):
-     """
-     Returns the embedding size of the model.
-     """
-     if isinstance(embedding_model,OpenAIEmbeddings):
-         return 1536   # https://platform.openai.com/docs/models/embeddings, text-embedding-ada-002
-     elif isinstance(embedding_model,VoyageEmbeddings):
-         return 1024   # https://docs.voyageai.com/embeddings/, voyage-02
-     else:
-         raise NotImplementedError
- def process_chunk(json_file,llm,
-                   clean_data=False,tag_data=False,question_data=False):
-     docs_out=[]
-     with open(json_file, "r") as file_in:
-         file_data = [json.loads(line) for line in file_in]
-     # Process the file data and put it into the same format as docs_out
-     for line in file_data:
-         doc_temp = langchain_Document(page_content=line['page_content'],
-                                       source=line['metadata']['source'],
-                                       page=line['metadata']['page'],
-                                       metadata=line['metadata'])
-         docs_out.append(doc_temp)
-     # clean data
-     # tag data
-     # question data
scripts/pages/1_Chatbot_AMS_Modular.py DELETED
@@ -1,160 +0,0 @@
- import queries, setup
-
- import os
- import time
- import logging
- import json
-
- import pinecone
- import openai
-
- from langchain_community.vectorstores import Pinecone
- from langchain_community.vectorstores import Chroma
-
- from langchain_openai import OpenAIEmbeddings
- from langchain_community.embeddings import VoyageEmbeddings
-
- from langchain_openai import OpenAI, ChatOpenAI
- from langchain_community.llms import HuggingFaceHub
-
- from ragatouille import RAGPretrainedModel
-
- import streamlit as st
-
- # Set up the page, enable logging, read environment variables
- from dotenv import load_dotenv,find_dotenv
- load_dotenv(find_dotenv(),override=True)
- logging.basicConfig(filename='app_1_chatbot_ams_modular.log', filemode='w', format='%(name)s - %(levelname)s - %(message)s', level=logging.DEBUG)
-
- # Set the page title
- st.set_page_config(
-     page_title='Aerospace Chatbot: Modular',
-     layout='wide'
- )
- st.title('Aerospace Mechanisms Chatbot')
- with st.expander('''What's under the hood?'''):
-     st.markdown('''
-     This chatbot retrieves from all Aerospace Mechanism Symposia documents in the following location: https://github.com/dsmueller3760/aerospace_chatbot/tree/main/data/AMS
-     Example questions:
-     * What are examples of latch failures which have occurred due to improper fitup?
-     * What are examples of lubricants which should be avoided for space mechanism applications?
-     ''')
- filter_toggle=st.checkbox('Filter response with last received sources?')
-
- sb=setup.load_sidebar(config_file='../config/config.json',
-                       index_data_file='../config/index_data.json',
-                       vector_databases=True,
-                       embeddings=True,
-                       rag_type=True,
-                       index_name=True,
-                       llm=True,
-                       model_options=True,
-                       secret_keys=True)
-
- secrets=setup.set_secrets(sb)   # Take secrets from .env file first, otherwise from sidebar
-
- # Set up chat history
- if 'qa_model_obj' not in st.session_state:
-     st.session_state.qa_model_obj = []
- if 'message_id' not in st.session_state:
-     st.session_state.message_id = 0
- if 'messages' not in st.session_state:
-     st.session_state.messages = []
- for message in st.session_state.messages:
-     with st.chat_message(message['role']):
-         st.markdown(message['content'])
-
- # Define chat
- if prompt := st.chat_input('Prompt here'):
-     # User prompt
-     st.session_state.messages.append({'role': 'user', 'content': prompt})
-     with st.chat_message('user'):
-         st.markdown(prompt)
-     # Assistant response
-     with st.chat_message('assistant'):
-         message_placeholder = st.empty()
-
-         with st.status('Generating response...') as status:
-             t_start=time.time()
-
-             st.session_state.message_id += 1
-             st.write('Starting response generation for message: '+str(st.session_state.message_id))
-             logging.info('Starting response generation for message: '+str(st.session_state.message_id))
-
-             # Process some items
-             if sb['model_options']['output_level'] == 'Concise':
-                 out_token = 50
-             else:
-                 out_token = 516
-             logging.info('Output tokens: '+str(out_token))
-
-             if st.session_state.message_id==1:
-                 # Define embeddings
-                 if sb['query_model']=='Openai':
-                     query_model=OpenAIEmbeddings(model=sb['embedding_name'],openai_api_key=secrets['OPENAI_API_KEY'])
-                 elif sb['query_model']=='Voyage':
-                     query_model=VoyageEmbeddings(model=sb['embedding_name'],voyage_api_key=secrets['VOYAGE_API_KEY'])
-                 elif sb['index_type']=='RAGatouille':
-                     query_model=RAGPretrainedModel.from_index(sb['keys']['LOCAL_DB_PATH']+'/.ragatouille/colbert/indexes/'+sb['index_name'])
-                 logging.info('Query model set: '+str(query_model))
-
-                 # Define LLM
-                 if sb['llm_source']=='OpenAI':
-                     llm = ChatOpenAI(model_name=sb['llm_model'],
-                                      temperature=sb['model_options']['temperature'],
-                                      openai_api_key=secrets['OPENAI_API_KEY'],
-                                      max_tokens=out_token)
-                 elif sb['llm_source']=='Hugging Face':
-                     llm = HuggingFaceHub(repo_id=sb['llm_model'],
-                                          model_kwargs={"temperature": sb['model_options']['temperature'], "max_length": out_token})
-                 logging.info('LLM model set: '+str(llm))
-
-                 # Initialize QA model object
-                 if 'search_type' in sb['model_options']:
-                     search_type=sb['model_options']['search_type']
-                 else:
-                     search_type=None
-                 st.session_state.qa_model_obj=queries.QA_Model(sb['index_type'],
-                                                                sb['index_name'],
-                                                                query_model,
-                                                                llm,
-                                                                k=sb['model_options']['k'],
-                                                                search_type=search_type,
-                                                                filter_arg=False,
-                                                                local_db_path=sb['keys']['LOCAL_DB_PATH'])
-                 logging.info('QA model object set: '+str(st.session_state.qa_model_obj))
-             if st.session_state.message_id>1:
-                 logging.info('Updating model with sidebar settings...')
-                 # Update LLM
-                 if sb['llm_source']=='OpenAI':
-                     llm = ChatOpenAI(model_name=sb['llm_model'],
-                                      temperature=sb['model_options']['temperature'],
-                                      openai_api_key=secrets['OPENAI_API_KEY'],
-                                      max_tokens=out_token)
-                 elif sb['llm_source']=='Hugging Face':
-                     llm = HuggingFaceHub(repo_id=sb['llm_model'],
-                                          model_kwargs={"temperature": sb['model_options']['temperature'], "max_length": out_token})
-                 logging.info('LLM model set: '+str(llm))
-
-                 st.session_state.qa_model_obj.update_model(llm,
-                                                            k=sb['model_options']['k'],
-                                                            search_type=sb['model_options']['search_type'],
-                                                            filter_arg=filter_toggle)
-                 logging.info('QA model object updated: '+str(st.session_state.qa_model_obj))
-
-             st.write('Searching vector database, generating prompt...')
-             logging.info('Searching vector database, generating prompt...')
-             st.session_state.qa_model_obj.query_docs(prompt)
-             ai_response=st.session_state.qa_model_obj.result['answer'].content
-             message_placeholder.markdown(ai_response)
-             t_delta=time.time() - t_start
-             status.update(label='Prompt generated in '+"{:10.3f}".format(t_delta)+' seconds', state='complete', expanded=False)
-
-     st.session_state.messages.append({'role': 'assistant', 'content': ai_response})
-     logging.info(f'Messaging complete for {st.session_state.message_id}.')
-
- # Add reset button
- if st.button('Restart session'):
-     st.session_state.qa_model_obj = []
-     st.session_state.message_id = 0
-     st.session_state.messages = []
scripts/pages/2_Document_Upload.py DELETED
@@ -1,112 +0,0 @@
- import data_import, setup
-
- import os
- import time
- import logging
- import glob
-
- from langchain_openai import OpenAIEmbeddings
- from langchain_community.embeddings import VoyageEmbeddings
-
- from ragatouille import RAGPretrainedModel
-
- import streamlit as st
-
- # Set up the page, enable logging, read environment variables
- from dotenv import load_dotenv,find_dotenv
- load_dotenv(find_dotenv(),override=True)
- logging.basicConfig(filename='app_2_document_upload.log', filemode='w', format='%(name)s - %(levelname)s - %(message)s', level=logging.DEBUG)
-
- # Set the page title
- st.set_page_config(
-     page_title='Upload PDFs',
-     layout='wide'
- )
- st.title('Upload PDFs')
-
- sb=setup.load_sidebar(config_file='../config/config.json',
-                       index_data_file='../config/index_data.json',
-                       vector_databases=True,
-                       embeddings=True,
-                       index_name=True,
-                       secret_keys=True)
- secrets=setup.set_secrets(sb)   # Take secrets from .env file first, otherwise from sidebar
-
- # Populate the main screen
- logging.info(f'index_type test, {sb["index_type"]}')
-
- if sb["index_type"]=='RAGatouille':
-     logging.info('Set hugging face model for queries.')
-     query_model=sb['query_model']
- elif sb['query_model'] in ('Openai','Voyage'):
-     logging.info('Set embeddings model for queries.')
-     if sb['query_model']=='Openai':
-         query_model=OpenAIEmbeddings(model=sb['embedding_name'],openai_api_key=secrets['OPENAI_API_KEY'])
-     elif sb['query_model']=='Voyage':
-         query_model=VoyageEmbeddings(voyage_api_key=secrets['VOYAGE_API_KEY'])
- logging.info('Query model set: '+str(query_model))
-
- # Find docs
- index_name_md=st.markdown('Enter a directory relative to the current directory, or an absolute path.')
- data_folder = st.text_input('Enter a directory','../data/AMS/')
- if not os.path.isdir(data_folder):
-     st.error('The entered directory does not exist')
- docs = glob.glob(data_folder+'*.pdf')   # Only get the PDFs in the directory
- st.markdown('PDFs found: '+str(docs))
- st.markdown('Number of PDFs found: ' + str(len(docs)))
- logging.info('Docs: '+str(docs))
-
- # Add an expandable box for options
- with st.expander("Options"):
-     use_json = st.checkbox('Use existing jsonl, if available (will ignore chunk method, size, and overlap)?', value=True)
-     json_file=st.text_input('Jsonl file',data_folder+'ams_data.jsonl')
-     clear_database = st.checkbox('Clear existing database?')
-     chunk_method=st.selectbox('Chunk method', ['tiktoken_recursive'], index=0)
-     if sb['query_model']=='Openai' or sb['index_type']=='ChromaDB':
-         # OpenAI will time out if the batch size is too large
-         batch_size=st.number_input('Batch size for upsert', min_value=1, step=1, value=100)
-     else:
-         batch_size=None
-     if chunk_method=='tiktoken_recursive':
-         chunk_size=st.number_input('Chunk size (tokens)', min_value=1, step=1, value=500)
-         chunk_overlap=st.number_input('Chunk overlap (tokens)', min_value=0, step=1, value=0)
-     else:
-         raise NotImplementedError
-
- # Add a button to run the function
- if st.button('Chunk docs to jsonl file'):
-     start_time = time.time()   # Start the timer
-     data_import.chunk_docs(docs,
-                            file=json_file,
-                            chunk_method=chunk_method,
-                            chunk_size=chunk_size,
-                            chunk_overlap=chunk_overlap,
-                            use_json=False)
-     end_time = time.time()   # Stop the timer
-     elapsed_time = end_time - start_time
-     st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")
- if st.button('Load docs into vector database'):
-     start_time = time.time()   # Start the timer
-     data_import.load_docs(sb['index_type'],
-                           docs,
-                           query_model=query_model,
-                           index_name=sb['index_name'],
-                           chunk_size=chunk_size,
-                           chunk_overlap=chunk_overlap,
-                           use_json=use_json,
-                           clear=clear_database,
-                           file=json_file,
-                           batch_size=batch_size,
-                           local_db_path=sb['keys']['LOCAL_DB_PATH'])
-     end_time = time.time()   # Stop the timer
-     elapsed_time = end_time - start_time
-     st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")
- # Add a button to delete the index
- if st.button('Delete existing index'):
-     start_time = time.time()   # Start the timer
-     data_import.delete_index(sb['index_type'],
-                              sb['index_name'],
-                              local_db_path=sb['keys']['LOCAL_DB_PATH'])
-     end_time = time.time()   # Stop the timer
-     elapsed_time = end_time - start_time
-     st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")
scripts/pages/3_Visualize_Data.py DELETED
@@ -1,123 +0,0 @@
- import setup
-
- import time
- import logging
- from datetime import datetime
-
- from langchain_openai import OpenAIEmbeddings
- from langchain_community.embeddings import VoyageEmbeddings
-
- from ragxplorer import RAGxplorer
-
- import streamlit as st
-
- # Set up the page, enable logging, read environment variables
- from dotenv import load_dotenv,find_dotenv
- load_dotenv(find_dotenv(),override=True)
- logging.basicConfig(filename='app_3_visualize_data.log', filemode='w', format='%(name)s - %(levelname)s - %(message)s', level=logging.DEBUG)
-
- # Set the page title
- st.set_page_config(
-     page_title='Visualize Data',
-     layout='wide'
- )
- st.title('Visualize Data')
-
- sb=setup.load_sidebar(config_file='../config/config.json',
-                       index_data_file='../config/index_data.json',
-                       vector_databases=True,
-                       embeddings=True,
-                       index_name=True,
-                       secret_keys=True)
- secrets=setup.set_secrets(sb)   # Take secrets from .env file first, otherwise from sidebar
-
- # Set up session state variables
- if 'client' not in st.session_state:
-     st.session_state.client = None
-
- # Populate the main screen
- logging.info(f'index_type test, {sb["index_type"]}')
-
- if sb["index_type"]=='RAGatouille':
-     raise Exception('Only index type ChromaDB is supported for this function.')
- elif sb["index_type"]=='Pinecone':
-     raise Exception('Only index type ChromaDB is supported for this function.')
- elif sb['query_model'] in ('Openai','Voyage'):
-     logging.info('Set embeddings model for queries.')
-     if sb['query_model']=='Openai':
-         query_model=OpenAIEmbeddings(model=sb['embedding_name'],openai_api_key=secrets['OPENAI_API_KEY'])
-     elif sb['query_model']=='Voyage':
-         query_model=VoyageEmbeddings(voyage_api_key=secrets['VOYAGE_API_KEY'])
-     logging.info('Query model set: '+str(query_model))
-
- st.info('You must have created a database using Document Upload in ChromaDB for this to work.')
-
- # Add an expandable with description of what's going on.
- with st.expander("Under the hood",expanded=True):
-     st.markdown('''
-     Uses a modified version of https://github.com/gabrielchua/RAGxplorer/tree/main?tab=readme-ov-file to connect to the existing database created.
-     Modified version here: https://github.com/dsmueller3760/RAGxplorer/tree/load_db
-     Assumes that chroma databases are located in the local_db_path variable.
-     Query size in database: take a random sample of this size from the database to visualize.
-     ''')
-
- with st.expander("Create visualization data",expanded=True):
-     # Add a button to run the function
-     limit_size = st.checkbox('Limit size of data visualization?', value=True)
-     if limit_size:
-         vector_qty=st.number_input('Query size in database', min_value=1, step=10, value=50)
-     else:
-         vector_qty=None
-     export_df = st.checkbox('Export visualization data?', value=True)
-     if export_df:
-         current_time = datetime.now().strftime("%Y.%m.%d.%H.%M")
-         if limit_size:
-             df_export_path = st.text_input('Export file', f'../data/AMS/ams_data-400-0-{vector_qty}.json')
-         else:
-             df_export_path=st.text_input('Export file', f'../data/AMS/ams_data-400-0-all.json')
-     if st.button('Create visualization data'):
-         start_time = time.time()   # Start the timer
-
-         st.session_state.client = RAGxplorer(embedding_model=sb['embedding_name'])
-         st.session_state.client.load_db(path_to_db=sb['keys']['LOCAL_DB_PATH']+'/chromadb/',
-                                         index_name=sb['index_name'],
-                                         df_export_path=df_export_path,
-                                         vector_qty=vector_qty,
-                                         umap_params={'n_neighbors': 5,
-                                                      'n_components': 2,
-                                                      'random_state': 42},
-                                         verbose=True)
-
-         end_time = time.time()   # Stop the timer
-         elapsed_time = end_time - start_time
-         st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")
-
- with st.expander("Visualize data",expanded=True):
-     import_data = st.checkbox('Import visualization data?', value=True)
-     if import_data:
-         import_file = st.file_uploader("Import file", type="json")
-         if import_file is None:
-             # Use a default file
-             import_file_path=st.text_input('Import file',df_export_path)
-         else:
-             # Use the uploaded file
-             import_file_path=st.text_input('Import file',f'../data/AMS/{import_file.name}')
-     else:
-         import_file_path=None
-
-     query = st.text_input('Query', 'What are examples of lubricants which should be avoided for space mechanism applications?')
-
-     if st.button('Visualize data'):
-         start_time = time.time()   # Start the timer
-
-         if st.session_state.client is None:
-             st.session_state.client = RAGxplorer(embedding_model=sb['embedding_name'])
-
-         fig = st.session_state.client.visualize_query(query,
-                                                       path_to_db=sb['keys']['LOCAL_DB_PATH']+'/chromadb/',
-                                                       viz_data_df_path=import_file_path,
-                                                       verbose=True)
-         st.plotly_chart(fig,use_container_width=True)
-
-         end_time = time.time()   # Stop the timer
-         elapsed_time = end_time - start_time
scripts/pages/4_Clean_and_Question.py DELETED
@@ -1,86 +0,0 @@
- import setup
- import data_import
-
- import time
- import logging
- import json
- from datetime import datetime
-
- from langchain_openai import OpenAIEmbeddings
- from langchain_community.embeddings import VoyageEmbeddings
-
- from langchain_openai import OpenAI, ChatOpenAI
- from langchain_community.llms import HuggingFaceHub
-
- import streamlit as st
-
- # Set up the page, enable logging
- from dotenv import load_dotenv,find_dotenv
- load_dotenv(find_dotenv(),override=True)
- logging.basicConfig(filename='app_4_clean_and_question.log', filemode='w', format='%(name)s - %(levelname)s - %(message)s', level=logging.DEBUG)
-
- # Set the page title
- st.set_page_config(
-     page_title='Clean and Question Data',
-     layout='wide'
- )
- st.title('Clean and Question Data')
- # TODO: add database status icons
- sb=setup.load_sidebar(config_file='../config/config.json',
-                       index_data_file='../config/index_data.json',
-                       llm=True,
-                       model_options=True,
-                       secret_keys=True)
- secrets=setup.set_secrets(sb)   # Take secrets from .env file first, otherwise from sidebar
-
- # This is janky but works (needs secrets to initialize properly)
- from ragxplorer import RAGxplorer
-
- # Set up session state variables
- if 'client' not in st.session_state:
-     st.session_state.client = None
-
- # Populate the main screen
- # Add an expandable with description of what's going on.
- with st.expander("Under the hood",expanded=True):
-     st.markdown('''
-
-     ''')
-
- chunked_file = st.text_input('Chunked raw text file', f'../data/AMS/ams_data-400-0.jsonl')
-
- with st.expander("Process Chunked Data",expanded=True):
-     clean_data = st.checkbox('Clean data?', value=True)
-     tag_data = st.checkbox('Tag data?', value=True)
-     question_data = st.checkbox('Generate questions from data?', value=True)
-     if sb['model_options']['output_level'] == 'Concise':
-         out_token = 50
-     else:
-         out_token = 516
-
-     # Define LLM
-     if sb['llm_source']=='OpenAI':
-         llm = ChatOpenAI(model_name=sb['llm_model'],
-                          temperature=sb['model_options']['temperature'],
-                          openai_api_key=secrets['OPENAI_API_KEY'],
-                          max_tokens=out_token)
-     elif sb['llm_source']=='Hugging Face':
-         llm = HuggingFaceHub(repo_id=sb['llm_model'],
-                              model_kwargs={"temperature": sb['model_options']['temperature'], "max_length": out_token})
-
-     if clean_data or tag_data or question_data:
-         param_cleaning=None
-     if clean_data:
-         n_tags=None
-     if question_data:
-         n_questions=None
-
-     if st.button('Process chunked data'):
-         start_time = time.time()   # Start the timer
-
-         data_import.process_chunk(chunked_file,llm,
-                                   clean_data=False,tag_data=False,question_data=False)
-
-         end_time = time.time()   # Stop the timer
-         elapsed_time = end_time - start_time
-         st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")
scripts/prompts.py DELETED
@@ -1,12 +0,0 @@
- from langchain import hub
- from langchain.prompts.prompt import PromptTemplate
-
- # Prompts on the hub: https://smith.langchain.com/hub/my-prompts?organizationId=45eb8917-7353-4296-978d-bb461fc45c65
- CONDENSE_QUESTION_PROMPT = hub.pull("dmueller/ams-chatbot-qa-condense-history")
- QA_PROMPT=hub.pull("dmueller/ams-chatbot-qa-retrieval")
- QA_WSOURCES_PROMPT=hub.pull("dmueller/ams-chatbot-qa-retrieval-wsources")
- QA_GENERATE_PROMPT=hub.pull("dmueller/generate_qa_prompt")
-
- # Prompts defined here only
- DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
- TEST_QUERY_PROMPT='What are examples of adhesives to use when potting motors for launch vehicle or spacecraft mechanisms?'
scripts/queries.py DELETED
@@ -1,278 +0,0 @@
- import os
- import logging
- import re
-
- from dotenv import load_dotenv, find_dotenv
-
- import openai
- import pinecone
- import chromadb
-
- from langchain_community.vectorstores import Pinecone
- from langchain_community.vectorstores import Chroma
-
- from langchain.memory import ConversationBufferMemory
-
- from operator import itemgetter
- from langchain_core.output_parsers import StrOutputParser
- from langchain_core.runnables import RunnableLambda, RunnablePassthrough
- from langchain.schema import format_document
- from langchain_core.messages import get_buffer_string
-
- from prompts import CONDENSE_QUESTION_PROMPT, QA_PROMPT, DEFAULT_DOCUMENT_PROMPT, TEST_QUERY_PROMPT
-
- # Set secrets from environment file
- OPENAI_API_KEY=os.getenv('OPENAI_API_KEY')
- VOYAGE_API_KEY=os.getenv('VOYAGE_API_KEY')
- PINECONE_API_KEY=os.getenv('PINECONE_API_KEY')
- HUGGINGFACEHUB_API_TOKEN=os.getenv('HUGGINGFACEHUB_API_TOKEN')
-
- # Class and functions
- class QA_Model:
-     def __init__(self,
-                  index_type,
-                  index_name,
-                  query_model,
-                  llm,
-                  k=6,
-                  search_type='similarity',
-                  fetch_k=50,
-                  temperature=0,
-                  chain_type='stuff',
-                  filter_arg=False,
-                  local_db_path='../db'):
-
-         self.index_type=index_type
-         self.index_name=index_name
-         self.query_model=query_model
-         self.llm=llm
-         self.k=k
-         self.search_type=search_type
-         self.fetch_k=fetch_k
-         self.temperature=temperature
-         self.chain_type=chain_type
-         self.filter_arg=filter_arg
-         self.sources=[]
-
-         load_dotenv(find_dotenv(),override=True)
-
-         # Define retriever search parameters
-         search_kwargs = _process_retriever_args(self.filter_arg,
-                                                 self.sources,
-                                                 self.search_type,
-                                                 self.k,
-                                                 self.fetch_k)
-
-         # Read in from the vector database
-         if index_type=='Pinecone':
-             pinecone.init(
-                 api_key=PINECONE_API_KEY
-             )
-             logging.info('Chat pinecone index name: '+str(index_name))
-             logging.info('Chat query model: '+str(query_model))
-             index = pinecone.Index(index_name)
-             self.vectorstore = Pinecone(index,query_model,'page_content')
-             logging.info('Chat vectorstore: '+str(self.vectorstore))
-
-             # Test query
-             try:
-                 test_query = self.vectorstore.similarity_search(TEST_QUERY_PROMPT)
-             except:
-                 raise Exception("Pinecone vector database is not configured properly. Test query failed. Likely the index does not exist.")
-             logging.info('Test query: '+str(test_query))
-             if not test_query:
-                 raise ValueError("Pinecone vector database is not configured properly. Test query failed.")
-             else:
-                 logging.info('Test query succeeded!')
-
-             self.retriever=self.vectorstore.as_retriever(search_type=search_type,
-                                                          search_kwargs=search_kwargs)
-             logging.info('Chat retriever: '+str(self.retriever))
-         elif index_type=='ChromaDB':
-             logging.info('Chat chroma index name: '+str(index_name))
-             logging.info('Chat query model: '+str(query_model))
-             persistent_client = chromadb.PersistentClient(path=local_db_path+'/chromadb')
-             self.vectorstore = Chroma(client=persistent_client,
-                                       collection_name=index_name,
-                                       embedding_function=query_model)
-             logging.info('Chat vectorstore: '+str(self.vectorstore))
-
-             # Test query
-             try:
-                 test_query = self.vectorstore.similarity_search(TEST_QUERY_PROMPT)
-             except:
-                 raise Exception("Chroma vector database is not configured properly. Test query failed. Likely the index does not exist.")
-             logging.info('Test query: '+str(test_query))
-             if not test_query:
-                 raise ValueError("Chroma vector database is not configured properly. Test query failed.")
-             else:
-                 logging.info('Test query succeeded!')
-
-             self.retriever=self.vectorstore.as_retriever(search_type=search_type,
-                                                          search_kwargs=search_kwargs)
-             logging.info('Chat retriever: '+str(self.retriever))
-         elif index_type=='RAGatouille':
-             # Easy because the index is picked up directly.
-             self.vectorstore=query_model
-             logging.info('Chat query model: '+str(query_model))
-
-             # Test query
-             try:
-                 test_query = self.vectorstore.search(TEST_QUERY_PROMPT)
-             except:
-                 raise Exception("RAGatouille vector database is not configured properly.")
-             logging.info('Test query: '+str(test_query))
-             if not test_query:
-                 raise ValueError("RAGatouille vector database is not configured properly. Test query failed.")
-             else:
-                 logging.info('Test query succeeded!')
-
-             self.retriever=self.vectorstore.as_langchain_retriever()
-             logging.info('Chat retriever: '+str(self.retriever))
-
-         # Initialize memory
-         self.memory = ConversationBufferMemory(
-             return_messages=True, output_key='answer', input_key='question')
-         logging.info('Memory: '+str(self.memory))
-
-         # Assemble main chain
-         self.conversational_qa_chain=_define_qa_chain(self.llm,
-                                                       self.retriever,
-                                                       self.memory,
-                                                       self.search_type,
-                                                       search_kwargs)
-     def query_docs(self,query):
-         self.memory.load_memory_variables({})
-         logging.info('Memory content before qa result: '+str(self.memory))
-
-         logging.info('Query: '+str(query))
-         self.result = self.conversational_qa_chain.invoke({'question': query})
-         logging.info('QA result: '+str(self.result))
-
-         if self.index_type!='RAGatouille':
-             self.sources = '\n'.join(str(data.metadata) for data in self.result['references'])
-             self.result['answer'].content += '\nSources: \n'+self.sources
-             logging.info('Sources: '+str(self.sources))
-             logging.info('Response with sources: '+str(self.result['answer'].content))
-         else:
-             # RAGatouille doesn't have metadata, need to extract from context first.
-             extracted_metadata = []
-             pattern = r'\{([^}]*)\}(?=[^{}]*$)'   # Regular expression pattern to match the last curly braces
-
-             for ref in self.result['references']:
-                 match = re.search(pattern, ref.page_content)
-                 if match:
-                     extracted_metadata.append("{"+match.group(1)+"}")
-             self.sources = '\n'.join(extracted_metadata)
-             self.result['answer'].content += '\nSources: \n'+self.sources
-             logging.info('Sources: '+str(self.sources))
-             logging.info('Response with sources: '+str(self.result['answer'].content))
-
-         self.memory.save_context({'question': query}, {'answer': self.result['answer'].content})
-         logging.info('Memory content after qa result: '+str(self.memory))
-
-     def update_model(self,
-                      llm,
-                      k=6,
-                      search_type='similarity',
-                      fetch_k=50,
-                      filter_arg=False):
-
-         self.llm=llm
-         self.k=k
-         self.search_type=search_type
-         self.fetch_k=fetch_k
-         self.filter_arg=filter_arg
-
-         # Define retriever search parameters
-         search_kwargs = _process_retriever_args(self.filter_arg,
-                                                 self.sources,
-                                                 self.search_type,
-                                                 self.k,
-                                                 self.fetch_k)
-         # Update conversational retrieval chain
-         self.conversational_qa_chain=_define_qa_chain(self.llm,
-                                                       self.retriever,
-                                                       self.memory,
-                                                       self.search_type,
-                                                       search_kwargs)
-         logging.info('Updated qa chain: '+str(self.conversational_qa_chain))
-
- # Internal functions
- def _combine_documents(docs,
-                        document_prompt=DEFAULT_DOCUMENT_PROMPT,
-                        document_separator='\n\n'):
-     '''
-     Combine a list of documents into a single string.
-     '''
-     # TODO: this would be where stuff, map reduce, etc. would go
-     doc_strings = [format_document(doc, document_prompt) for doc in docs]
-     return document_separator.join(doc_strings)
- def _define_qa_chain(llm,
-                      retriever,
-                      memory,
-                      search_type,
-                      search_kwargs):
-     '''
-     Define the conversational QA chain.
-     '''
-     # This adds a 'memory' key to the input object
-     loaded_memory = RunnablePassthrough.assign(
-         chat_history=RunnableLambda(memory.load_memory_variables)
-         | itemgetter('history'))
-     logging.info('Loaded memory: '+str(loaded_memory))
-
-     # Assemble main chain
-     standalone_question = {
-         'standalone_question': {
-             'question': lambda x: x['question'],
-             'chat_history': lambda x: get_buffer_string(x['chat_history'])}
-         | CONDENSE_QUESTION_PROMPT
-         | llm
-         | StrOutputParser()}
-     logging.info('Condense inputs as a standalone question: '+str(standalone_question))
-     retrieved_documents = {
-         'source_documents': itemgetter('standalone_question')
-         | retriever,
-         'question': lambda x: x['standalone_question']}
-     logging.info('Retrieved documents: '+str(retrieved_documents))
-     # Now we construct the inputs for the final prompt
-     final_inputs = {
-         'context': lambda x: _combine_documents(x['source_documents']),
-         'question': itemgetter('question')}
-     logging.info('Combined documents: '+str(final_inputs))
-     # And finally, we do the part that returns the answers
-     answer = {
-         'answer': final_inputs
-         | QA_PROMPT
-         | llm,
-         'references': itemgetter('source_documents')}
-     conversational_qa_chain = loaded_memory | standalone_question | retrieved_documents | answer
-     logging.info('Conversational QA chain: '+str(conversational_qa_chain))
-     return conversational_qa_chain
- def _process_retriever_args(filter_arg,
-                             sources,
-                             search_type,
-                             k,
-                             fetch_k):
-     '''
-     Process arguments for the retriever.
-     '''
-     # Implement filter
-     if filter_arg:
-         filter_list = list(set(item['source'] for item in sources[-1]))
-         filter_items=[]
-         for item in filter_list:
-             filter_item={'source': item}
-             filter_items.append(filter_item)
-         filter={'$or':filter_items}
-     else:
-         filter=None
-
-     # Implement filtering and number of documents to return
-     if search_type=='mmr':
-         search_kwargs={'k':k,'fetch_k':fetch_k,'filter':filter}   # See as_retriever docs for parameters
-     else:
-         search_kwargs={'k':k,'filter':filter}   # See as_retriever docs for parameters
-
-     return search_kwargs
scripts/setup.py DELETED
@@ -1,168 +0,0 @@
- import os
- import logging
- import json
-
- import openai
-
- import streamlit as st
-
- # Set up the page, enable logging
- from dotenv import load_dotenv,find_dotenv
- load_dotenv(find_dotenv(),override=True)
-
- def load_sidebar(config_file,
-                  index_data_file,
-                  vector_databases=False,
-                  embeddings=False,
-                  rag_type=False,
-                  index_name=False,
-                  llm=False,
-                  model_options=False,
-                  secret_keys=False):
-     """
-     Sets up the sidebar based on toggled options. Returns variables with options.
-     """
-     sb_out={}
-     with open(config_file, 'r') as f:
-         config = json.load(f)
-         databases = {db['name']: db for db in config['databases']}
-         llms = {m['name']: m for m in config['llms']}
-         logging.info('Loaded: '+config_file)
-     with open(index_data_file, 'r') as f:
-         index_data = json.load(f)
-         logging.info('Loaded: '+index_data_file)
-
-     if vector_databases:
-         # Vector databases
-         st.sidebar.title('Vector database')
-         sb_out['index_type']=st.sidebar.selectbox('Index type', list(databases.keys()), index=1)
-         logging.info('Index type: '+sb_out['index_type'])
-
-     if embeddings:
-         # Embeddings
-         st.sidebar.title('Embeddings')
-         if sb_out['index_type']=='RAGatouille': # Default to selecting hugging face model for RAGatouille, otherwise select alternates
-             sb_out['query_model']=st.sidebar.selectbox('Hugging face rag models', databases[sb_out['index_type']]['hf_rag_models'], index=0)
-         else:
-             sb_out['query_model']=st.sidebar.selectbox('Embedding models', databases[sb_out['index_type']]['embedding_models'], index=0)
-
-         if sb_out['query_model']=='Openai':
-             sb_out['embedding_name']='text-embedding-ada-002'
-         elif sb_out['query_model']=='Voyage':
-             sb_out['embedding_name']='voyage-02'
-         logging.info('Query type: '+sb_out['query_model'])
-         if 'embedding_name' in locals() or 'embedding_name' in globals():
-             logging.info('Embedding name: '+sb_out['embedding_name'])
-     if rag_type:
-         if sb_out['index_type']!='RAGatouille': # RAGatouille doesn't have a rag_type
-             # RAG Type
-             st.sidebar.title('RAG Type')
-             sb_out['rag_type']=st.sidebar.selectbox('RAG type', config['rag_types'], index=0)
-             sb_out['smart_agent']=st.sidebar.checkbox('Smart agent?')
-             logging.info('RAG type: '+sb_out['rag_type'])
-             logging.info('Smart agent: '+str(sb_out['smart_agent']))
-     if index_name:
-         # Index Name
-         st.sidebar.title('Index Name')
-         sb_out['index_name']=index_data[sb_out['index_type']][sb_out['query_model']]
-         st.sidebar.markdown('Index name: '+sb_out['index_name'])
-         logging.info('Index name: '+sb_out['index_name'])
-     if llm:
-         # LLM
-         st.sidebar.title('LLM')
-         sb_out['llm_source']=st.sidebar.selectbox('LLM model', list(llms.keys()), index=0)
-         logging.info('LLM source: '+sb_out['llm_source'])
-         if sb_out['llm_source']=='OpenAI':
-             sb_out['llm_model']=st.sidebar.selectbox('OpenAI model', llms[sb_out['llm_source']]['models'], index=0)
-         if sb_out['llm_source']=='Hugging Face':
-             sb_out['llm_model']=st.sidebar.selectbox('Hugging Face model', llms[sb_out['llm_source']]['models'], index=0)
-     if model_options:
-         # Add input fields in the sidebar
-         st.sidebar.title('LLM Options')
-         temperature = st.sidebar.slider('Temperature', min_value=0.0, max_value=2.0, value=0.0, step=0.1)
-         output_level = st.sidebar.selectbox('Level of Output', ['Concise', 'Detailed'], index=1)
-
-         if 'index_type' in sb_out:
-             st.sidebar.title('Retrieval Options')
-             k = st.sidebar.number_input('Number of items per prompt', min_value=1, step=1, value=4)
-             if sb_out['index_type']!='RAGatouille':
-                 search_type = st.sidebar.selectbox('Search Type', ['similarity', 'mmr'], index=0)
-                 sb_out['model_options']={'output_level':output_level,
-                                          'k':k,
-                                          'search_type':search_type,
-                                          'temperature':temperature}
-         else:
-             sb_out['model_options']={'output_level':output_level,
-                                      'temperature':temperature}
-         logging.info('Model options: '+str(sb_out['model_options']))
-     if secret_keys:
-         # Add a section for secret keys
-         st.sidebar.title('Secret keys')
-         st.sidebar.markdown('If .env file is in directory, will use that first.')
-         sb_out['keys']={}
-         if 'llm_source' in sb_out and sb_out['llm_source'] == 'OpenAI':
-             sb_out['keys']['OPENAI_API_KEY'] = st.sidebar.text_input('OpenAI API Key', type='password')
-         elif 'query_model' in sb_out and sb_out['query_model'] == 'Openai':
-             sb_out['keys']['OPENAI_API_KEY'] = st.sidebar.text_input('OpenAI API Key', type='password')
-         if 'llm_source' in sb_out and sb_out['llm_source']=='Hugging Face':
-             sb_out['keys']['HUGGINGFACEHUB_API_TOKEN'] = st.sidebar.text_input('Hugging Face API Key', type='password')
-         if 'query_model' in sb_out and sb_out['query_model']=='Voyage':
-             sb_out['keys']['VOYAGE_API_KEY'] = st.sidebar.text_input('Voyage API Key', type='password')
-         if 'index_type' in sb_out and sb_out['index_type']=='Pinecone':
-             sb_out['keys']['PINECONE_API_KEY']=st.sidebar.text_input('Pinecone API Key',type='password')
-         if os.getenv('LOCAL_DB_PATH') is None:
-             sb_out['keys']['LOCAL_DB_PATH'] = st.sidebar.text_input('Local Database Path','/data',help='Path to local database (e.g. chroma)')
-             os.environ['LOCAL_DB_PATH'] = sb_out['keys']['LOCAL_DB_PATH']
-         else:
-             sb_out['keys']['LOCAL_DB_PATH'] = os.getenv('LOCAL_DB_PATH')
-             st.sidebar.markdown('Local Database Path: '+sb_out['keys']['LOCAL_DB_PATH'],help='From .env file.')
-
-     return sb_out
-
- def set_secrets(sb):
-     """
-     Sets secrets from environment file, or from sidebar if not available.
-     """
-     secrets={}
-
-     secrets['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
-     logging.info('OpenAI API Key: '+str(secrets['OPENAI_API_KEY']))
-     if not secrets['OPENAI_API_KEY'] and 'keys' in sb and 'OPENAI_API_KEY' in sb['keys']:
-         logging.info('Setting OpenAI API Key from sidebar...')
-         secrets['OPENAI_API_KEY'] = sb['keys']['OPENAI_API_KEY']
-         os.environ['OPENAI_API_KEY'] = secrets['OPENAI_API_KEY']
-         logging.info('OpenAI API Key: '+str(os.environ['OPENAI_API_KEY']))
-         if os.environ['OPENAI_API_KEY']=='':
-             raise Exception('OpenAI API Key is required.')
-     openai.api_key = secrets['OPENAI_API_KEY']
-
-     secrets['VOYAGE_API_KEY'] = os.getenv('VOYAGE_API_KEY')
-     logging.info('Voyage API Key: '+str(secrets['VOYAGE_API_KEY']))
-     if not secrets['VOYAGE_API_KEY'] and 'keys' in sb and 'VOYAGE_API_KEY' in sb['keys']:
-         logging.info('Setting Voyage API Key from sidebar...')
-         secrets['VOYAGE_API_KEY'] = sb['keys']['VOYAGE_API_KEY']
-         os.environ['VOYAGE_API_KEY'] = secrets['VOYAGE_API_KEY']
-         logging.info('Voyage API Key: '+str(os.environ['VOYAGE_API_KEY']))
-         if os.environ['VOYAGE_API_KEY']=='':
-             raise Exception('Voyage API Key is required.')
-
-     secrets['PINECONE_API_KEY'] = os.getenv('PINECONE_API_KEY')
-     logging.info('Pinecone API Key: '+str(secrets['PINECONE_API_KEY']))
-     if not secrets['PINECONE_API_KEY'] and 'keys' in sb and 'PINECONE_API_KEY' in sb['keys']:
-         logging.info('Setting Pinecone API Key from sidebar...')
-         secrets['PINECONE_API_KEY'] = sb['keys']['PINECONE_API_KEY']
-         os.environ['PINECONE_API_KEY'] = secrets['PINECONE_API_KEY']
-         logging.info('Pinecone API Key: '+str(os.environ['PINECONE_API_KEY']))
-         if os.environ['PINECONE_API_KEY']=='':
-             raise Exception('Pinecone API Key is required.')
-
-     secrets['HUGGINGFACEHUB_API_TOKEN'] = os.getenv('HUGGINGFACEHUB_API_TOKEN')
-     logging.info('Hugging Face API Key: '+str(secrets['HUGGINGFACEHUB_API_TOKEN']))
-     if not secrets['HUGGINGFACEHUB_API_TOKEN'] and 'keys' in sb and 'HUGGINGFACEHUB_API_TOKEN' in sb['keys']:
-         logging.info('Setting Hugging Face API Key from sidebar...')
-         secrets['HUGGINGFACEHUB_API_TOKEN'] = sb['keys']['HUGGINGFACEHUB_API_TOKEN']
-         os.environ['HUGGINGFACEHUB_API_TOKEN'] = secrets['HUGGINGFACEHUB_API_TOKEN']
-         logging.info('Hugging Face API Key: '+str(os.environ['HUGGINGFACEHUB_API_TOKEN']))
-         if os.environ['HUGGINGFACEHUB_API_TOKEN']=='':
-             raise Exception('Hugging Face API Key is required.')
-     return secrets
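Note: the Streamlit pages under `src` consumed this module roughly as sketched below. The caller is hypothetical: the `index_data.json` filename is only inferred from the `index_data_file` parameter, and the toggle combination is illustrative.

# Hypothetical usage of the deleted module; paths and toggles are illustrative.
from setup import load_sidebar, set_secrets

sb = load_sidebar('../config/config.json',
                  '../config/index_data.json',   # assumed filename
                  vector_databases=True,
                  embeddings=True,
                  index_name=True,
                  llm=True,
                  model_options=True,
                  secret_keys=True)
secrets = set_secrets(sb)   # falls back to sidebar-entered keys; raises if a required key is empty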