kit086 committed on
Commit
807eac7
·
1 Parent(s): 108801c

refactor: 调整 vectordb 为 faiss 并更新依赖

Browse files
Files changed (5) hide show
  1. .gitignore +1 -0
  2. agent.py +26 -29
  3. pyproject.toml +1 -0
  4. requirements.txt +1 -0
  5. uv.lock +24 -0
.gitignore CHANGED
@@ -2,3 +2,4 @@
2
  .env
3
  **/__pycache__
4
  chroma_db
 
 
2
  .env
3
  **/__pycache__
4
  chroma_db
5
+ faiss_index
agent.py CHANGED
@@ -11,7 +11,7 @@ from langchain_community.document_loaders import ArxivLoader
11
  from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
12
  from langchain_core.tools import tool
13
  from langchain.tools.retriever import create_retriever_tool
14
- from langchain_community.vectorstores import Chroma
15
  from langchain_core.documents import Document
16
  import shutil
17
  import pandas as pd # Ny import för pandas
@@ -118,24 +118,22 @@ with open("system_prompt.txt", "r", encoding="utf-8") as f:
118
  system_prompt = f.read()
119
 
120
  # Retrieval
121
- CHROMA_DIR = "./chroma_db"
122
  CSV_PATH = "./supabase_docs.csv"
123
- EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2"
 
124
  _SIMILARITY_THRESHOLD = 0.2 # lower distance means more similar
125
 
126
- embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
127
 
128
- if os.path.exists(CHROMA_DIR):
129
- print(f"Loading existing ChromaDB from {CHROMA_DIR}")
130
- vector_store = Chroma(
131
- persist_directory=CHROMA_DIR,
132
- embedding_function=embeddings,
133
- )
134
  else:
135
- print(f"Creating new ChromaDB at {CHROMA_DIR}, and loading documents from {CSV_PATH}")
136
- if os.path.exists(CHROMA_DIR):
137
- shutil.rmtree(CHROMA_DIR)
138
- os.makedirs(CHROMA_DIR)
139
 
140
  if not os.path.exists(CSV_PATH):
141
  raise FileNotFoundError(f"CSV file {CSV_PATH} does not exist")
@@ -157,28 +155,27 @@ else:
157
 
158
  documents.append(Document(page_content=question_part, metadata=metadata))
159
 
 
 
 
 
160
  if not documents:
161
- print("No documents loaded from CSV. ChromaDB will be empty.")
162
 
163
- vector_store = Chroma(
164
- persist_directory=CHROMA_DIR,
165
- embedding_function=embeddings
166
- )
167
  else:
168
- vector_store = Chroma.from_documents(
169
- documents=documents,
170
- embedding=embeddings,
171
- persist_directory=CHROMA_DIR,
172
- )
173
- vector_store.persist()
174
- print(f"ChromaDB initialized and persisted with {len(documents)} documents from CSV.")
175
 
176
 
177
  # Retriever tool
178
  retriever_tool = create_retriever_tool(
179
- retriever = vector_store.as_retriever(),
180
- name = "Question_Search",
181
- description = "A tool to retrieve similar questions from a vector store. The retrieved document's metadata contains the 'final_answer' to the question."
182
  )
183
 
184
  # Agent
 
11
  from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
12
  from langchain_core.tools import tool
13
  from langchain.tools.retriever import create_retriever_tool
14
+ from langchain_community.vectorstores import FAISS
15
  from langchain_core.documents import Document
16
  import shutil
17
  import pandas as pd # Ny import för pandas
 
118
  system_prompt = f.read()
119
 
120
  # Retrieval
121
+ INDEX_DIR = "./faiss_index"
122
  CSV_PATH = "./supabase_docs.csv"
123
+ # Use a lightweight 384-dim model to avoid excessive RAM and potential onnxruntime crashes
124
+ EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
125
  _SIMILARITY_THRESHOLD = 0.2 # lower distance means more similar
126
 
127
+ embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL, model_kwargs={"device": "cpu"})
128
 
129
+ if os.path.exists(INDEX_DIR):
130
+ print(f"Loading existing FAISS index from {INDEX_DIR}")
131
+ vector_store = FAISS.load_local(INDEX_DIR, embeddings, allow_dangerous_deserialization=True)
 
 
 
132
  else:
133
+ print(f"Creating new FAISS index at {INDEX_DIR}, and loading documents from {CSV_PATH}")
134
+ if os.path.exists(INDEX_DIR):
135
+ shutil.rmtree(INDEX_DIR)
136
+ os.makedirs(INDEX_DIR)
137
 
138
  if not os.path.exists(CSV_PATH):
139
  raise FileNotFoundError(f"CSV file {CSV_PATH} does not exist")
 
155
 
156
  documents.append(Document(page_content=question_part, metadata=metadata))
157
 
158
+ # Simple progress indicator every 200 docs
159
+ if (i + 1) % 200 == 0:
160
+ print(f"Prepared {i + 1}/{len(df)} documents for embedding…")
161
+
162
  if not documents:
163
+ print("No documents loaded from CSV. FAISS index will be empty.")
164
 
165
+ vector_store = FAISS.from_documents(documents=[], embedding=embeddings)
166
+ vector_store.save_local(INDEX_DIR)
 
 
167
  else:
168
+ print("Embedding documents and building FAISS index — this may take a few minutes…")
169
+ vector_store = FAISS.from_documents(documents=documents, embedding=embeddings)
170
+ vector_store.save_local(INDEX_DIR)
171
+ print(f"FAISS index built and saved with {len(documents)} documents from CSV.")
 
 
 
172
 
173
 
174
  # Retriever tool
175
  retriever_tool = create_retriever_tool(
176
+ retriever=vector_store.as_retriever(),
177
+ name="Question_Search",
178
+ description="Retrieve similar questions from FAISS index; metadata includes 'final_answer'."
179
  )
180
 
181
  # Agent
pyproject.toml CHANGED
@@ -8,6 +8,7 @@ dependencies = [
8
  "chromadb>=1.0.15",
9
  "ddgs>=9.0.0",
10
  "duckduckgo-search>=8.1.1",
 
11
  "gradio[oauth]>=5.36.2",
12
  "langchain>=0.3.26",
13
  "langchain-chroma>=0.2.4",
 
8
  "chromadb>=1.0.15",
9
  "ddgs>=9.0.0",
10
  "duckduckgo-search>=8.1.1",
11
+ "faiss-cpu>=1.11.0",
12
  "gradio[oauth]>=5.36.2",
13
  "langchain>=0.3.26",
14
  "langchain-chroma>=0.2.4",
requirements.txt CHANGED
@@ -3,6 +3,7 @@ requests
3
  chromadb
4
  ddgs
5
  duckduckgo-search
 
6
  langchain
7
  langchain-chroma
8
  langchain-community
 
3
  chromadb
4
  ddgs
5
  duckduckgo-search
6
+ faiss-cpu
7
  langchain
8
  langchain-chroma
9
  langchain-community
uv.lock CHANGED
@@ -538,6 +538,28 @@ wheels = [
538
  { url = "https://files.pythonhosted.org/packages/b0/0d/9feae160378a3553fa9a339b0e9c1a048e147a4127210e286ef18b730f03/durationpy-0.10-py3-none-any.whl", hash = "sha256:3b41e1b601234296b4fb368338fdcd3e13e0b4fb5b67345948f4f2bf9868b286", size = 3922 },
539
  ]
540
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
541
  [[package]]
542
  name = "fastapi"
543
  version = "0.116.0"
@@ -1098,6 +1120,7 @@ dependencies = [
1098
  { name = "chromadb" },
1099
  { name = "ddgs" },
1100
  { name = "duckduckgo-search" },
 
1101
  { name = "gradio", extra = ["oauth"] },
1102
  { name = "langchain" },
1103
  { name = "langchain-chroma" },
@@ -1115,6 +1138,7 @@ requires-dist = [
1115
  { name = "chromadb", specifier = ">=1.0.15" },
1116
  { name = "ddgs", specifier = ">=9.0.0" },
1117
  { name = "duckduckgo-search", specifier = ">=8.1.1" },
 
1118
  { name = "gradio", extras = ["oauth"], specifier = ">=5.36.2" },
1119
  { name = "langchain", specifier = ">=0.3.26" },
1120
  { name = "langchain-chroma", specifier = ">=0.2.4" },
 
538
  { url = "https://files.pythonhosted.org/packages/b0/0d/9feae160378a3553fa9a339b0e9c1a048e147a4127210e286ef18b730f03/durationpy-0.10-py3-none-any.whl", hash = "sha256:3b41e1b601234296b4fb368338fdcd3e13e0b4fb5b67345948f4f2bf9868b286", size = 3922 },
539
  ]
540
 
541
+ [[package]]
542
+ name = "faiss-cpu"
543
+ version = "1.11.0"
544
+ source = { registry = "https://pypi.org/simple" }
545
+ dependencies = [
546
+ { name = "numpy" },
547
+ { name = "packaging" },
548
+ ]
549
+ sdist = { url = "https://files.pythonhosted.org/packages/e7/9a/e33fc563f007924dd4ec3c5101fe5320298d6c13c158a24a9ed849058569/faiss_cpu-1.11.0.tar.gz", hash = "sha256:44877b896a2b30a61e35ea4970d008e8822545cb340eca4eff223ac7f40a1db9", size = 70218 }
550
+ wheels = [
551
+ { url = "https://files.pythonhosted.org/packages/3b/d3/7178fa07047fd770964a83543329bb5e3fc1447004cfd85186ccf65ec3ee/faiss_cpu-1.11.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:356437b9a46f98c25831cdae70ca484bd6c05065af6256d87f6505005e9135b9", size = 3313807 },
552
+ { url = "https://files.pythonhosted.org/packages/9e/71/25f5f7b70a9f22a3efe19e7288278da460b043a3b60ad98e4e47401ed5aa/faiss_cpu-1.11.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:c4a3d35993e614847f3221c6931529c0bac637a00eff0d55293e1db5cb98c85f", size = 7913537 },
553
+ { url = "https://files.pythonhosted.org/packages/b0/c8/a5cb8466c981ad47750e1d5fda3d4223c82f9da947538749a582b3a2d35c/faiss_cpu-1.11.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:8f9af33e0b8324e8199b93eb70ac4a951df02802a9dcff88e9afc183b11666f0", size = 3785180 },
554
+ { url = "https://files.pythonhosted.org/packages/7f/37/eaf15a7d80e1aad74f56cf737b31b4547a1a664ad3c6e4cfaf90e82454a8/faiss_cpu-1.11.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:48b7e7876829e6bdf7333041800fa3c1753bb0c47e07662e3ef55aca86981430", size = 31287630 },
555
+ { url = "https://files.pythonhosted.org/packages/ff/5c/902a78347e9c47baaf133e47863134e564c39f9afe105795b16ee986b0df/faiss_cpu-1.11.0-cp312-cp312-win_amd64.whl", hash = "sha256:bdc199311266d2be9d299da52361cad981393327b2b8aa55af31a1b75eaaf522", size = 15005398 },
556
+ { url = "https://files.pythonhosted.org/packages/92/90/d2329ce56423cc61f4c20ae6b4db001c6f88f28bf5a7ef7f8bbc246fd485/faiss_cpu-1.11.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:0c98e5feff83b87348e44eac4d578d6f201780dae6f27f08a11d55536a20b3a8", size = 3313807 },
557
+ { url = "https://files.pythonhosted.org/packages/24/14/8af8f996d54e6097a86e6048b1a2c958c52dc985eb4f935027615079939e/faiss_cpu-1.11.0-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:796e90389427b1c1fb06abdb0427bb343b6350f80112a2e6090ac8f176ff7416", size = 7913539 },
558
+ { url = "https://files.pythonhosted.org/packages/b2/2b/437c2f36c3aa3cffe041479fced1c76420d3e92e1f434f1da3be3e6f32b1/faiss_cpu-1.11.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:2b6e355dda72b3050991bc32031b558b8f83a2b3537a2b9e905a84f28585b47e", size = 3785181 },
559
+ { url = "https://files.pythonhosted.org/packages/66/75/955527414371843f558234df66fa0b62c6e86e71e4022b1be9333ac6004c/faiss_cpu-1.11.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:6c482d07194638c169b4422774366e7472877d09181ea86835e782e6304d4185", size = 31287635 },
560
+ { url = "https://files.pythonhosted.org/packages/50/51/35b7a3f47f7859363a367c344ae5d415ea9eda65db0a7d497c7ea2c0b576/faiss_cpu-1.11.0-cp313-cp313-win_amd64.whl", hash = "sha256:13eac45299532b10e911bff1abbb19d1bf5211aa9e72afeade653c3f1e50e042", size = 15005455 },
561
+ ]
562
+
563
  [[package]]
564
  name = "fastapi"
565
  version = "0.116.0"
 
1120
  { name = "chromadb" },
1121
  { name = "ddgs" },
1122
  { name = "duckduckgo-search" },
1123
+ { name = "faiss-cpu" },
1124
  { name = "gradio", extra = ["oauth"] },
1125
  { name = "langchain" },
1126
  { name = "langchain-chroma" },
 
1138
  { name = "chromadb", specifier = ">=1.0.15" },
1139
  { name = "ddgs", specifier = ">=9.0.0" },
1140
  { name = "duckduckgo-search", specifier = ">=8.1.1" },
1141
+ { name = "faiss-cpu", specifier = ">=1.11.0" },
1142
  { name = "gradio", extras = ["oauth"], specifier = ">=5.36.2" },
1143
  { name = "langchain", specifier = ">=0.3.26" },
1144
  { name = "langchain-chroma", specifier = ">=0.2.4" },