Spaces:
Paused
Paused
kit086 committed on
Commit ·
807eac7
1
Parent(s): 108801c
refactor: 调整 vectordb 为 faiss 并更新依赖
Browse files
- .gitignore +1 -0
- agent.py +26 -29
- pyproject.toml +1 -0
- requirements.txt +1 -0
- uv.lock +24 -0
.gitignore
CHANGED
|
@@ -2,3 +2,4 @@
|
|
| 2 |
.env
|
| 3 |
**/__pycache__
|
| 4 |
chroma_db
|
|
|
|
|
|
| 2 |
.env
|
| 3 |
**/__pycache__
|
| 4 |
chroma_db
|
| 5 |
+
faiss_index
|
agent.py
CHANGED
|
@@ -11,7 +11,7 @@ from langchain_community.document_loaders import ArxivLoader
|
|
| 11 |
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
|
| 12 |
from langchain_core.tools import tool
|
| 13 |
from langchain.tools.retriever import create_retriever_tool
|
| 14 |
-
from langchain_community.vectorstores import
|
| 15 |
from langchain_core.documents import Document
|
| 16 |
import shutil
|
| 17 |
import pandas as pd # Ny import för pandas
|
|
@@ -118,24 +118,22 @@ with open("system_prompt.txt", "r", encoding="utf-8") as f:
|
|
| 118 |
system_prompt = f.read()
|
| 119 |
|
| 120 |
# Retrieval
|
| 121 |
-
|
| 122 |
CSV_PATH = "./supabase_docs.csv"
|
| 123 |
-
|
|
|
|
| 124 |
_SIMILARITY_THRESHOLD = 0.2 # lower distance means more similar
|
| 125 |
|
| 126 |
-
embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
|
| 127 |
|
| 128 |
-
if os.path.exists(
|
| 129 |
-
print(f"Loading existing
|
| 130 |
-
vector_store =
|
| 131 |
-
persist_directory=CHROMA_DIR,
|
| 132 |
-
embedding_function=embeddings,
|
| 133 |
-
)
|
| 134 |
else:
|
| 135 |
-
print(f"Creating new
|
| 136 |
-
if os.path.exists(
|
| 137 |
-
shutil.rmtree(
|
| 138 |
-
os.makedirs(
|
| 139 |
|
| 140 |
if not os.path.exists(CSV_PATH):
|
| 141 |
raise FileNotFoundError(f"CSV file {CSV_PATH} does not exist")
|
|
@@ -157,28 +155,27 @@ else:
|
|
| 157 |
|
| 158 |
documents.append(Document(page_content=question_part, metadata=metadata))
|
| 159 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
if not documents:
|
| 161 |
-
print("No documents loaded from CSV.
|
| 162 |
|
| 163 |
-
vector_store =
|
| 164 |
-
|
| 165 |
-
embedding_function=embeddings
|
| 166 |
-
)
|
| 167 |
else:
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
)
|
| 173 |
-
vector_store.persist()
|
| 174 |
-
print(f"ChromaDB initialized and persisted with {len(documents)} documents from CSV.")
|
| 175 |
|
| 176 |
|
| 177 |
# Retriever tool
|
| 178 |
retriever_tool = create_retriever_tool(
|
| 179 |
-
retriever
|
| 180 |
-
name
|
| 181 |
-
description
|
| 182 |
)
|
| 183 |
|
| 184 |
# Agent
|
|
|
|
| 11 |
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
|
| 12 |
from langchain_core.tools import tool
|
| 13 |
from langchain.tools.retriever import create_retriever_tool
|
| 14 |
+
from langchain_community.vectorstores import FAISS
|
| 15 |
from langchain_core.documents import Document
|
| 16 |
import shutil
|
| 17 |
import pandas as pd # Ny import för pandas
|
|
|
|
| 118 |
system_prompt = f.read()
|
| 119 |
|
| 120 |
# Retrieval
|
| 121 |
+
INDEX_DIR = "./faiss_index"
|
| 122 |
CSV_PATH = "./supabase_docs.csv"
|
| 123 |
+
# Use a lightweight 384-dim model to avoid excessive RAM and potential onnxruntime crashes
|
| 124 |
+
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
| 125 |
_SIMILARITY_THRESHOLD = 0.2 # lower distance means more similar
|
| 126 |
|
| 127 |
+
embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL, model_kwargs={"device": "cpu"})
|
| 128 |
|
| 129 |
+
if os.path.exists(INDEX_DIR):
|
| 130 |
+
print(f"Loading existing FAISS index from {INDEX_DIR}")
|
| 131 |
+
vector_store = FAISS.load_local(INDEX_DIR, embeddings, allow_dangerous_deserialization=True)
|
|
|
|
|
|
|
|
|
|
| 132 |
else:
|
| 133 |
+
print(f"Creating new FAISS index at {INDEX_DIR}, and loading documents from {CSV_PATH}")
|
| 134 |
+
if os.path.exists(INDEX_DIR):
|
| 135 |
+
shutil.rmtree(INDEX_DIR)
|
| 136 |
+
os.makedirs(INDEX_DIR)
|
| 137 |
|
| 138 |
if not os.path.exists(CSV_PATH):
|
| 139 |
raise FileNotFoundError(f"CSV file {CSV_PATH} does not exist")
|
|
|
|
| 155 |
|
| 156 |
documents.append(Document(page_content=question_part, metadata=metadata))
|
| 157 |
|
| 158 |
+
# Simple progress indicator every 200 docs
|
| 159 |
+
if (i + 1) % 200 == 0:
|
| 160 |
+
print(f"Prepared {i + 1}/{len(df)} documents for embedding…")
|
| 161 |
+
|
| 162 |
if not documents:
|
| 163 |
+
print("No documents loaded from CSV. FAISS index will be empty.")
|
| 164 |
|
| 165 |
+
vector_store = FAISS.from_documents(documents=[], embedding=embeddings)
|
| 166 |
+
vector_store.save_local(INDEX_DIR)
|
|
|
|
|
|
|
| 167 |
else:
|
| 168 |
+
print("Embedding documents and building FAISS index — this may take a few minutes…")
|
| 169 |
+
vector_store = FAISS.from_documents(documents=documents, embedding=embeddings)
|
| 170 |
+
vector_store.save_local(INDEX_DIR)
|
| 171 |
+
print(f"FAISS index built and saved with {len(documents)} documents from CSV.")
|
|
|
|
|
|
|
|
|
|
| 172 |
|
| 173 |
|
| 174 |
# Retriever tool
|
| 175 |
retriever_tool = create_retriever_tool(
|
| 176 |
+
retriever=vector_store.as_retriever(),
|
| 177 |
+
name="Question_Search",
|
| 178 |
+
description="Retrieve similar questions from FAISS index; metadata includes 'final_answer'."
|
| 179 |
)
|
| 180 |
|
| 181 |
# Agent
|
pyproject.toml
CHANGED
|
@@ -8,6 +8,7 @@ dependencies = [
|
|
| 8 |
"chromadb>=1.0.15",
|
| 9 |
"ddgs>=9.0.0",
|
| 10 |
"duckduckgo-search>=8.1.1",
|
|
|
|
| 11 |
"gradio[oauth]>=5.36.2",
|
| 12 |
"langchain>=0.3.26",
|
| 13 |
"langchain-chroma>=0.2.4",
|
|
|
|
| 8 |
"chromadb>=1.0.15",
|
| 9 |
"ddgs>=9.0.0",
|
| 10 |
"duckduckgo-search>=8.1.1",
|
| 11 |
+
"faiss-cpu>=1.11.0",
|
| 12 |
"gradio[oauth]>=5.36.2",
|
| 13 |
"langchain>=0.3.26",
|
| 14 |
"langchain-chroma>=0.2.4",
|
requirements.txt
CHANGED
|
@@ -3,6 +3,7 @@ requests
|
|
| 3 |
chromadb
|
| 4 |
ddgs
|
| 5 |
duckduckgo-search
|
|
|
|
| 6 |
langchain
|
| 7 |
langchain-chroma
|
| 8 |
langchain-community
|
|
|
|
| 3 |
chromadb
|
| 4 |
ddgs
|
| 5 |
duckduckgo-search
|
| 6 |
+
faiss-cpu
|
| 7 |
langchain
|
| 8 |
langchain-chroma
|
| 9 |
langchain-community
|
uv.lock
CHANGED
|
@@ -538,6 +538,28 @@ wheels = [
|
|
| 538 |
{ url = "https://files.pythonhosted.org/packages/b0/0d/9feae160378a3553fa9a339b0e9c1a048e147a4127210e286ef18b730f03/durationpy-0.10-py3-none-any.whl", hash = "sha256:3b41e1b601234296b4fb368338fdcd3e13e0b4fb5b67345948f4f2bf9868b286", size = 3922 },
|
| 539 |
]
|
| 540 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 541 |
[[package]]
|
| 542 |
name = "fastapi"
|
| 543 |
version = "0.116.0"
|
|
@@ -1098,6 +1120,7 @@ dependencies = [
|
|
| 1098 |
{ name = "chromadb" },
|
| 1099 |
{ name = "ddgs" },
|
| 1100 |
{ name = "duckduckgo-search" },
|
|
|
|
| 1101 |
{ name = "gradio", extra = ["oauth"] },
|
| 1102 |
{ name = "langchain" },
|
| 1103 |
{ name = "langchain-chroma" },
|
|
@@ -1115,6 +1138,7 @@ requires-dist = [
|
|
| 1115 |
{ name = "chromadb", specifier = ">=1.0.15" },
|
| 1116 |
{ name = "ddgs", specifier = ">=9.0.0" },
|
| 1117 |
{ name = "duckduckgo-search", specifier = ">=8.1.1" },
|
|
|
|
| 1118 |
{ name = "gradio", extras = ["oauth"], specifier = ">=5.36.2" },
|
| 1119 |
{ name = "langchain", specifier = ">=0.3.26" },
|
| 1120 |
{ name = "langchain-chroma", specifier = ">=0.2.4" },
|
|
|
|
| 538 |
{ url = "https://files.pythonhosted.org/packages/b0/0d/9feae160378a3553fa9a339b0e9c1a048e147a4127210e286ef18b730f03/durationpy-0.10-py3-none-any.whl", hash = "sha256:3b41e1b601234296b4fb368338fdcd3e13e0b4fb5b67345948f4f2bf9868b286", size = 3922 },
|
| 539 |
]
|
| 540 |
|
| 541 |
+
[[package]]
|
| 542 |
+
name = "faiss-cpu"
|
| 543 |
+
version = "1.11.0"
|
| 544 |
+
source = { registry = "https://pypi.org/simple" }
|
| 545 |
+
dependencies = [
|
| 546 |
+
{ name = "numpy" },
|
| 547 |
+
{ name = "packaging" },
|
| 548 |
+
]
|
| 549 |
+
sdist = { url = "https://files.pythonhosted.org/packages/e7/9a/e33fc563f007924dd4ec3c5101fe5320298d6c13c158a24a9ed849058569/faiss_cpu-1.11.0.tar.gz", hash = "sha256:44877b896a2b30a61e35ea4970d008e8822545cb340eca4eff223ac7f40a1db9", size = 70218 }
|
| 550 |
+
wheels = [
|
| 551 |
+
{ url = "https://files.pythonhosted.org/packages/3b/d3/7178fa07047fd770964a83543329bb5e3fc1447004cfd85186ccf65ec3ee/faiss_cpu-1.11.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:356437b9a46f98c25831cdae70ca484bd6c05065af6256d87f6505005e9135b9", size = 3313807 },
|
| 552 |
+
{ url = "https://files.pythonhosted.org/packages/9e/71/25f5f7b70a9f22a3efe19e7288278da460b043a3b60ad98e4e47401ed5aa/faiss_cpu-1.11.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:c4a3d35993e614847f3221c6931529c0bac637a00eff0d55293e1db5cb98c85f", size = 7913537 },
|
| 553 |
+
{ url = "https://files.pythonhosted.org/packages/b0/c8/a5cb8466c981ad47750e1d5fda3d4223c82f9da947538749a582b3a2d35c/faiss_cpu-1.11.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:8f9af33e0b8324e8199b93eb70ac4a951df02802a9dcff88e9afc183b11666f0", size = 3785180 },
|
| 554 |
+
{ url = "https://files.pythonhosted.org/packages/7f/37/eaf15a7d80e1aad74f56cf737b31b4547a1a664ad3c6e4cfaf90e82454a8/faiss_cpu-1.11.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:48b7e7876829e6bdf7333041800fa3c1753bb0c47e07662e3ef55aca86981430", size = 31287630 },
|
| 555 |
+
{ url = "https://files.pythonhosted.org/packages/ff/5c/902a78347e9c47baaf133e47863134e564c39f9afe105795b16ee986b0df/faiss_cpu-1.11.0-cp312-cp312-win_amd64.whl", hash = "sha256:bdc199311266d2be9d299da52361cad981393327b2b8aa55af31a1b75eaaf522", size = 15005398 },
|
| 556 |
+
{ url = "https://files.pythonhosted.org/packages/92/90/d2329ce56423cc61f4c20ae6b4db001c6f88f28bf5a7ef7f8bbc246fd485/faiss_cpu-1.11.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:0c98e5feff83b87348e44eac4d578d6f201780dae6f27f08a11d55536a20b3a8", size = 3313807 },
|
| 557 |
+
{ url = "https://files.pythonhosted.org/packages/24/14/8af8f996d54e6097a86e6048b1a2c958c52dc985eb4f935027615079939e/faiss_cpu-1.11.0-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:796e90389427b1c1fb06abdb0427bb343b6350f80112a2e6090ac8f176ff7416", size = 7913539 },
|
| 558 |
+
{ url = "https://files.pythonhosted.org/packages/b2/2b/437c2f36c3aa3cffe041479fced1c76420d3e92e1f434f1da3be3e6f32b1/faiss_cpu-1.11.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:2b6e355dda72b3050991bc32031b558b8f83a2b3537a2b9e905a84f28585b47e", size = 3785181 },
|
| 559 |
+
{ url = "https://files.pythonhosted.org/packages/66/75/955527414371843f558234df66fa0b62c6e86e71e4022b1be9333ac6004c/faiss_cpu-1.11.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:6c482d07194638c169b4422774366e7472877d09181ea86835e782e6304d4185", size = 31287635 },
|
| 560 |
+
{ url = "https://files.pythonhosted.org/packages/50/51/35b7a3f47f7859363a367c344ae5d415ea9eda65db0a7d497c7ea2c0b576/faiss_cpu-1.11.0-cp313-cp313-win_amd64.whl", hash = "sha256:13eac45299532b10e911bff1abbb19d1bf5211aa9e72afeade653c3f1e50e042", size = 15005455 },
|
| 561 |
+
]
|
| 562 |
+
|
| 563 |
[[package]]
|
| 564 |
name = "fastapi"
|
| 565 |
version = "0.116.0"
|
|
|
|
| 1120 |
{ name = "chromadb" },
|
| 1121 |
{ name = "ddgs" },
|
| 1122 |
{ name = "duckduckgo-search" },
|
| 1123 |
+
{ name = "faiss-cpu" },
|
| 1124 |
{ name = "gradio", extra = ["oauth"] },
|
| 1125 |
{ name = "langchain" },
|
| 1126 |
{ name = "langchain-chroma" },
|
|
|
|
| 1138 |
{ name = "chromadb", specifier = ">=1.0.15" },
|
| 1139 |
{ name = "ddgs", specifier = ">=9.0.0" },
|
| 1140 |
{ name = "duckduckgo-search", specifier = ">=8.1.1" },
|
| 1141 |
+
{ name = "faiss-cpu", specifier = ">=1.11.0" },
|
| 1142 |
{ name = "gradio", extras = ["oauth"], specifier = ">=5.36.2" },
|
| 1143 |
{ name = "langchain", specifier = ">=0.3.26" },
|
| 1144 |
{ name = "langchain-chroma", specifier = ">=0.2.4" },
|