# BookSearch / app.py — HuggingFace Space (author: shigureui, commit 4b5811c "fix")
import gradio as gr
# import llama_cpp
import base64
from Crypto.Cipher import AES
from Crypto.Util.Padding import unpad
def decrypt_file(input_path, key):
    """Decrypt a base64-encoded, AES-CBC-encrypted file and return its text.

    Expected file layout: base64( IV[16 bytes] || ciphertext ). The key
    string is right-padded with "0" (or truncated) to exactly 32 characters,
    then UTF-8 encoded, yielding an AES-256 key.

    Args:
        input_path (str): Path to the encrypted file on disk.
        key (str): Passphrase; normalized to a 32-byte AES key.

    Returns:
        str: The decrypted plaintext, decoded as UTF-8.
    """
    with open(input_path, "rb") as encrypted_file:
        raw = base64.b64decode(encrypted_file.read())
    aes_key = key.ljust(32, "0")[:32].encode("utf-8")
    iv, body = raw[:16], raw[16:]
    decryptor = AES.new(aes_key, AES.MODE_CBC, iv)
    return unpad(decryptor.decrypt(body), AES.block_size).decode("utf-8")
# llm = llama_cpp.Llama.from_pretrained(
# repo_id="mradermacher/bge-large-zh-v1.5-GGUF",
# filename="bge-large-zh-v1.5.Q4_K_M.gguf",
# embedding=True,
# )
# embedding_1 = llm.create_embedding("Hello, world!")
# embedding_2 = llm.create_embedding("你好, 世界!") # type(embedding_1['data'][0]['embedding']) list
from openai import OpenAI
import os
import json

# DashScope (Aliyun Bailian) exposes an OpenAI-compatible endpoint; used for
# query-time embeddings in greet().
client_oai = OpenAI(
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"  # Bailian service base_url
)
from pymilvus import MilvusClient

# Local file-backed Milvus Lite instance; dimension=1024 matches the
# text-embedding-v3 vectors requested in greet() and stored below.
client = MilvusClient("./books.db")
client.create_collection(collection_name="collection_1", dimension=1024)

# The source corpus ships AES-encrypted; the key comes from the environment.
# NOTE(review): if the "aeskey" env var is unset, decrypt_file will fail on a
# None key — confirm the Space always defines it.
aeskey = os.getenv("aeskey")
decrypted_content = decrypt_file("encrypted.txt", aeskey)
raw_jsons = json.loads(decrypted_content)

# Precomputed document embeddings (one list of vectors per chapter), so the
# embedding model is not called at startup. Explicit UTF-8 avoids depending
# on the platform's locale encoding.
with open("embeddings.json", mode="r", encoding="utf-8") as embedding_file:
    all_embs = json.load(embedding_file)
# Ingest the decrypted corpus into Milvus, one chapter per iteration.
# raw_jsons: list of chapters; each chapter is [chapter_title, entry, ...]
# where each entry is a dict with keys "index", "原文" (original text), and
# optionally "注释" (annotation) / "批判" (critique).
for vhjx_index, vhjx_item in enumerate(raw_jsons):
    chapter = vhjx_item[0]  # first element is the chapter title
    docs = []   # original-text strings for this chapter
    metas = []  # per-entry metadata, aligned 1:1 with docs
    for jvvi_item in vhjx_item[1:]:
        content = jvvi_item["原文"]
        docs.append(content)
        metas.append(
            {
                "index": jvvi_item["index"],
                "text": content,
                "annotation": jvvi_item.get("注释", ""),
                "critique": jvvi_item.get("批判", ""),
                "chapter": chapter,
            }
        )
    # One chapter at a time.
    # Embeddings were generated offline in bulk (each one is a list[float]).
    # emb_result = llm.create_embedding(docs)
    embeddings = all_embs[vhjx_index]  # List[List[float]]
    print(len(embeddings))
    # Build the Milvus rows for this chapter.
    milvus_data = []
    for i, emb in enumerate(embeddings):
        item = metas[i]
        milvus_data.append(
            {
                # NOTE(review): this id scheme assumes < 100 entries per
                # chapter; ids would collide otherwise — confirm against data.
                "id": vhjx_index * 100 + i,
                "index": item["index"],
                "vector": emb,
                "text": item["text"],
                "annotation": item["annotation"],
                "critique": item["critique"],
                "chapter": item["chapter"],
            }
        )
    print(f"✅ 共 {len(milvus_data)} 条数据")
    # Insert this chapter's rows into the collection.
    client.insert(collection_name="collection_1", data=milvus_data)
    print(f"✅ 插入完成:共 {len(milvus_data)} 条数据")
def greet(name):
    """
    Search for relevant critical commentary entries based on an input query from the Analects.

    The query is embedded with DashScope's text-embedding-v3 model (1024
    dimensions, matching the collection) and used for a vector similarity
    search against the Milvus collection built at startup.

    Args:
        name (str): The input text (a line from the Analects, possibly fuzzy
            or partial) to search for.

    Returns:
        The Milvus search result for the top 5 nearest entries; each hit
        carries the "index", "text", "annotation" and "critique" fields.
    """
    # Embed the query with the same model/dimension used for the corpus.
    completion = client_oai.embeddings.create(
        model="text-embedding-v3",
        input=name,
        dimensions=1024,  # only text-embedding-v3 supports this parameter
        encoding_format="float"
    )
    # Top-5 nearest neighbours by vector similarity.
    res = client.search(
        collection_name="collection_1",
        data=[completion.data[0].embedding],
        limit=5,
        output_fields=["index", "text", "annotation", "critique"],
    )
    return res
# Gradio UI: one textbox in, JSON search results out. launch(mcp_server=True)
# also serves the app as an MCP server so greet() can be called as a tool.
demo = gr.Interface(
    fn=greet,
    inputs=gr.Textbox(label="输入部分原文句子"),
    outputs=gr.JSON(label="查询结果"),
    title="论语批判MCP (Embedding版本)",
    description="输入模糊的论语原文,可以向量检索到对应的批判内容。",
)
demo.launch(mcp_server=True)