Spaces:
Sleeping
Sleeping
File size: 4,301 Bytes
57ecbdf ddbc7f6 da335e9 65cd580 da335e9 65cd580 da335e9 65cd580 da335e9 65cd580 da335e9 57ecbdf ddbc7f6 57ecbdf 8c639d1 ddbc7f6 c57522e ddbc7f6 4b5811c c57522e ddbc7f6 da335e9 65cd580 da335e9 65cd580 da335e9 65cd580 d798d10 da335e9 65cd580 8c639d1 6cab0b1 da335e9 985d556 d798d10 da335e9 65cd580 6cab0b1 a4293d9 8c639d1 985d556 6cab0b1 65cd580 985d556 da335e9 6cab0b1 57ecbdf 65cd580 57ecbdf ddbc7f6 4b5811c ddbc7f6 da335e9 ddbc7f6 65cd580 da335e9 6cab0b1 57ecbdf 65cd580 57ecbdf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import gradio as gr
# import llama_cpp
import base64
from Crypto.Cipher import AES
from Crypto.Util.Padding import unpad
def decrypt_file(input_path, key):
    """Decrypt a base64-encoded, AES-256-CBC encrypted file and return its text.

    Expected file layout: base64( IV[16 bytes] || ciphertext ). The passphrase
    is normalized to exactly 32 bytes by right-padding with "0" and truncating.

    Args:
        input_path (str): Path to the encrypted file on disk.
        key (str): Passphrase used to derive the 32-byte AES key.

    Returns:
        str: The decrypted plaintext, decoded as UTF-8.
    """
    with open(input_path, "rb") as handle:
        blob = base64.b64decode(handle.read())
    # Normalize the passphrase into a fixed-width 32-byte key (AES-256).
    aes_key = key.ljust(32, "0")[:32].encode("utf-8")
    iv, ciphertext = blob[:16], blob[16:]
    decryptor = AES.new(aes_key, AES.MODE_CBC, iv)
    return unpad(decryptor.decrypt(ciphertext), AES.block_size).decode("utf-8")
# The local GGUF embedding model below was used offline to precompute the
# vectors now loaded from embeddings.json; kept commented out for reference.
# llm = llama_cpp.Llama.from_pretrained(
#     repo_id="mradermacher/bge-large-zh-v1.5-GGUF",
#     filename="bge-large-zh-v1.5.Q4_K_M.gguf",
#     embedding=True,
# )
# embedding_1 = llm.create_embedding("Hello, world!")
# embedding_2 = llm.create_embedding("你好, 世界!") # type(embedding_1['data'][0]['embedding']) list
from openai import OpenAI
import os

# OpenAI-compatible client pointed at Alibaba Cloud DashScope (Bailian),
# used only to embed user queries at search time.
client_oai = OpenAI(
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"  # Bailian OpenAI-compatible base_url
)
from pymilvus import MilvusClient

# File-backed Milvus Lite database; dimension 1024 matches text-embedding-v3.
# NOTE(review): create_collection is called unconditionally on every start —
# confirm it is a no-op (or acceptable) when ./books.db already holds collection_1.
client = MilvusClient("./books.db")
client.create_collection(collection_name="collection_1", dimension=1024)
import os, json  # NOTE(review): `os` is already imported above; redundant but harmless

# Decrypt the source corpus using an AES passphrase from the environment.
# NOTE(review): if the "aeskey" env var is unset, decrypt_file receives None
# and fails with an AttributeError — confirm the Space always defines it.
aeskey = os.getenv("aeskey")
decrypted_content = decrypt_file("encrypted.txt", aeskey)
raw_jsons = json.loads(decrypted_content)
# Precomputed embeddings: one list of vectors per chapter, parallel to raw_jsons.
with open("embeddings.json", mode="r") as embedding_file:
    all_embs = json.load(embedding_file)
# --- One-time ingestion: insert every chapter's passages into Milvus. ---
for vhjx_index, vhjx_item in enumerate(raw_jsons):
    # Each chapter record is [chapter_title, passage_dict, passage_dict, ...]
    # — presumably; verify against the encrypted JSON's actual schema.
    chapter = vhjx_item[0]
    docs = []
    metas = []
    for jvvi_item in vhjx_item[1:]:
        content = jvvi_item["原文"]  # "original text" field
        docs.append(content)
        metas.append(
            {
                "index": jvvi_item["index"],
                "text": content,
                "annotation": jvvi_item.get("注释", ""),  # "annotation" (optional)
                "critique": jvvi_item.get("批判", ""),  # "critique" (optional)
                "chapter": chapter,
            }
        )
    # One chapter per iteration.
    # Embeddings were generated in batch offline (each one is a list[float]);
    # the live model call is kept commented out for reference.
    # emb_result = llm.create_embedding(docs)
    embeddings = all_embs[vhjx_index]  # List[List[float]]
    print(len(embeddings))
    # Assemble Milvus rows.
    # NOTE(review): assumes len(embeddings) == len(metas) for every chapter —
    # an IndexError here means embeddings.json is out of sync with the corpus.
    milvus_data = []
    for i, emb in enumerate(embeddings):
        item = metas[i]
        milvus_data.append(
            {
                # NOTE(review): id scheme assumes < 100 passages per chapter;
                # chapters with more would produce colliding primary keys — confirm.
                "id": vhjx_index * 100 + i,
                "index": item["index"],
                "vector": emb,
                "text": item["text"],
                "annotation": item["annotation"],
                "critique": item["critique"],
                "chapter": item["chapter"],
            }
        )
    print(f"✅ 共 {len(milvus_data)} 条数据")
    # Insert this chapter's rows.
    client.insert(collection_name="collection_1", data=milvus_data)
    print(f"✅ 插入完成:共 {len(milvus_data)} 条数据")
def greet(name):
    """
    Vector-search the Analects critique collection for a query sentence.

    Embeds the input text with DashScope's text-embedding-v3 model (1024
    dimensions) and runs an ANN search against Milvus collection
    "collection_1", returning the top-5 closest passages with their
    annotation and critique fields.

    Args:
        name (str): The input text (a line from the Analects, possibly fuzzy
            or partial) to search for.

    Returns:
        Milvus search results: a list of hit lists, each hit carrying the
        output fields "index", "text", "annotation" and "critique".
    """
    # Previous local-model path, kept for reference:
    # embeddings = llm.create_embedding(name)
    completion = client_oai.embeddings.create(
        model="text-embedding-v3",
        input=name,
        dimensions=1024,  # only text-embedding-v3 supports the dimensions arg
        encoding_format="float"
    )
    res = client.search(
        collection_name="collection_1",
        # data=[embeddings["data"][0]["embedding"]],
        data=[completion.data[0].embedding],
        limit=5,
        output_fields=["index", "text", "annotation", "critique"],
    )
    return res
# Gradio UI: one text box in, JSON search results out; launched as an MCP
# server so the search is also callable as a tool.
query_box = gr.Textbox(label="输入部分原文句子")
result_view = gr.JSON(label="查询结果")
demo = gr.Interface(
    fn=greet,
    inputs=query_box,
    outputs=result_view,
    title="论语批判MCP (Embedding版本)",
    description="输入模糊的论语原文,可以向量检索到对应的批判内容。",
)
demo.launch(mcp_server=True)
|