import gradio as gr
# import llama_cpp
import base64
from Crypto.Cipher import AES
from Crypto.Util.Padding import unpad
def decrypt_file(input_path, key):
    # Read the encrypted file and undo the base64 encoding
    with open(input_path, "rb") as f:
        encrypted_data = base64.b64decode(f.read())
    # Pad/truncate the key to 32 bytes (AES-256)
    key = key.ljust(32, "0")[:32].encode("utf-8")
    # The first 16 bytes are the IV; the rest is the ciphertext
    iv = encrypted_data[:16]
    ciphertext = encrypted_data[16:]
    cipher = AES.new(key, AES.MODE_CBC, iv)
    plaintext = unpad(cipher.decrypt(ciphertext), AES.block_size)
    return plaintext.decode("utf-8")
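
# For reference: a minimal sketch of the encryption step assumed to have produced
# encrypted.txt (AES-256-CBC with a random IV prepended, PKCS#7 padding, then base64).
# encrypt_file is illustrative only and is not called by this app.
from Crypto.Random import get_random_bytes
from Crypto.Util.Padding import pad

def encrypt_file(input_path, output_path, key):
    key = key.ljust(32, "0")[:32].encode("utf-8")
    iv = get_random_bytes(16)
    cipher = AES.new(key, AES.MODE_CBC, iv)
    with open(input_path, "rb") as f:
        ciphertext = cipher.encrypt(pad(f.read(), AES.block_size))
    with open(output_path, "wb") as f:
        f.write(base64.b64encode(iv + ciphertext))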
# llm = llama_cpp.Llama.from_pretrained(
#     repo_id="mradermacher/bge-large-zh-v1.5-GGUF",
#     filename="bge-large-zh-v1.5.Q4_K_M.gguf",
#     embedding=True,
# )
# embedding_1 = llm.create_embedding("Hello, world!")
# embedding_2 = llm.create_embedding("你好, 世界!")  # type(embedding_1['data'][0]['embedding']) is list
from openai import OpenAI
import os

client_oai = OpenAI(
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",  # base_url of the DashScope (Bailian) OpenAI-compatible service
)
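
# Milvus Lite: a local file path makes MilvusClient run an embedded, file-backed instance.
# create_collection with just a dimension uses the quick-setup defaults (an "id" primary key,
# a "vector" field, and dynamic fields for the extra keys inserted below); dimension=1024
# matches the text-embedding-v3 vectors used in this app.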
from pymilvus import MilvusClient

client = MilvusClient("./books.db")
client.create_collection(collection_name="collection_1", dimension=1024)
import os, json

aeskey = os.getenv("aeskey")
decrypted_content = decrypt_file("encrypted.txt", aeskey)
raw_jsons = json.loads(decrypted_content)
with open("embeddings.json", mode="r") as embedding_file:
    all_embs = json.load(embedding_file)
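
# Assumed layout of the decrypted JSON, inferred from the loop below:
# raw_jsons is a list of chapters; each chapter is a list whose first element is the
# chapter title and whose remaining elements are dicts with "index", "原文" (original text),
# and optionally "注释" (annotation) and "批判" (critique).
# embeddings.json holds a parallel list: for each chapter, one 1024-dim vector per entry,
# in the same order.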
for chapter_idx, chapter_data in enumerate(raw_jsons):
    chapter = chapter_data[0]
    docs = []
    metas = []
    for entry in chapter_data[1:]:
        content = entry["原文"]
        docs.append(content)
        metas.append(
            {
                "index": entry["index"],
                "text": content,
                "annotation": entry.get("注释", ""),
                "critique": entry.get("批判", ""),
                "chapter": chapter,
            }
        )
    # One chapter per iteration; embeddings were generated in bulk beforehand
    # (each one is a list[float]).
    # emb_result = llm.create_embedding(docs)
    embeddings = all_embs[chapter_idx]  # List[List[float]]
    print(len(embeddings))
    # Assemble the rows for Milvus
    milvus_data = []
    for i, emb in enumerate(embeddings):
        item = metas[i]
        milvus_data.append(
            {
                # Assumes fewer than 100 entries per chapter, so the ids stay unique
                "id": chapter_idx * 100 + i,
                "index": item["index"],
                "vector": emb,
                "text": item["text"],
                "annotation": item["annotation"],
                "critique": item["critique"],
                "chapter": item["chapter"],
            }
        )
    print(f"✅ Prepared {len(milvus_data)} rows")
    # Insert the rows into Milvus
    client.insert(collection_name="collection_1", data=milvus_data)
    print(f"✅ Insert complete: {len(milvus_data)} rows")
def greet(name):
    """
    Search for critical commentary entries related to a query from the Analects.

    The query is embedded with DashScope's text-embedding-v3 model and used for a
    vector search over the indexed original-text passages, returning the matching
    entries together with their annotations and critiques.

    Args:
        name (str): The input text (a line from the Analects, possibly fuzzy or partial) to search for.

    Returns:
        The Milvus search result: a list containing one inner list of hits, each hit
        carrying the matched entry's "index", "text", "annotation", and "critique" fields.
    """
    # embeddings = llm.create_embedding(name)
    completion = client_oai.embeddings.create(
        model="text-embedding-v3",
        input=name,
        dimensions=1024,  # only text-embedding-v3 supports the dimensions parameter
        encoding_format="float",
    )
    res = client.search(
        collection_name="collection_1",
        # data=[embeddings["data"][0]["embedding"]],
        data=[completion.data[0].embedding],
        limit=5,
        output_fields=["index", "text", "annotation", "critique"],
    )
    return res
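
# Hypothetical example of the returned shape (values invented for illustration);
# MilvusClient.search returns one list of hits per query vector:
# greet("学而时习之")
# -> [[{"id": 3, "distance": 0.82,
#       "entity": {"index": 3, "text": "...", "annotation": "...", "critique": "..."}},
#      ...]]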
demo = gr.Interface(
    fn=greet,
    inputs=gr.Textbox(label="输入部分原文句子"),  # "Enter part of an original sentence"
    outputs=gr.JSON(label="查询结果"),  # "Search result"
    title="论语批判MCP (Embedding版本)",  # "Analects critique MCP (embedding version)"
    description="输入模糊的论语原文,可以向量检索到对应的批判内容。",  # "Enter an approximate Analects passage; vector search returns the matching critiques."
)
# mcp_server=True exposes greet as an MCP tool in addition to the web UI
demo.launch(mcp_server=True)