from whoosh.fields import TEXT, SchemaClass, ID
from jieba.analyse import ChineseAnalyzer
from whoosh.index import create_in
import json
import os

# Use jieba's ChineseAnalyzer so the TEXT fields are tokenized for Chinese.
analyzer = ChineseAnalyzer()

class ArticleSchema(SchemaClass):
    index = ID(stored=True)
    原文 = TEXT(stored=True, analyzer=analyzer)
    注释 = TEXT(stored=True, analyzer=analyzer)
    批判 = TEXT(stored=True, analyzer=analyzer)
    章节 = TEXT(stored=True, analyzer=analyzer)

schema = ArticleSchema()
# create_in() requires the directory to already exist, and it overwrites any
# index previously stored there under the same name.
os.makedirs("indexdir", exist_ok=True)
ix = create_in("indexdir", schema, indexname="article_index")
writer = ix.writer()

# Each chapter in 反孔.json is a list whose first element is the chapter
# title and whose remaining elements are the individual entries.
with open("反孔.json", encoding="utf-8") as json_file:
    chapters = json.load(json_file)
for chapter in chapters:
    for entry in chapter[1:]:
        print(entry["index"])
        writer.add_document(
            index=entry["index"],
            原文=entry["原文"],
            注释=entry.get("注释", ""),
            批判=entry.get("批判", ""),
            章节=chapter[0],
        )
writer.commit()
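# Optional sanity check (a minimal sketch, not part of the original pipeline):
# doc_count() is whoosh's standard Index API and should now equal the number
# of entries read from 反孔.json.
print("indexed documents:", ix.doc_count())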
# init: query side — reopen the index built above and expose search over it.
import gradio as gr
from whoosh.qparser import QueryParser
from whoosh.index import open_dir
import re
from whoosh.query import Term

ix = open_dir("indexdir", indexname="article_index")
searcher = ix.searcher()

def search(query_info):
    query = QueryParser("原文", ix.schema).parse(query_info)
    results = searcher.search(query)
    map_hit = []
    for hit in results:
        批判文本 = hit.get("批判", "")
        # Critique texts cite other entries by number, e.g. "3·2" or "3.2";
        # collect those references so the cited entries can be attached.
        matches = re.findall(r"\d+[·.]\d+", 批判文本)
        record = dict(hit)
        record["extra"] = []
        for index_ref in matches:
            # Normalize "3.2" to the "3·2" form stored in the index field.
            index_ref_normalized = index_ref.replace(".", "·")
            term_query = Term("index", index_ref_normalized)
            related_results = searcher.search(term_query)
            for related_hit in related_results:
                record["extra"].append(dict(related_hit))
        map_hit.append(record)
    return map_hit
def lunyu_search(query):
    """
    Search for critical commentary entries matching a line from the Analects.

    The input query is parsed and matched against the indexed original-text
    field, so partial or inexact lines still retrieve entries. If the
    retrieved commentary mentions numeric index references (e.g. '3·2'),
    the referenced entries are fetched as well.

    Args:
        query (str): A line from the Analects, possibly partial or inexact.

    Returns:
        List[dict]: Result entries. Each contains the stored fields of the
        hit plus a list of referenced entries under the key "extra".
    """
    return search(query_info=query)
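# Example result shape (illustrative values only; the actual fields come
# from 反孔.json):
# >>> lunyu_search("季氏旅于泰山")
# [{"index": "3·6", "原文": "季氏旅于泰山。…", "注释": "…", "批判": "…",
#   "章节": "…", "extra": [...]}]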
demo = gr.Interface(
    fn=lunyu_search,
    inputs=gr.Textbox(label="输入部分原文句子"),  # "enter part of an original sentence"
    outputs=gr.JSON(label="查询结果"),  # "search results"
    title="论语批判MCP",  # "Analects critique MCP"
    description="输入模糊的论语原文,可以查询到对应的批判内容。",  # fuzzy Analects text in, matching critique out
    examples=[
        ["季氏旅于泰山。"],
        ["子曰:学而时习之,不亦说乎?"],
        ["有朋自远方来,不亦乐乎?"],
        ["三人行,必有我师焉。"],
    ],
)
if __name__ == "__main__":
    # Quick smoke test before serving.
    res = search("季氏旅于泰山。")
    print(res)
    # mcp_server=True exposes the interface as an MCP tool endpoint
    # (requires Gradio's MCP support, e.g. `pip install "gradio[mcp]"`).
    demo.launch(mcp_server=True)