Spaces:

svjack
/

Harry-Potter-Knowledge-Question-Answer-in-Chinese

Runtime error

App Files Files Community

svjack committed on Dec 4, 2022

Commit

03c1af1

1 Parent(s): b2f024b

Upload 5 files

Browse files

Files changed (5) hide show

extract_by_api.py +17 -0
extract_et_by_api.py +14 -0
qa.py +138 -0
translate_by_api.py +14 -0
wiki_kb_qa_migrate.py +1012 -0

extract_by_api.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import requests
+def call_en_zh_reader(English_Span, Chinese_Sentence):
+    """Query the hosted span-extraction Space with an English span and a Chinese sentence.
+
+    Both arguments must be ``str`` (enforced with ``assert``). They are POSTed,
+    in this order, as the ``data`` list of a JSON body to ``/run/predict``.
+
+    Returns the first element of the reply's ``"data"`` field, or the (falsy)
+    ``"data"`` value itself when it is empty.
+    """
+    assert type(English_Span) == type("")
+    assert type(Chinese_Sentence) == type("")
+    payload = {"data": [English_Span, Chinese_Sentence]}
+    reply = requests.post(
+        "https://svjack-extract-similar-chinese-span-by--5daeb83.hf.space/run/predict",
+        json=payload,
+    ).json()
+    data = reply["data"]
+    return data[0] if data else data

extract_et_by_api.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import requests
+def call_entity_property_extract(zh_question):
+    """Send a Chinese question to the hosted entity/property extractor Space.
+
+    The question is POSTed as the single item of the JSON ``data`` list.
+
+    Returns the first element of the reply's ``"data"`` field, or the (falsy)
+    ``"data"`` value itself when it is empty.
+    """
+    reply = requests.post(
+        "https://svjack-entity-property-extractor-zh.hf.space/run/predict",
+        json={"data": [zh_question]},
+    ).json()
+    data = reply["data"]
+    return data[0] if data else data

qa.py ADDED Viewed

	@@ -0,0 +1,138 @@

+#from conf import *
+import os
+import sys
+import re
+from rapidfuzz import fuzz
+import requests
+import json
+#assert os.path.exists(flair_ner_model_path)
+#loaded_model: SequenceTagger = SequenceTagger.load(os.path.join(flair_ner_model_path ,"best-model.pt"))
+'''
+def one_item_process(r, loaded_model):
+    #assert type(r) == type(pd.Series())
+    zh = r["question"]
+    zh = zh.replace(" ", "").strip()
+    sentence = Sentence(" ".join(list(zh)))
+    loaded_model.predict(sentence)
+    sentence_str = str(sentence)
+    ask_spans = re.findall(r'\["(.+?)"/ASK\]', sentence_str)
+    sentence = re.findall(r'Sentence: "(.+?)"', sentence_str)
+    if ask_spans:
+        ask_spans = ask_spans[0]
+    else:
+        ask_spans = ""
+    if sentence:
+        sentence = sentence[0]
+    else:
+        sentence = ""
+    ask_spans, sentence = map(lambda x: x.replace(" ", "").strip(), [ask_spans, sentence])
+    return ask_spans, sentence
+'''
+def one_item_process_by_request(r):
+    """Fetch the question words of ``r["question"]`` from the hosted extractor Space.
+
+    Spaces are removed from the question before it is sent.
+
+    Returns ``(Question_words, zh)``: the ``"Question words"`` entry of the first
+    reply item (``""`` when the reply's ``"data"`` is empty) and the cleaned question.
+    """
+    zh = r["question"].replace(" ", "").strip()
+    reply = requests.post(
+        "https://svjack-question-words-extractor-zh.hf.space/run/predict",
+        json={"data": [zh]},
+    ).json()
+    data = reply["data"]
+    Question_words = data[0]["Question words"] if data else ""
+    return Question_words, zh
+def retrieve_sent_split(sent,
+                       stops_split_pattern = "|".join("\\" + ch for ch in ",." + "，。" + ":？? ")
+                       ):
+    """Split ``sent`` on ``stops_split_pattern`` (a regex alternation of punctuation and space).
+
+    Returns ``[]`` for a blank ``sent``, otherwise the raw ``re.split`` result
+    (empty strings between adjacent separators are kept).
+    """
+    if sent.strip():
+        return re.split(stops_split_pattern, sent)
+    return []
+def find_min_text_contain_entity_span(sent, entity_str, statement):
+    """Pick the chunk of ``sent`` that contains ``entity_str`` and best matches ``statement``.
+
+    ``sent`` is cut with ``retrieve_sent_split``; chunks containing ``entity_str``
+    are scored against ``statement`` with ``fuzz.ratio``. Returns the highest
+    scoring chunk, or ``sent`` unchanged when no chunk contains the entity.
+    """
+    candidates = [chip for chip in retrieve_sent_split(sent) if entity_str in chip]
+    if not candidates:
+        return sent
+    scored = [(chip, fuzz.ratio(chip, statement)) for chip in candidates]
+    # max() keeps the first of several equal scores, like a stable descending sort
+    return max(scored, key = lambda pair: pair[1])[0]
+def to_statement(r):
+    """Turn a question into a statement and pick the best supporting chunk of the context.
+
+    ``r`` must provide ``"entity"``, ``"question"``, ``"head"`` (the question
+    words) and ``"context"``. The statement is the question with ``head``
+    replaced by ``entity`` and ASCII / full-width question marks removed.
+
+    Returns ``(statement, top_chip)`` where ``top_chip`` comes from
+    ``find_min_text_contain_entity_span(context, entity, statement)``.
+    """
+    entity = r["entity"]
+    question = r["question"]
+    head = r["head"]
+    context = r["context"]
+    # str.replace("", entity) would insert entity between every character,
+    # so leave the question untouched when no question words were found
+    statement = question.replace(head, entity) if head else question
+    statement = statement.replace("?", "").replace("？", "")
+    top_chip = find_min_text_contain_entity_span(context, entity, statement)
+    return statement, top_chip
+'''
+r = {'entity': '1901年',
+ 'question': '荷兰国会何时通过伦理政策?',
+ 'title': '爪哇岛',
+ 'context': '伊斯兰教被接受的同时，其教义也被融入了当地人长久以来的一些信仰，所以爪哇岛的伊斯兰教带有明显的本地特色  “荷兰东印度公司”在巴达维亚（今天的雅加达）建立了“贸易和行政管理总部” 在殖民统治时期，荷兰人将注意力集中在雅加达和其他一些海滨城市，例如三宝垄和泗水 荷兰殖民者还通过一些归顺的本土势力，间接对这个多山的岛屿进行统治，例如爪哇岛中部的马打兰王国  19世纪，荷兰政府从荷兰东印度公司手上接管了东印度群岛，1830年荷兰统治者开始实行所谓“耕种制”（荷兰语cultuurstelsel en cultuurprocenten）的变相奴役制度，导致了大范围的饥荒和贫困 随即发生了各种政治和社会反抗运动，其中一位名叫Multatuli的荷兰作家写了一本名叫《Max Havelaar》的小说，以抗议当时的社会状况 迫于各种反抗运动此起彼伏，1901年荷兰国会通过伦理政策（Etnisch beleid），客观上使一部分爪哇人接触到荷兰式教育，在这些人中，出现了很多杰出的印尼民族主义者，并且在二战后的印尼独立运动中起到了重要作用'}
+qa_downstream_process(
+    r["entity"],
+    r["question"],
+    r["context"],
+    loaded_model
+)
+{'entity': '1901年',
+ 'question': '荷兰国会何时通过伦理政策?',
+ 'context': '伊斯兰教被接受的同时，其教义也被融入了当地人长久以来的一些信仰，所以爪哇岛的伊斯兰教带有明显的本地特色  “荷兰东印度公司”在巴达维亚（今天的雅加达）建立了“贸易和行政管理总部” 在殖民统治时期，荷兰人将注意力集中在雅加达和其他一些海滨城市，例如三宝垄和泗水 荷兰殖民者还通过一些归顺的本土势力，间接对这个多山的岛屿进行统治，例如爪哇岛中部的马打兰王国  19世纪，荷兰政府从荷兰东印度公司手上接管了东印度群岛，1830年荷兰统治者开始实行所谓“耕种制”（荷兰语cultuurstelsel en cultuurprocenten）的变相奴役制度，导致了大范围的饥荒和贫困 随即发生了各种政治和社会反抗运动，其中一位名叫Multatuli的荷兰作家写了一本名叫《Max Havelaar》的小说，以抗议当时的社会状况 迫于各种反抗运动此起彼伏，1901年荷兰国会通过伦理政策（Etnisch beleid），客观上使一部分爪哇人接触到荷兰式教育，在这些人中，出现了很多杰出的印尼民族主义者，并且在二战后的印尼独立运动中起到了重要作用',
+ 'head': '何时',
+ 'statement': '荷兰国会1901年通过伦理政策',
+ 'top_chip': '1901年荷兰国会通过伦理政策（Etnisch'}
+'''
+#def qa_downstream_process(entity, question, context, loaded_model = loaded_model):
+def qa_downstream_process(entity, question, context):
+    """Build the downstream QA record for an answer ``entity`` found in ``context``.
+
+    Returns ``None`` when ``entity`` does not occur in ``context``. Otherwise
+    returns a dict with the keys ``entity``, ``question``, ``context``, ``head``
+    (from ``one_item_process_by_request``), ``statement`` and ``top_chip``
+    (both from ``to_statement``).
+    """
+    if entity not in context:
+        return None
+    record = {"entity": entity, "question": question, "context": context}
+    head, _cleaned_question = one_item_process_by_request(record)
+    record["head"] = head
+    record["statement"], record["top_chip"] = to_statement(record)
+    return record
+'''
+@csrf_exempt
+def qa_downstream_process_part(request):
+    assert request.method == "POST"
+    post_data = request.POST
+    entity = post_data["entity"]
+    question = post_data["question"]
+    context = post_data["context"]
+    output = qa_downstream_process(entity, question, context)
+    if output is None:
+        return HttpResponse(json.dumps(
+            {"output": "No Answer"}
+        ))
+    assert type(output) == type({})
+    req_str = json.dumps(output)
+    return HttpResponse(
+        req_str
+    )
+'''
+if __name__ == "__main__":
+    pass

translate_by_api.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import requests
+def call_zh_en_naive_model(zh_question):
+    """Translate a Chinese question to English through the hosted translation Space.
+
+    Returns the ``"English Question"`` entry of the first reply item, or ``""``
+    when the reply's ``"data"`` is empty.
+    """
+    reply = requests.post(
+        "https://svjack-translate-chinese-to-english.hf.space/run/predict",
+        json={"data": [zh_question]},
+    ).json()
+    data = reply["data"]
+    if not data:
+        return ""
+    return data[0]["English Question"]

wiki_kb_qa_migrate.py ADDED Viewed

	@@ -0,0 +1,1012 @@

+#### qa_env
+#from conf import *
+from qa import *
+from translate_by_api import *
+from extract_by_api import *
+from extract_et_by_api import *
+import os
+import logging
+import subprocess
+import time
+from pathlib import Path
+from haystack.nodes import Text2SparqlRetriever
+from haystack.document_stores import GraphDBKnowledgeGraph, InMemoryKnowledgeGraph
+#from haystack.utils import fetch_archive_from_http
+import pandas as pd
+import numpy as np
+import os
+import sys
+#import jieba
+from functools import partial, reduce, lru_cache
+#from easynmt import EasyNMT
+#from sentence_transformers.util import pytorch_cos_sim
+#from sentence_transformers import SentenceTransformer
+from time import time
+from itertools import product
+#import pickle as pkl
+from urllib.parse import unquote
+import requests
+import json
+import pandas as pd
+import numpy as np
+import os
+import sys
+#import jieba
+from functools import partial, reduce, lru_cache
+#from easynmt import EasyNMT
+#from sentence_transformers.util import pytorch_cos_sim
+#from sentence_transformers import SentenceTransformer
+from time import time
+from itertools import product
+#import pickle as pkl
+#import faiss
+from rapidfuzz import fuzz
+import synonyms
+import sys
+#sys.path.insert(0 ,"/Users/svjack/temp/HP_kbqa/script")
+#from trans_toolkit import *
+#from easynmt import EasyNMT
+#zh_en_naive_model = EasyNMT("m2m_100_418M")
+'''
+p00 = os.path.join(model_path, "zh_en_m2m")
+assert os.path.exists(p00)
+zh_en_naive_model = EasyNMT(p00)
+zh_en_naive_model.translate(["宁波在哪?"], source_lang="zh", target_lang = "en")
+'''
+'''
+from haystack.nodes import FARMReader
+#question_reader_save_path = "/Users/svjack/temp/model/en_zh_question_reader_save_epc_2_spo"
+question_reader_save_path = os.path.join(model_path, "en_zh_question_reader_save_epc_2_spo")
+assert os.path.exists(question_reader_save_path)
+en_zh_reader = FARMReader(model_name_or_path=question_reader_save_path, use_gpu=False,
+    num_processes = 0
+)
+'''
+# Module-level setup: in-memory knowledge graph used by every query below.
+kg = InMemoryKnowledgeGraph(index="tutorial_10_index")
+# delete_index() followed by create_index() — presumably to start from an empty index; confirm against Haystack docs
+kg.delete_index()
+kg.create_index()
+# Load the triples from data/triples.ttl (path is relative to the working directory)
+kg.import_from_ttl_file(index="tutorial_10_index", path=Path("data") / "triples.ttl")
+#kg.get_params()
+#all_triples = kg.get_all_triples()
+#spo_df = pd.DataFrame(all_triples)
+#### some collection in kb_aug
+import re
+def transform_namespace_to_prefix_str(g):
+    """Render ``g.namespaces()`` as SPARQL ``PREFIX name: <url>`` lines joined by newlines.
+
+    Each item yielded by ``g.namespaces()`` is read as ``(name, url)``.
+    """
+    prefix_lines = ["PREFIX {}: <{}>".format(ns[0], ns[1]) for ns in g.namespaces()]
+    return "\n".join(prefix_lines)
+#print(transform_namespace_to_prefix_str(kg.indexes["tutorial_10_index"]))
+### ->
+wiki_prefix = '''
+PREFIX brick: <https://brickschema.org/schema/Brick#>
+PREFIX csvw: <http://www.w3.org/ns/csvw#>
+PREFIX dc: <http://purl.org/dc/elements/1.1/>
+PREFIX dcat: <http://www.w3.org/ns/dcat#>
+PREFIX dcmitype: <http://purl.org/dc/dcmitype/>
+PREFIX dcterms: <http://purl.org/dc/terms/>
+PREFIX dcam: <http://purl.org/dc/dcam/>
+PREFIX doap: <http://usefulinc.com/ns/doap#>
+PREFIX foaf: <http://xmlns.com/foaf/0.1/>
+PREFIX odrl: <http://www.w3.org/ns/odrl/2/>
+PREFIX org: <http://www.w3.org/ns/org#>
+PREFIX owl: <http://www.w3.org/2002/07/owl#>
+PREFIX prof: <http://www.w3.org/ns/dx/prof/>
+PREFIX prov: <http://www.w3.org/ns/prov#>
+PREFIX qb: <http://purl.org/linked-data/cube#>
+PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX schema: <https://schema.org/>
+PREFIX sh: <http://www.w3.org/ns/shacl#>
+PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
+PREFIX sosa: <http://www.w3.org/ns/sosa/>
+PREFIX ssn: <http://www.w3.org/ns/ssn/>
+PREFIX time: <http://www.w3.org/2006/time#>
+PREFIX vann: <http://purl.org/vocab/vann/>
+PREFIX void: <http://rdfs.org/ns/void#>
+PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
+PREFIX xml: <http://www.w3.org/XML/1998/namespace>
+PREFIX hp: <https://deepset.ai/harry_potter/>
+'''
+# Prefix names ("brick", "csvw", ...) taken from the PREFIX lines of wiki_prefix.
+prefix_s = pd.Series(wiki_prefix.split("\n")).map(
+    lambda x: x if x.startswith("PREFIX") else np.nan
+).dropna().map(
+    lambda x: re.findall("PREFIX (.*): <", x)
+).map(lambda x: x[0])
+# prefix name -> namespace URL (second and third space-separated fields of each
+# non-blank line, with the trailing ":" and the surrounding "<" ">" removed).
+prefix_url_dict = dict(map(
+    lambda y: (y.split(" ")[1].replace(":", ""), y.split(" ")[2].strip()[1:-1])
+    ,filter(
+    lambda x: x.strip()
+    , wiki_prefix.split("\n"))))
+# Reverse lookup: namespace URL -> prefix name.
+url_prefix_dict = dict(map(lambda t2: t2[::-1], prefix_url_dict.items()))
+# All triples of the knowledge graph as a DataFrame; each cell is indexed with
+# ["value"] below, and the "s" / "p" / "o" columns are used later in this file.
+all_triples = kg.get_all_triples()
+spo_df = pd.DataFrame(all_triples)
+spo_df_simple = spo_df.copy()
+# Per cell: take ["value"], pair it with the first (url, prefix) entry whose URL
+# it starts with (or None), rewrite that URL as "prefix:", then URL-unquote.
+spo_df_simple = spo_df_simple.applymap(lambda x: x["value"]).applymap(lambda x:
+                                                  (list(filter(lambda t2: x.startswith(t2[0]), url_prefix_dict.items()))[0], x) if any(map(lambda t2: x.startswith(t2[0]), url_prefix_dict.items())) else (None, x)
+                                                   ).applymap(
+    lambda t2: t2[1].replace(t2[0][0], "{}:".format(t2[0][1])) if t2[0] is not None else t2[1]
+).applymap(unquote)
+'''
+#### like property in wikidata
+spo_df_simple["p"].map(
+    lambda x: x[3:] if x.startswith("hp:") else np.nan
+).dropna().value_counts()
+#### others in p col (rdf:type)
+spo_df_simple["p"].map(
+    lambda x: x if not x.startswith("hp:") else np.nan
+).dropna().value_counts()
+#### groupby different entity type view
+pd.concat(
+list(map(
+    lambda t2: t2[1].head(2),
+list(spo_df_simple[
+    spo_df_simple["p"] == "rdf:type"
+].sort_values(by = ["o", "s"]).groupby("o"))
+)), axis = 0).head(30)
+'''
+#### spo s(type)o
+#### use deepl translate to lookup
+#spo_trans_total_df = pd.read_csv("../data/spo_trans_total.csv")
+# Translation table loaded from CSV; dict() over the rows requires exactly two
+# columns per row (key, value) — presumably English term -> Chinese term; confirm against the CSV.
+spo_trans_total_df = pd.read_csv("data/spo_trans_total.csv")
+spo_trans_dict = dict(spo_trans_total_df.values.tolist())
+'''
+with open("../data/spo_trans_dict.json", "w") as f:
+    json.dump(spo_trans_dict, f)
+'''
+# Reverse table (value -> key); duplicate values collapse to a single entry.
+spo_trans_back_dict = dict(map(lambda t2: t2[::-1], spo_trans_dict.items()))
+# Working copy of the prefixed triples used to build the translated frame.
+spo_df_simple_keyed = spo_df_simple.copy()
+def map_to_trans_key(src):
+    """Convert an ``hp:``-prefixed term into its translation-table key.
+
+    The prefix and any single or double quotes are removed and underscores
+    become spaces. Values whose string form does not start with ``hp:`` yield ``np.nan``.
+    """
+    text = str(src)
+    if text.startswith("hp:"):
+        local_name = text[3:]
+        for quote_char in ('"', "'"):
+            local_name = local_name.replace(quote_char, '')
+        return local_name.replace("_", " ")
+    return np.nan
+# Translated view of the triples: each cell is paired with its translation key
+# and replaced by spo_trans_dict[key] when the key is a string present in the
+# table; every other cell keeps its original value.
+spo_df_simple_trans = spo_df_simple_keyed.applymap(
+    lambda x: (x ,map_to_trans_key(x))
+).applymap(
+    lambda t2: spo_trans_dict.get(t2[1], t2[0]) if type(t2[1]) == type("") else t2[0]
+)
+'''
+pd.concat(
+list(map(
+    lambda t2: t2[1].head(2),
+list(spo_df_simple_trans[
+    spo_df_simple_trans["p"] == "rdf:type"
+].sort_values(by = ["o", "s"]).groupby("o"))
+)), axis = 0).head(50)
+spo_df_simple_trans[
+    spo_df_simple_trans["s"] == "斯蒂芬-康福特"
+]
+'''
+# Text-to-SPARQL retriever over kg; the model is loaded from data/hp_v3.4
+# (path is relative to the working directory).
+model_dir = "data/"
+kgqa_retriever = Text2SparqlRetriever(knowledge_graph=kg, model_name_or_path=model_dir + "hp_v3.4")
+def decode_query(eng_query ,kgqa_retriever, top_k = 3):
+    """Generate candidate SPARQL strings for an English question.
+
+    Uses ``kgqa_retriever.tok`` to encode the question and
+    ``kgqa_retriever.model.generate`` with beam search to produce
+    ``top_k + 2`` sequences (the surplus leaves room to drop malformed
+    queries later). Returns the decoded strings in generation order.
+    """
+    n_candidates = top_k + 2
+    encoded = kgqa_retriever.tok(
+        [eng_query], max_length=100, truncation=True, return_tensors="pt"
+    )
+    generated = kgqa_retriever.model.generate(
+        encoded["input_ids"],
+        num_beams=max(5, n_candidates),
+        max_length=100,
+        num_return_sequences=n_candidates,
+        early_stopping=True,
+    )
+    decoded = []
+    for sequence in generated:
+        decoded.append(
+            kgqa_retriever.tok.decode(
+                sequence, skip_special_tokens=True, clean_up_tokenization_spaces=False
+            )
+        )
+    return decoded
+import re
+from uuid import uuid1
+import jionlp as jio
+special_match_token_list = [
+        " filter(",
+]
+def fill_bk(str_):
+    """Return the prefix of ``str_`` that ends where its first ``(`` is balanced again.
+
+    Characters are scanned left to right while tracking parenthesis depth; the
+    scan stops at the first position where at least one ``(`` has been seen and
+    the depth is back to zero. The whole string is returned if that never happens.
+    """
+    depth = 0
+    seen_open = False
+    for pos, ch in enumerate(str_):
+        if ch == "(":
+            depth += 1
+            seen_open = True
+        if ch == ")":
+            depth -= 1
+        if seen_open and depth == 0:
+            return str_[:pos + 1]
+    return str_
+def match_special_token(query, special_match_token_list):
+    """Find which special tokens occur in ``query`` and the balanced clause each one opens.
+
+    ``query`` must be a ``str`` and ``special_match_token_list`` a ``list``
+    (enforced with ``assert``). For every token present in ``query`` the result
+    holds ``(token, clause)``, where ``clause`` is ``fill_bk`` applied to the
+    query text starting at the token's first occurrence. Returns ``[]`` when no
+    token is present.
+    """
+    assert type(query) == type("")
+    assert type(special_match_token_list) == type([])
+    return [
+        (token, fill_bk(query[query.find(token):]))
+        for token in special_match_token_list
+        if token in query
+    ]
+def retrieve_sent_split(sent,
+                       stops_split_pattern = "|".join("\\" + ch for ch in " ")
+                       ):
+    """Split ``sent`` on ``stops_split_pattern`` (by default a single escaped space).
+
+    Returns ``[]`` for a blank ``sent``, otherwise the raw ``re.split`` result
+    (consecutive separators produce empty strings).
+    """
+    if sent.strip():
+        return re.split(stops_split_pattern, sent)
+    return []
+import jionlp as jio
+# SPARQL variable names the decoder is expected to emit for the unknown slot.
+ask_l = [
+        "?answer", "?value", "?obj", "?sbj", "?s", "?x",  "?a"
+    ]
+# Lexicon matcher over those variable names; query_to_t3 reads the "text" field of its results.
+ask_ner = jio.ner.LexiconNER({
+    "ask": ask_l
+})
+# Parse a decoded SPARQL string into a list of token lists, one per "."-separated
+# pattern inside the braces. Returns [] when the query has no "{...}" part.
+# filter_list, when non-empty, keeps only the patterns containing one of its items.
+def query_to_t3(query, filter_list = [], ask_ner = ask_ner):
+    '''
+    query = query.replace("?answer", " ?answer ")
+    query = query.replace("?value", " ?value ")
+    query = query.replace("?obj", " ?obj ")
+    query = query.replace("?sbj", " ?sbj ")
+    query = query.replace("?s", " ?s ")
+    query = query.replace("?x", " ?x ")
+    '''
+    # Pad every variable name found by ask_ner with spaces, longest names first.
+    # NOTE(review): replacing a short name such as "?s" afterwards also hits longer
+    # names that contain it ("?sbj") — confirm ask_ner never reports both.
+    l = ask_ner(query)
+    l = sorted(set(map(lambda x: x["text"], l)), key = len, reverse = True)
+    for k in l:
+        query = query.replace(k, " {} ".format(k))
+    '''
+    if "where" not in query and "WHERE" not in query:
+        return []
+    '''
+    # Balanced clauses opened by a special token (" filter(") are removed from the query.
+    special_token_list = match_special_token(query, special_match_token_list)
+    #return special_token_list
+    if special_token_list:
+        special_token_list = list(set(map(lambda t2: t2[1] ,special_token_list)))
+        uid_special_token_dict = dict(map(lambda x: (str(uuid1()), x), special_token_list))
+        special_token_uid_dict = dict(map(lambda t2: t2[::-1], uid_special_token_dict.items()))
+        assert len(special_token_uid_dict) == len(uid_special_token_dict)
+        for k, v in sorted(special_token_uid_dict.items(), key = lambda t2: len(t2[0]), reverse = True):
+            if k in query:
+                #query = query.replace(k, v)
+                query = query.replace(k, "")
+    else:
+        uid_special_token_dict = {}
+        special_token_uid_dict = {}
+    '''
+    if "where" in query:
+        tail = "where".join(query.split("where")[1:])
+    elif "WHERE" in query:
+        tail = "WHERE".join(query.split("WHERE")[1:])
+    '''
+    #return query
+    # Make sure the query is closed, then take the text between the first "{" and the last "}".
+    query = query.strip()
+    if not query.endswith("}"):
+        query = query + "}"
+    tail = re.findall(r"{(.*)}", query)
+    #return tail
+    #return t3_list
+    if not tail:
+        return []
+    else:
+        tail = tail[0]
+    # One pattern per "." — NOTE(review): this also splits on dots inside URIs or literals.
+    t3_list = list(map(lambda x: x.strip() ,tail.split(".")))
+    t3_list_ = []
+    for ele in t3_list:
+        # NOTE(review): the clauses were replaced by "" above rather than by their uid,
+        # so no uid should occur in ele and this substitution looks like a no-op.
+        for k, v in uid_special_token_dict.items():
+            if k in ele:
+                ele = ele.replace(k, v)
+        t3_list_.append(ele)
+    t3_list = t3_list_
+    if filter_list:
+        t3_list = list(filter(lambda x:
+                         any(map(lambda y: y in x ,filter_list))
+                         , t3_list))
+    # Tokenise each pattern with this module's retrieve_sent_split (splits on a space), dropping blanks.
+    t3_list = list(map(lambda x:
+        list(filter(lambda y: y.strip() ,retrieve_sent_split(x)))
+        , t3_list))
+    return t3_list
+def decode_property(eng_query ,kgqa_retriever, top_k = 3):
+    """Decode an English question and return the predicates of the generated triples.
+
+    Every SPARQL candidate from ``decode_query`` is parsed with ``query_to_t3``;
+    from each parsed pattern that has exactly three tokens the middle token
+    (the predicate) is collected. Returns ``[]`` when nothing qualifies.
+    Duplicates are kept, in generation order.
+    """
+    sparql_queries = decode_query(eng_query,  kgqa_retriever, top_k = top_k)
+    if not sparql_queries:
+        return []
+    parsed_queries = [query_to_t3(candidate) for candidate in sparql_queries]
+    return [
+        pattern[1]
+        for patterns in parsed_queries
+        for pattern in patterns
+        if len(pattern) == 3
+    ]
+'''
+#### ori query decoder
+query = "Harry Potter live in which house?"
+query = "when was Stephen cornfoot born?"
+decode_query(query,  kgqa_retriever)
+#### ori query decoder only maintain property part
+query = "Harry Potter live in which house in 1920?"
+query = "Harry live in where?"
+query = "Harry live in where?"
+query = "when was Stephen cornfoot born?"
+query = "what is Stephen's loyalty?"
+decode_property(query,  kgqa_retriever)
+query = "who is the leader of Divination homework meeting?"
+'''
+def template_fullfill_reconstruct_query(
+    entity_list = ["http://www.wikidata.org/entity/Q42780"],
+    property_list = ["http://www.wikidata.org/prop/direct/P131",
+                     "http://www.wikidata.org/prop/direct/P150"],
+    generate_t3_func = lambda el, pl: pd.Series(list(product(el, pl))).map(
+        lambda ep: [(ep[0], ep[1], "?a"), ("?a", ep[1], ep[0])]
+    ).explode().dropna().drop_duplicates().tolist()
+):
+    """Build ``select ?a {...}`` queries for every entity/property combination.
+
+    ``entity_list`` and ``property_list`` must be lists (enforced with
+    ``assert``); an empty one yields ``[]``. ``generate_t3_func`` turns both
+    lists into triples; the default emits, for each pair, the entity as subject
+    and as object with ``?a`` in the other slot, de-duplicated. Each triple is
+    joined with spaces inside ``select ?a {`` ... ``}``.
+    """
+    assert type(entity_list) == type([])
+    assert type(property_list) == type([])
+    if not (entity_list and property_list):
+        return []
+    triples = generate_t3_func(entity_list, property_list)
+    return ["select ?a {" + " ".join(list(triple)) + "}" for triple in triples]
+'''
+sparql_queries_reconstruct = template_fullfill_reconstruct_query(
+    ["hp:Divination_homework_meeting"],
+    ["hp:leader"]
+)
+sparql_queries_reconstruct
+'''
+def run_sparql_queries(sparql_queries, kgqa_retriever, top_k = 3):
+    """Run SPARQL queries against the retriever's knowledge graph and format the hits.
+
+    Queries are executed in order with ``kgqa_retriever._query_kg``; only
+    queries with a non-empty answer count. The first ``top_k`` such answers are
+    passed through ``kgqa_retriever.format_result`` and returned. When nothing
+    answers, a single placeholder ``("", "")`` is formatted instead.
+    """
+    # Only the first top_k hits are returned, so stop issuing queries once we have
+    # them (callers pass hundreds of candidates). Non-positive or non-int top_k
+    # keeps the exhaustive behaviour so that the slice below is unchanged.
+    limit = top_k if isinstance(top_k, int) and top_k > 0 else None
+    answers = []
+    for sparql_query in sparql_queries:
+        if limit is not None and len(answers) >= limit:
+            break
+        ans, query = kgqa_retriever._query_kg(sparql_query=sparql_query)
+        if len(ans) > 0:
+            answers.append((ans, query))
+    # if there are no answers we still want to return something
+    if not answers:
+        answers.append(("", ""))
+    return [kgqa_retriever.format_result(result) for result in answers[:top_k]]
+'''
+#### one conclusion
+run_sparql_queries(sparql_queries_reconstruct, kgqa_retriever)
+'''
+#### start kbqa_protable_service (server)
+def retrieve_et(zh_question, only_e = True):
+    """Extract entity / property tags from a Chinese question via the remote extractor.
+
+    ``zh_question`` must be a ``str`` (enforced with ``assert``).
+
+    With ``only_e`` true, returns the ``"E-TAG"`` list of the extractor output
+    (``[]`` when absent or when the extractor returned no usable mapping).
+    Otherwise returns the raw extractor output.
+    """
+    assert type(zh_question) == type("")
+    output = call_entity_property_extract(zh_question)
+    if not only_e:
+        return output
+    # The extractor returns its raw, empty "data" value when the service yields
+    # nothing; that value has no .get, so treat it as "no entities found".
+    if not isinstance(output, dict):
+        return []
+    return output.get("E-TAG", [])
+'''
+#### start qa server
+def retrieve_head(zh_question):
+    req = requests.post(
+    url = "http://localhost:8811/qa_downstream_process",
+    data = {
+        "entity": "",
+        "question": zh_question,
+        "context": zh_question
+    }
+    )
+    output = json.loads(req.content.decode())
+    if "head" in output:
+        return output["head"]
+    return ""
+'''
+def retrieve_head(zh_question):
+    """Return the question words ("head") of a Chinese question, or ``""`` if none.
+
+    Runs ``qa_downstream_process`` with an empty entity and the question as
+    its own context, then reads the ``"head"`` entry of the resulting dict.
+    """
+    output = qa_downstream_process("", zh_question, zh_question)
+    assert type(output) == type({})
+    return output["head"] if "head" in output else ""
+'''
+zh_question = "谁是占卜会议的领导者？"
+retrieve_et(zh_question)
+'''
+def property_and_type_slice(spo_df_simple_trans, p_l = [], type_l = []):
+    """Keep all triples of the subjects that match the given types and properties.
+
+    A copy of ``spo_df_simple_trans`` (columns ``s``, ``p``, ``o``) is narrowed
+    first to subjects having an ``o`` value in ``type_l`` (if given), then to
+    subjects having a ``p`` value in ``p_l`` (if given). Returns the remaining
+    rows, or ``None`` as soon as nothing is left.
+    """
+    req = spo_df_simple_trans.copy()
+    for column, wanted in (("o", type_l), ("p", p_l)):
+        if wanted:
+            matching = req[req[column].isin(wanted)]
+            subjects = matching["s"].drop_duplicates().dropna().values.tolist()
+            req = req[req["s"].isin(subjects)]
+        if req.size == 0:
+            return None
+    return req
+'''
+### Organisation_ sanple
+property_and_type_slice(
+    spo_df_simple_trans, p_l = ["创立"], type_l = ["hp:Organisation_"]
+).sort_values(by = "s")["s"].drop_duplicates().sample(n = 30)
+### people sample
+property_and_type_slice(
+    spo_df_simple_trans, p_l = ["出生"], type_l = ["hp:Individual_"]
+).sort_values(by = "s")["s"].drop_duplicates().sample(n = 30)
+zh_question = "谁是占卜会议的领导者？"
+en_question = zh_en_naive_model.translate([zh_question], source_lang="zh", target_lang = "en")[0]
+en_properties = decode_property(en_question,  kgqa_retriever)
+en_properties
+'''
+# Distinct predicates of the knowledge graph (prefixed form, e.g. "hp:born").
+all_en_p = spo_df_simple["p"].drop_duplicates().dropna().values.tolist()
+# Distinct non-empty "_"-separated tokens of the hp: predicates.
+all_en_p_tokens = pd.Series(list(map(lambda x: x[3:].split("_") ,filter(lambda x:  x.startswith("hp:"), all_en_p)))).explode().dropna().map(
+    lambda x: x if bool(x) else np.nan
+).dropna().drop_duplicates().values.tolist()
+###all_en_p_tokens[:10]
+# Predicate table: en_p (without rdf:type) and zh_p, its entry in spo_trans_dict
+# (falls back to the prefix-less, space-separated English name).
+all_p_df = pd.Series(all_en_p).reset_index().iloc[:, 1:]
+all_p_df.columns = ["en_p"]
+all_p_df = all_p_df[
+    all_p_df["en_p"] != "rdf:type"
+]
+all_p_df["zh_p"] = all_p_df["en_p"].map(
+    lambda x: spo_trans_dict.get(x.replace("hp:", "").replace("_", " "), x.replace("hp:", "").replace("_", " "))
+)
+#all_p_df
+#### decoder property mapping: (map decoder to kb exists)
+decode_map_config_dict = {
+    "hp:birth":  'hp:born',
+    'hp:birthday': "hp:born"
+}
+#### decoder sim property mapping: (decoder that can not distinguish)
+decode_sim_config_dict = {
+    'hp:ingredients':  "hp:characteristics",
+    "hp:characteristics": 'hp:ingredients'
+}
+def decode_property_link_to_ori(decode_property,  all_en_p, all_en_p_tokens, equal_threshold = 80):
+    """Link a decoded ``hp:`` predicate to predicates that exist in the knowledge base.
+
+    Returns ``None`` for values that do not start with ``hp:`` or are just the
+    prefix; ``[(p, 100.0)]`` for an exact member of ``all_en_p``;
+    ``[(mapped, 99.0)]`` for a key of ``decode_map_config_dict``. Otherwise the
+    name is reduced to the tokens found in ``all_en_p_tokens`` and the ten
+    entries of ``all_en_p`` with the highest ``fuzz.ratio`` against it are
+    returned as ``(predicate, score)`` pairs. ``equal_threshold`` is not used.
+    """
+    if not (decode_property.startswith("hp:") and len(decode_property) >= 3):
+        return None
+    if decode_property in all_en_p:
+        return [(decode_property, 100.0)]
+    if decode_property in decode_map_config_dict:
+        return [(decode_map_config_dict[decode_property], 99.0)]
+    if decode_property == "hp:":
+        return None
+    known_tokens = [tok for tok in decode_property[3:].split("_") if tok in all_en_p_tokens]
+    reduced = "hp:{}".format("_".join(known_tokens))
+    scored = [(candidate, fuzz.ratio(candidate, reduced)) for candidate in all_en_p]
+    scored.sort(key = lambda pair: pair[1], reverse = True)
+    return scored[:10]
+'''
+#### minimize maintain one token sorted.
+decode_property_link_to_ori("hp:born",  all_en_p, all_en_p_tokens, equal_threshold = 80)
+decode_property_link_to_ori("hp:birth",  all_en_p, all_en_p_tokens, equal_threshold = 80)
+decode_property_link_to_ori("hp:head_of_the_assembly",  all_en_p, all_en_p_tokens, equal_threshold = 80)
+'''
+def output_to_dict(output, trans_keys = ["answers"]):
+    """Copy ``output`` into a plain dict, converting the items under ``trans_keys``.
+
+    For each key listed in ``trans_keys`` the value is replaced by a list of
+    ``item.to_dict()`` results; other entries are copied as they are. Converted
+    keys come first in the returned dict.
+    """
+    converted = [
+        (key, [item.to_dict() for item in value])
+        for key, value in output.items()
+        if key in trans_keys
+    ]
+    untouched = [(key, value) for key, value in output.items() if key not in trans_keys]
+    return dict(converted + untouched)
+def zh_question_to_p_zh_en_map(zh_question, top_k = 3):
+    """Map a Chinese question to candidate KB predicates and their Chinese spans.
+
+    Returns a DataFrame with the columns ``en_property``, ``score``,
+    ``zh_property`` and ``ext_score``, or ``None`` when no predicate can be
+    decoded, linked or located in the question. Performs remote API calls
+    (translation, span extraction, question-word extraction).
+    """
+    #zh_question = "谁是占卜会议的领导者？"
+    #en_question = zh_en_naive_model.translate([zh_question], source_lang="zh", target_lang = "en")[0]
+    # Translate to English, then decode candidate predicates from the generated SPARQL.
+    en_question = call_zh_en_naive_model(zh_question)
+    en_properties = decode_property(en_question,  kgqa_retriever, top_k = top_k)
+    if not en_properties:
+        return None
+    # Unique predicates, most frequently decoded first.
+    en_properties_top_sort = pd.Series(en_properties).value_counts().index.tolist()
+    # Link each decoded predicate to the KB and keep only its best (first) match.
+    en_properties_mapped = list(map(
+    lambda x: decode_property_link_to_ori(x,  all_en_p, all_en_p_tokens, equal_threshold = 80), en_properties_top_sort
+    ))
+    en_properties_mapped = list(filter(lambda x: hasattr(x, "__len__") and len(x) >= 1, en_properties_mapped))
+    if not en_properties_mapped:
+        return None
+    en_properties_mapped = list(map(lambda x: x[0] ,en_properties_mapped))
+    en_properties_mapped_df = pd.DataFrame(en_properties_mapped)
+    assert en_properties_mapped_df.shape[1] == 2
+    en_properties_mapped_df.columns = ["en_property", "score"]
+    '''
+    en_properties_mapped_df["zh_property"] = en_properties_mapped_df["en_property"].map(
+        lambda x: en_zh_reader.predict_on_texts(
+        question=x.replace("hp:", ""),
+        texts=[zh_question]
+    )
+    ).map(output_to_dict)
+    '''
+    # Ask the remote reader which span of the question corresponds to each predicate name.
+    en_properties_mapped_df["zh_property"] = en_properties_mapped_df["en_property"].map(
+        lambda x: call_en_zh_reader(
+        x.replace("hp:", ""),
+        zh_question
+        )
+    )
+    # NOTE(review): x["answers"] assumes the reader returned a mapping; call_en_zh_reader
+    # returns its empty "data" value when the service yields nothing.
+    en_properties_mapped_df["zh_property"] = en_properties_mapped_df["zh_property"].map(lambda x: x["answers"][0] if  x["answers"] else {})
+    en_properties_mapped_df = en_properties_mapped_df[
+    en_properties_mapped_df["zh_property"].map(bool)
+    ]
+    if en_properties_mapped_df is None or en_properties_mapped_df.size == 0:
+        return None
+    #return nerd_df
+    # Split the best reader answer into its score and its text.
+    en_properties_mapped_df["ext_score"] = en_properties_mapped_df["zh_property"].map(
+    lambda x: x["score"]
+    )
+    en_properties_mapped_df["zh_property"] = en_properties_mapped_df["zh_property"].map(
+    lambda x: x["answer"]
+    )
+    '''
+    en_properties_mapped_df = en_properties_mapped_df[
+    en_properties_mapped_df["ext_score"].map(lambda x: x > score_threshold)
+    ]
+    '''
+    if en_properties_mapped_df is None or en_properties_mapped_df.size == 0:
+        return None
+    # Prepend a copy of the first row whose zh_property is the question words of the question.
+    ask_head = retrieve_head(zh_question)
+    #if type(ask_head) == type("") and "什么" in ask_head:
+    if type(ask_head) == type(""):
+        #ask_head = ask_head.replace("什么", "")
+        first_d = en_properties_mapped_df.iloc[0].to_dict()
+        first_d["zh_property"] = ask_head
+        en_properties_mapped_df = pd.DataFrame(
+        [first_d] + en_properties_mapped_df.apply(lambda x: x.to_dict(), axis = 1).values.tolist()
+        )
+    else:
+        pass
+    # Drop rows with an empty zh_property (e.g. an empty head) and exact duplicates.
+    en_properties_mapped_df = en_properties_mapped_df[
+    en_properties_mapped_df["zh_property"].map(lambda x: bool(x))
+    ].drop_duplicates()
+    return en_properties_mapped_df
+def search_sym_p(question_p_df,  all_p_df):
+    """Score every KB predicate against each Chinese property span of the question.
+
+    For each row of ``question_p_df`` a copy of ``all_p_df`` is tagged with that
+    row's ``zh_property`` and ``en_property``. The copies are concatenated,
+    ``zh_sim`` is computed with ``synonyms.compare(zh_property, zh_p)``, and the
+    result is returned sorted by ``zh_sim`` in descending order.
+    """
+    frames = []
+    for _, row in question_p_df.iterrows():
+        candidates = all_p_df.copy()
+        n_rows = len(candidates)
+        candidates["zh_property"] = [row["zh_property"]] * n_rows
+        candidates["en_property"] = [row["en_property"]] * n_rows
+        frames.append(candidates)
+    merged = pd.concat(frames, axis = 0)
+    merged["zh_sim"] = merged.apply(
+        lambda rec: synonyms.compare(rec["zh_property"], rec["zh_p"]), axis = 1
+    )
+    return merged.sort_values(by = "zh_sim", ascending = False)
+# Distinct values appearing in the subject or object column of the triples.
+all_en_ents = pd.Series(spo_df_simple[["s", "o"]].values.reshape([-1])).drop_duplicates().values.tolist()
+# Entity table: en_ent and zh_ent, its entry in spo_trans_dict (falls back to
+# the prefix-less, space-separated English name).
+all_ents_df = pd.Series(all_en_ents).reset_index().iloc[:, 1:]
+all_ents_df.columns = ["en_ent"]
+all_ents_df = all_ents_df[
+    all_ents_df["en_ent"] != "rdf:type"
+]
+all_ents_df["zh_ent"] = all_ents_df["en_ent"].map(
+    lambda x: spo_trans_dict.get(x.replace("hp:", "").replace("_", " "), x.replace("hp:", "").replace("_", " "))
+)
+#all_ents_df
+def search_sym_entity(entity_str,  all_ents_df, use_syn = False):
+    """Rank all KB entities by similarity of their Chinese name to ``entity_str``.
+
+    Works on a copy of ``all_ents_df`` with two added columns: ``entity_str``
+    and ``zh_sim``, computed by ``synonyms.compare`` when ``use_syn`` is true
+    and by ``fuzz.ratio`` otherwise. Returns the copy sorted by ``zh_sim`` in
+    descending order.
+    """
+    ranked = all_ents_df.copy()
+    ranked["entity_str"] = [entity_str] * len(ranked)
+    scorer = synonyms.compare if use_syn else fuzz.ratio
+    ranked["zh_sim"] = ranked.apply(
+        lambda row: scorer(row["zh_ent"], row["entity_str"]), axis = 1
+    )
+    return ranked.sort_values(by = "zh_sim", ascending = False)
+#### Import-time sample run on one question (performs remote API calls).
+#### Other sample questions tried during development:
+####   "谁是占卜会议的领导者？"  "洛林出生在哪个国家？"  "洛林出生在哪个地方？"
+####   "洛林的血缘是什么？"  "洛林的生日是什么？"  "洛林的家族是什么？"
+####   "洛林的性别是什么？"  "洛林的标题是什么？"  "洛林的主题是什么？"
+####   "这个物品的特征是什么？"  "强效祛斑药水的特征是什么？"  "魔法学校的成立日期是什么？"
+zh_question = "魔法学校的校长是谁？"
+question_p_df = zh_question_to_p_zh_en_map(zh_question)
+#question_p_df
+#### top en_p as consider (high zh_sim)
+#### need preload to precaculate all candidates in all_p_df
+#### zh_question_to_p_zh_en_map returns None when nothing can be mapped;
+#### search_sym_p would then raise and abort the import, so guard the call.
+if hasattr(question_p_df, "size") and question_p_df.size > 0:
+    sym_p_df = search_sym_p(question_p_df,  all_p_df)
+else:
+    sym_p_df = None
+#sym_p_df
+#sym_p_df
+'''
+#### this can be done, all related with translate accurate
+entity_str = "占卜会议"
+search_sym_entity(entity_str,  all_ents_df)
+#### re translate in massive times
+pd.Series(list(spo_trans_dict.keys())).to_csv("../data/all_consider.csv", index = False)
+'''
+#### ->
+'''
+sparql_queries_reconstruct = template_fullfill_reconstruct_query(
+    ["hp:Divination_homework_meeting"],
+    ["hp:leader"]
+)
+sparql_queries_reconstruct
+'''
+def from_zh_question_to_consider_queries(zh_question, top_k = 32, top_p_k = 5, top_e_k = 50, kgqa_retriever = kgqa_retriever,):
+    """Answer a Chinese question by rebuilding SPARQL from matched entities and predicates.
+
+    Entities come from ``retrieve_et``, predicates from
+    ``zh_question_to_p_zh_en_map`` + ``search_sym_p``; the best ``top_e_k``
+    entities and ``top_p_k`` predicates are combined with
+    ``template_fullfill_reconstruct_query`` and executed with
+    ``run_sparql_queries``. Prints the chosen entities and predicates.
+
+    Returns ``(queries, results)``, or ``None`` when any stage yields nothing.
+    """
+    def _is_empty_frame(frame):
+        # true for None / non-DataFrame values as well as for frames without cells
+        return not hasattr(frame, "size") or frame.size == 0
+    zh_ents = retrieve_et(zh_question)
+    if type(zh_ents) != type([]) or not zh_ents:
+        return None
+    question_p_df = zh_question_to_p_zh_en_map(zh_question, top_k = top_p_k)
+    if _is_empty_frame(question_p_df):
+        return None
+    ### en_p
+    sym_p_df = search_sym_p(question_p_df,  all_p_df)
+    if _is_empty_frame(sym_p_df):
+        return None
+    sim_entity_df_list = []
+    for entity_str in zh_ents:
+        ranked_entities = search_sym_entity(entity_str,  all_ents_df)
+        if not _is_empty_frame(ranked_entities):
+            sim_entity_df_list.append(ranked_entities)
+    if not sim_entity_df_list:
+        return None
+    #### en_ent
+    sym_ent_df = pd.concat(sim_entity_df_list, axis = 0).sort_values(by = "zh_sim", ascending = False)
+    top_p = sym_p_df["en_p"].drop_duplicates().dropna().head(top_p_k).values.tolist()
+    top_e = sym_ent_df["en_ent"].drop_duplicates().dropna().head(top_e_k).values.tolist()
+    print(top_e)
+    print(top_p)
+    if not (top_p and top_e):
+        return None
+    sparql_queries_reconstruct = template_fullfill_reconstruct_query(top_e, top_p)
+    if not sparql_queries_reconstruct:
+        return None
+    output = run_sparql_queries(sparql_queries_reconstruct, kgqa_retriever, top_k = top_k)
+    return sparql_queries_reconstruct ,output
+def trans_output(zh_question ,output):
+    if type(output) != type([]):
+        return output
+    def single_trans(d):
+        assert type(d) == type({})
+        if not d:
+            return d
+        req = {}
+        answer = d.get("answer")
+        if type(answer) == type([]):
+            answer = list(map(lambda x:
+                         spo_trans_dict.get(x.split("/")[-1].replace("_", " "),
+                                           x.split("/")[-1].replace("_", " ")
+                                           ) if x.startswith("https://deepset.ai/harry_potter") else x
+                         , answer))
+        sparql_query = d.get("prediction_meta")
+        if sparql_query is not None:
+            sparql_query = sparql_query.get("sparql_query")
+        if type(sparql_query) == type(""):
+            t3_in_query = query_to_t3(sparql_query)
+            hp_l = pd.Series(np.asarray(t3_in_query).reshape([-1])).map(lambda x: x[3:] if x.startswith("hp:") else np.nan).dropna().drop_duplicates().values.tolist()
+            for ele in sorted(hp_l, key = len, reverse = True):
+                sparql_query = sparql_query.replace(ele, spo_trans_dict.get(ele.split("/")[-1].replace("_", " "),
+                                           ele.split("/")[-1].replace("_", " ")))
+        if answer is not None:
+            req["answer"] = answer
+        if sparql_query is not None:
+            req["sparql_query"] = sparql_query
+        return req
+    output_trans = list(map(single_trans, output))
+    output_trans = sorted(output_trans, key = lambda d:
+    synonyms.compare(zh_question, " " if  d.get("sparql_query", " ") else " ") if type(d) == type({}) else 0.0
+    , reverse = True)
+    return output_trans
+def ranking_output(zh_question, zh_output):
+    e_t_dict = retrieve_et(zh_question, only_e=False)
+    e = e_t_dict.get("E-TAG", [])
+    t = e_t_dict.get("T-TAG", [])
+    e, t = map(" ".join, [e, t])
+    print(e, t)
+    df = pd.DataFrame(zh_output)
+    df = df.explode("answer")
+    #### e query
+    df["e_score"] = df["sparql_query"].map(lambda x: re.findall("{(.*)}" ,x)[0]).map(lambda x:
+                                                    list(filter(lambda y: "?" not in y ,
+                                                                list(np.asarray(x.split())[[0, -1]])
+                                                               ))
+                                                                                    ).map(" ".join).map(lambda x:
+                                                                                                        [e, x.split(":")[-1]]
+                                                                                                       ).map(lambda x: list(map(lambda y:
+                                                                                                                                y.replace(" ", "") ,x))).map(lambda x:
+                                                                                                             fuzz.ratio(*x))
+    df["t_score"] = df["sparql_query"].map(lambda x: re.findall("{(.*)}" ,x)[0]).map(lambda x:
+                                                    list(filter(lambda y: "?" not in y ,
+                                                                x.split()[1]
+                                                               ))
+                                                                                    ).map(" ".join).map(lambda x:
+                                                                                                        [t, x.split(":")[-1]]
+                                                                                                       ).map(lambda x: list(map(lambda y:
+                                                                                                                                y.replace(" ", "") ,x))).map(lambda x:
+                                                                                                             fuzz.ratio(*x))
+    #df["a_score"] = df["answer"].map(lambda x: [x, t]).map(lambda x: synonyms.compare(*x)) * 100
+    df["et_score"] = df[["e_score", "t_score", ]].sum(axis = 1)
+    df = df.sort_values(by = "et_score", ascending = False)
+    if df["et_score"].iloc[0] >= 50:
+        return df
+    df["e_score"] = df["sparql_query"].map(lambda x: re.findall("{(.*)}" ,x)[0]).map(lambda x:
+                                                    list(filter(lambda y: "?" not in y ,
+                                                                list(np.asarray(x.split())[[0, -1]])
+                                                               ))
+                                                                                    ).map(" ".join).map(lambda x:
+                                                                                                        [e, x.split(":")[-1]]
+                                                                                                       ).map(lambda x: list(map(lambda y:
+                                                                                                                                y.replace(" ", "") ,x))).map(lambda x:
+                                                                                                             synonyms.compare(*x))
+    df["t_score"] = df["sparql_query"].map(lambda x: re.findall("{(.*)}" ,x)[0]).map(lambda x:
+                                                    list(filter(lambda y: "?" not in y ,
+                                                                x.split()[1]
+                                                               ))
+                                                                                    ).map(" ".join).map(lambda x:
+                                                                                                        [t, x.split(":")[-1]]
+                                                                                                       ).map(lambda x: list(map(lambda y:
+                                                                                                                                y.replace(" ", "") ,x))).map(lambda x:
+                                                                                                             synonyms.compare(*x))
+    #df["a_score"] = df["answer"].map(lambda x: [x, t]).map(lambda x: synonyms.compare(*x))
+    #df["a_score"] = df["a_score"] / 100.0
+    df["et_score"] = df[["e_score", "t_score", ]].sum(axis = 1)
+    df = df.sort_values(by = "et_score", ascending = False)
+    return df
+if __name__ == "__main__":
+    #### 血缘 need fintune， tackle with ranking_output
+    #### top3 to top5 recall design
+    zh_question = "哈利波特的血缘是什么？"
+    #output = from_zh_question_to_consider_queries(zh_question)
+    output = from_zh_question_to_consider_queries(zh_question,
+                                             top_k = 32, top_p_k = 30, top_e_k = 50
+                                             )
+    if type(output) == type((1,)):
+        query_list, output = output
+        zh_output = trans_output(zh_question ,output)
+    else:
+        zh_output = None
+    zh_output
+    ranking_output(zh_question, zh_output)
+    zh_question = "哈利波特的生日是什么？"
+    #output = from_zh_question_to_consider_queries(zh_question)
+    output = from_zh_question_to_consider_queries(zh_question,
+                                             top_k = 32, top_p_k = 30, top_e_k = 50
+                                             )
+    if type(output) == type((1,)):
+        query_list, output = output
+        zh_output = trans_output(zh_question ,output)
+    else:
+        zh_output = None
+    zh_output
+    ranking_output(zh_question, zh_output)
+    zh_question = "史内普的生日是什么时候？"
+    #output = from_zh_question_to_consider_queries(zh_question)
+    output = from_zh_question_to_consider_queries(zh_question,
+                                             top_k = 32, top_p_k = 30, top_e_k = 50
+                                             )
+    if type(output) == type((1,)):
+        query_list, output = output
+        zh_output = trans_output(zh_question ,output)
+    else:
+        zh_output = None
+    zh_output
+    ranking_output(zh_question, zh_output)
+    zh_question = "占卜会议的领导者是谁？"
+    #output = from_zh_question_to_consider_queries(zh_question)
+    output = from_zh_question_to_consider_queries(zh_question,
+                                             top_k = 32, top_p_k = 30, top_e_k = 50
+                                             )
+    if type(output) == type((1,)):
+        query_list, output = output
+        zh_output = trans_output(zh_question ,output)
+    else:
+        zh_output = None
+    zh_output
+    ranking_output(zh_question, zh_output)
+    zh_question = "纽约卫生局的创立时间是什么？"
+    #output = from_zh_question_to_consider_queries(zh_question)
+    output = from_zh_question_to_consider_queries(zh_question,
+                                             top_k = 32, top_p_k = 30, top_e_k = 50
+                                             )
+    if type(output) == type((1,)):
+        query_list, output = output
+        zh_output = trans_output(zh_question ,output)
+    else:
+        zh_output = None
+    zh_output
+    ranking_output(zh_question, zh_output)
+    zh_question = "法兰西魔法部记录室位于哪个城市？"
+    #output = from_zh_question_to_consider_queries(zh_question)
+    output = from_zh_question_to_consider_queries(zh_question,
+                                             top_k = 32, top_p_k = 30, top_e_k = 50
+                                             )
+    if type(output) == type((1,)):
+        query_list, output = output
+        zh_output = trans_output(zh_question ,output)
+    else:
+        zh_output = None
+    zh_output
+    ranking_output(zh_question, zh_output)
+    zh_question = "邓布利多的出生日期是什么？"
+    #output = from_zh_question_to_consider_queries(zh_question)
+    output = from_zh_question_to_consider_queries(zh_question,
+                                             top_k = 32, top_p_k = 30, top_e_k = 50
+                                             )
+    if type(output) == type((1,)):
+        query_list, output = output
+        zh_output = trans_output(zh_question ,output)
+    else:
+        zh_output = None
+    zh_output
+    ranking_output(zh_question, zh_output)
+    zh_question = "哥布林叛乱发生在什么日期？"
+    #output = from_zh_question_to_consider_queries(zh_question, top_p_k = 50)
+    output = from_zh_question_to_consider_queries(zh_question,
+                                             top_k = 32, top_p_k = 30, top_e_k = 50
+                                             )
+    if type(output) == type((1,)):
+        query_list, output = output
+        zh_output = trans_output(zh_question ,output)
+    else:
+        zh_output = None
+    zh_output
+    ranking_output(zh_question, zh_output)
+    zh_question = "决斗比赛的参与者是谁？"
+    #output = from_zh_question_to_consider_queries(zh_question)
+    output = from_zh_question_to_consider_queries(zh_question,
+                                             top_k = 32, top_p_k = 30, top_e_k = 50
+                                             )
+    if type(output) == type((1,)):
+        query_list, output = output
+        zh_output = trans_output(zh_question ,output)
+    else:
+        zh_output = None
+    zh_output
+    ranking_output(zh_question, zh_output)
+    zh_question = "赫敏的丈夫是谁？"
+    #output = from_zh_question_to_consider_queries(zh_question)
+    output = from_zh_question_to_consider_queries(zh_question,
+                                             top_k = 32, top_p_k = 30, top_e_k = 50
+                                             )
+    if type(output) == type((1,)):
+        query_list, output = output
+        zh_output = trans_output(zh_question ,output)
+    else:
+        zh_output = None
+    zh_output
+    ranking_output(zh_question, zh_output)