dingo / app.py
DataEval's picture
change demo value
bea9e81 verified
import json
import os
import shutil
import gradio as gr
from dingo.exec import Executor
from dingo.io import InputArgs
def dingo_demo(dataset_source, input_path, uploaded_file, data_format, column_content, rule_list, prompt_list, model,
key, api_url):
if not data_format:
return 'ValueError: data_format can not be empty, please input.', None
if not column_content:
return 'ValueError: column_content can not be empty, please input.', None
if not rule_list and not prompt_list:
return 'ValueError: rule_list and prompt_list can not be empty at the same time.', None
# Handle input path based on dataset source
if dataset_source == "hugging_face":
if not input_path:
return 'ValueError: input_path can not be empty for hugging_face dataset, please input.', None
final_input_path = input_path
else: # local
if not uploaded_file:
return 'ValueError: Please upload a file for local dataset.', None
final_input_path = uploaded_file.name
input_data = {
"dataset": dataset_source,
"input_path": final_input_path,
"output_path": "" if dataset_source == 'hugging_face' else os.path.dirname(final_input_path),
"save_data": True,
"save_raw": True,
"data_format": data_format,
"column_content": column_content,
"custom_config":
{
"rule_list": rule_list,
"prompt_list": prompt_list,
"llm_config":
{
"detect_text_quality_detail":
{
"model": model,
"key": key,
"api_url": api_url,
}
}
}
}
input_args = InputArgs(**input_data)
executor = Executor.exec_map["local"](input_args)
executor.execute()
summary = executor.get_summary().to_dict()
detail = executor.get_bad_info_list()
new_detail = []
for item in detail:
new_detail.append(item.to_raw_dict())
if summary['output_path']:
shutil.rmtree(summary['output_path'])
# 返回两个值:概要信息和详细信息
return json.dumps(summary, indent=4), new_detail
def update_input_components(dataset_source):
# 根据数据源的不同,返回不同的输入组件
if dataset_source == "hugging_face":
# 如果数据源是huggingface,返回一个可见的文本框和一个不可见的文件组件
return [
gr.Textbox(visible=True),
gr.File(visible=False),
]
else: # local
# 如果数据源是本地,返回一个不可见的文本框和一个可见的文件组件
return [
gr.Textbox(visible=False),
gr.File(visible=True),
]
if __name__ == '__main__':
rule_options = ['RuleAbnormalChar', 'RuleAbnormalHtml', 'RuleContentNull', 'RuleContentShort', 'RuleEnterAndSpace', 'RuleOnlyUrl']
prompt_options = ['PromptRepeat', 'PromptContentChaos']
with open("header.html", "r") as file:
header = file.read()
with gr.Blocks() as demo:
gr.HTML(header)
with gr.Row():
with gr.Column():
with gr.Column():
dataset_source = gr.Dropdown(
choices=["hugging_face", "local"],
value="hugging_face",
label="dataset [source]"
)
input_path = gr.Textbox(
value='chupei/format-jsonl',
placeholder="please input hugging_face dataset path",
label="input_path",
visible=True
)
uploaded_file = gr.File(
label="upload file",
visible=False
)
data_format = gr.Dropdown(
["jsonl", "json", "plaintext", "listjson"],
label="data_format"
)
column_content = gr.Textbox(
value="content",
placeholder="please input column name of content in dataset",
label="column_content"
)
rule_list = gr.CheckboxGroup(
choices=rule_options,
value=['RuleAbnormalChar', 'RuleAbnormalHtml'],
label="rule_list"
)
prompt_list = gr.CheckboxGroup(
choices=prompt_options,
label="prompt_list"
)
model = gr.Textbox(
placeholder="If want to use llm, please input model, such as: deepseek-chat",
label="model"
)
key = gr.Textbox(
placeholder="If want to use llm, please input key, such as: 123456789012345678901234567890xx",
label="API KEY"
)
api_url = gr.Textbox(
placeholder="If want to use llm, please input api_url, such as: https://api.deepseek.com/v1",
label="API URL"
)
with gr.Row():
submit_single = gr.Button(value="Submit", interactive=True, variant="primary")
with gr.Column():
# 修改输出组件部分,使用Tabs
with gr.Tabs():
with gr.Tab("Result Summary"):
summary_output = gr.Textbox(label="summary", max_lines=50)
with gr.Tab("Result Detail"):
detail_output = gr.JSON(label="detail", max_height=800) # 使用JSON组件来更好地展示结构化数据
dataset_source.change(
fn=update_input_components,
inputs=dataset_source,
outputs=[input_path, uploaded_file]
)
submit_single.click(
fn=dingo_demo,
inputs=[dataset_source, input_path, uploaded_file, data_format, column_content, rule_list, prompt_list,
model, key, api_url],
outputs=[summary_output, detail_output] # 修改输出为两个组件
)
# 启动界面
demo.launch()