import html
import json
import os
import tempfile

import gradio as gr
from transformers import AutoTokenizer

from utils import evaluate, report
def process_jsonl_file(jsonl_file_path: str, api_key: str):
    """Build an HTML self-evaluation report from an uploaded JSONL file."""
    try:
        # Read one JSON object per line.
        with open(jsonl_file_path, "r", encoding="utf-8") as f:
            json_data = [json.loads(line) for line in f]
        # Run the Gemini-based self-evaluation only when an API key is provided.
        if api_key:
            json_data = evaluate(json_data, api_key)
        html_content = report(tasks=json_data)
        file_name, _ = os.path.splitext(os.path.basename(jsonl_file_path))
        # Write the report to a temporary file that Gradio can offer for download.
        with tempfile.NamedTemporaryFile(
            delete=False, prefix=f"{file_name}-report-", suffix=".html", mode="w", encoding="utf-8"
        ) as temp_file:
            temp_file.write(html_content)
            output_file = temp_file.name
        return output_file, ""
    except Exception as e:
        # Surface the error message in the UI instead of a stack trace.
        return None, str(e)
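# Illustrative input (an assumption; the exact schema is defined by utils.evaluate
# and utils.report, which are not shown here): each line of the uploaded file is a
# standalone JSON object describing one task, e.g.
#   {"input": "...", "output": "..."}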
# Gradio demo: self-evaluation report tab
with gr.Blocks() as reporting:
    jsonl_input = gr.File(label="Upload a JSONL file")
    api_key_input = gr.Textbox(label="Gemini API key (for self-evaluation scoring)", type="password")
    gr.Markdown("Get an API key [here](https://aistudio.google.com/app/apikey)")
    process_button = gr.Button("Generate report")
    output_file = gr.File(label="Self-evaluation report")
    output_text = gr.Textbox(label="System message")
    process_button.click(
        process_jsonl_file, inputs=[jsonl_input, api_key_input], outputs=[output_file, output_text]
    )
# Tokenizers for the visualization tab.
llm_jp_3 = "llm-jp/llm-jp-3-1.8b"
gemma_2 = "google/gemma-2-2b"
llm_jp_3_tokenizer = AutoTokenizer.from_pretrained(llm_jp_3, trust_remote_code=True)
gemma_2_tokenizer = AutoTokenizer.from_pretrained(gemma_2, trust_remote_code=True)
tokenizers = {
    "LLM-JP-3": llm_jp_3_tokenizer,
    "Gemma-2": gemma_2_tokenizer,
}
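# To visualize another model, load its tokenizer and register it in the dict above,
# e.g. (hypothetical model ID):
#   tokenizers["My-Model"] = AutoTokenizer.from_pretrained("org/my-model")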
def tokenize_text(text: str, tokenizer_name: str):
    """Render the token boundaries of `text` as color-coded HTML spans."""
    tokenizer = tokenizers[tokenizer_name]
    tokens = tokenizer.tokenize(text)
    colors = ["#FFCCCC", "#CCFFCC", "#CCCCFF", "#FFFFCC", "#CCFFFF", "#FFCCFF"]
    # Cycle through the background colors; escape each token so markup-like
    # tokens (e.g. "<s>") cannot break the generated HTML.
    tokenized_text = "".join(
        f'<span style="background-color:{colors[i % len(colors)]}">{html.escape(token)}</span> '
        for i, token in enumerate(tokens)
    )
    return f"<p>{tokenized_text}</p><p>Token Count: {len(tokens)}</p>"
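# Illustrative output shape: an input that tokenizes into two tokens renders as
#   <p><span style="background-color:#FFCCCC">tok1</span> <span style="background-color:#CCFFCC">tok2</span> </p><p>Token Count: 2</p>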
with gr.Blocks() as tokenization:
    with gr.Row():
        tokenizer_dropdown = gr.Dropdown(label="Select a tokenizer", choices=["LLM-JP-3", "Gemma-2"], value="LLM-JP-3")
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Input Text")
        with gr.Column():
            tokenized_output = gr.HTML(label="Tokenized Output")
    # Re-tokenize whenever either the tokenizer or the input text changes.
    tokenizer_dropdown.change(tokenize_text, inputs=[text_input, tokenizer_dropdown], outputs=tokenized_output)
    text_input.change(tokenize_text, inputs=[text_input, tokenizer_dropdown], outputs=tokenized_output)
tabbed = gr.TabbedInterface(
    [reporting, tokenization],
    tab_names=["ELYZA-tasks-100(-TV) Self-Evaluation", "Tokenization Visualization"],
    title="LLM Development Support Tools",
)
if __name__ == "__main__":
    tabbed.launch()
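# launch() also accepts standard Gradio options when needed, e.g.
# tabbed.launch(server_name="0.0.0.0") to listen on all interfaces, or
# tabbed.launch(share=True) for a temporary public URL.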