"""Gradio app for collecting translations of instruction/completion pairs.

A sample is picked from a Hugging Face dataset split, the annotator types a
translated prompt and completion, and each submission is uploaded as a
one-line JSONL file to the ``DIALOGUES_DATASET`` dataset repository.
"""

import datetime
import json
import os
import shutil
from io import StringIO

import gradio as gr
import pandas as pd
from datasets import load_dataset
from huggingface_hub import upload_file

HF_TOKEN = os.environ.get("HF_TOKEN", None)
DIALOGUES_DATASET = "ArmelRandy/MT_dialogues"


def load_data():
    """Load the source dataset (all splits) using ``HF_TOKEN`` for auth."""
    dataset = load_dataset("ArmelR/oasst1_guanaco_english", use_auth_token=HF_TOKEN)
    return dataset


samples = load_data()
splits = list(samples.keys())
languages = ["Wolof"]

# Largest valid row index over all splits; used as the slider's upper bound
# instead of a hard-coded constant. Per-split bounds are enforced by
# ``_clamp_index`` because the slider range is shared by every split.
MAX_INDEX = max(max(len(samples[name]) for name in splits) - 1, 0)

print(f"current directory {os.getcwd()}")
print(f"total path {os.path.dirname(os.path.realpath(__file__))}")

# Same CSS text as before, split over two literals for readability.
custom_css = (
    " #banner-image { display: block; margin-left: auto; margin-right: auto; }"
    " #chat-message { font-size: 14px; min-height: 300px; } "
)


def _clamp_index(index, split):
    """Return *index* as an int clamped to the valid row range of *split*."""
    last = len(samples[split]) - 1
    return max(0, min(int(index), last))


def caller_split(s):
    """Reset the view to the first row of split *s*.

    Returns:
        ``(0, prompt, completion)`` for row 0 of the chosen split.
    """
    first = samples[s][0]
    return 0, first["prompt"], first["completion"]


def identity(index, split):
    """Return ``(prompt, completion)`` for row *index* of *split*.

    The slider range is shared by all splits, so *index* may exceed the size
    of the selected split; it is clamped to the last row rather than raising
    ``IndexError``.
    """
    ds = samples[split][_clamp_index(index, split)]
    return ds["prompt"], ds["completion"]


def save(index, language, split, prompt, completion):
    """Upload one translated pair to the Hub and advance to the next sample.

    Args:
        index: Row index of the source sample (slider value).
        language: Target language of the translation.
        split: Name of the dataset split the sample belongs to.
        prompt: Translated prompt typed by the annotator.
        completion: Translated completion typed by the annotator.

    Returns:
        ``(index, source_prompt, source_completion, translated_prompt,
        translated_completion)`` to refresh the UI. After a successful upload
        the index moves to the next row (capped at the last row) and both
        translation boxes are cleared. If either translation is empty, nothing
        is uploaded, the index is unchanged, and the typed text is returned
        as-is so partial work is not lost.
    """
    # Clamp first so the recorded index always matches the displayed sample.
    index = _clamp_index(index, split)
    rows = samples[split]

    if not prompt or not completion:
        current = rows[index]
        return index, current["prompt"], current["completion"], prompt or "", completion or ""

    print("Saving ...")
    # A single timestamp keeps the folder (date/hour) and the file name
    # consistent even when the call straddles an hour boundary.
    now = datetime.datetime.now()
    timestamp = now.strftime("%Y-%m-%dT%H:%M:%S.%f")
    file_name = f"prompts_{timestamp}.jsonl"

    # NOTE(review): the split name is not stored, so ``index`` alone is
    # ambiguous across splits; confirm with downstream consumers before
    # extending the schema.
    data = {"prompt": prompt, "completion": completion, "language": language, "index": index}
    with StringIO() as buffer:
        pd.DataFrame([data]).to_json(buffer, orient="records", lines=True)
        payload = buffer.getvalue().encode()

    # Push to Hub
    upload_file(
        path_in_repo=f"{now.date()}/{now.hour}/{file_name}",
        path_or_fileobj=payload,
        repo_id=DIALOGUES_DATASET,
        token=HF_TOKEN,
        repo_type="dataset",
    )

    # Clear the translation boxes and move on to the next sample.
    next_index = min(index + 1, len(rows) - 1)
    upcoming = rows[next_index]
    return next_index, upcoming["prompt"], upcoming["completion"], "", ""


with gr.Blocks(analytics_enabled=False, css=custom_css) as demo:
    gr.HTML("""

MT💫

""")
    # Source sample selection and display.
    with gr.Blocks():
        with gr.Row():
            split = gr.Dropdown(choices=splits, label="Dataset split", value=splits[0])
        with gr.Row():
            index_example = gr.Slider(
                minimum=0,
                maximum=MAX_INDEX,
                step=1,
                value=0,
                interactive=True,
                info="Index of the chosen instruction-output pair.",
            )
        with gr.Row():
            with gr.Column():
                prompt = gr.Textbox(label="prompt")
            with gr.Column():
                completion = gr.Code(label="Completion")
    # Translation inputs.
    with gr.Blocks():
        with gr.Row():
            language = gr.Dropdown(choices=languages, label="Translation language", value=languages[0])
        with gr.Row():
            with gr.Column():
                translated_prompt = gr.Textbox(label="Translated prompt")
            with gr.Column():
                translated_completion = gr.Textbox(label="Translated completion")
        with gr.Row():
            button = gr.Button(value="Submit")

    split.change(caller_split, inputs=[split], outputs=[index_example, prompt, completion])
    index_example.release(identity, inputs=[index_example, split], outputs=[prompt, completion])
    button.click(
        save,
        inputs=[index_example, language, split, translated_prompt, translated_completion],
        outputs=[index_example, prompt, completion, translated_prompt, translated_completion],
    )

demo.launch(debug=True)