import streamlit as st
from datasets import load_dataset
import random
import json
import os
from pathlib import Path
from huggingface_hub import HfApi, CommitScheduler
import uuid

HF_API_KEY = os.environ.get("HF_TOKEN", None)
api = HfApi(token=HF_API_KEY)

REPO_ID = "imomayiz/darija-english"
DATASET_REPO_URL = f"https://huggingface.co/datasets/{REPO_ID}"

# Each session writes to its own file so concurrent users never share a file
submissions_folder = "submissions"
submissions_file = Path(submissions_folder) / f"submissions_{uuid.uuid4()}.json"
os.makedirs(submissions_folder, exist_ok=True)

# Commit the contents of the submissions folder to the dataset repo every
# minute; uploads are batched, so no per-file api.upload_file calls are needed
scheduler = CommitScheduler(
    token=HF_API_KEY,
    repo_id=REPO_ID,
    repo_type="dataset",
    folder_path=submissions_folder,
    path_in_repo="submissions",
    every=1,
)


def load_data(repo_id):
    dataset = load_dataset(repo_id, name="sentences", split="sentences")
    return dataset


def fetch_sentence(dataset, column_name="darija_ar"):
    # Pick a random sentence and reset both translation inputs
    random_sentence_index = random.randint(0, len(dataset) - 1)
    random_sentence = dataset[random_sentence_index][column_name]
    st.session_state.sentence = random_sentence
    st.session_state.translation_input = ""
    st.session_state.translation_input_fr = ""
    return random_sentence


def store_submission(api: HfApi, sentence: str, translation: str, translation_fr: str):
    """
    Append the sentence and its translations to a JSON Lines file,
    using the scheduler's lock to avoid concurrent writes from different users.
    """
    with scheduler.lock:
        with submissions_file.open("a", encoding="utf-8") as f:
            f.write(json.dumps({
                "darija": translation_fr,
                "eng": translation,
                "darija_ar": sentence,
            }))
            f.write("\n")
    st.success(
        f"""Translation submitted successfully to
        {DATASET_REPO_URL}/tree/main/{submissions_folder}"""
    )


# Load the dataset
dataset = load_data(REPO_ID)

# Initialize session state
if "sentence" not in st.session_state:
    st.session_state.sentence = fetch_sentence(dataset)
if "translation_input" not in st.session_state:
    st.session_state.translation_input = ""
if "translation_input_fr" not in st.session_state:
    st.session_state.translation_input_fr = ""
if "display_new" not in st.session_state:
    st.session_state.display_new = False

st.title("Translate From Arabic to English")

st.markdown(
    """This mini-app allows you to contribute to the **darija-english** dataset
    as part of the [DODa](https://darija-open-dataset.github.io/) project.

    To contribute, simply translate the given sentence from Arabic to English.
    The translated sentence will be submitted to the dataset
    [here](https://huggingface.co/datasets/imomayiz/darija-english)."""
)

st.divider()

st.write(f"""

{st.session_state.sentence}.

""", unsafe_allow_html=True) # Display new sentence button st.session_state.display_new = st.button("New Sentence", on_click=fetch_sentence, args=(dataset,)) # Input field for translation translation_input_placeholder = st.empty() with translation_input_placeholder.container(): translation_input = st.text_input("Enter translation to english: ", st.session_state.translation_input) st.session_state.translation_input = translation_input # Input field for translation translation_input_placeholder_fr = st.empty() with translation_input_placeholder_fr.container(): translation_input_fr = st.text_input( "Enter translation to darija in latin characters: ", st.session_state.translation_input_fr ) st.session_state.translation_input_fr = translation_input_fr if st.button("Submit Translation"): if not st.session_state.translation_input_fr or st.session_state.translation_input: st.warning("Please enter a translation before submitting.") else: store_submission(api, st.session_state.sentence, st.session_state.translation_input, st.session_state.translation_input_fr )