add slider to Auto-Translate

#1
by Ali-C137 - opened
Files changed (1)
  1. app.py (+44 -264)
app.py CHANGED
@@ -1,200 +1,44 @@
 import os
 import time
-import uuid
 import random
-import datetime
 import pandas as pd
-from typing import Any, Dict, List, Optional, Union
-from pathlib import Path
-import tempfile
-import pyarrow as pa
-import pyarrow.parquet as pq
 
 import streamlit as st
+
 import huggingface_hub as hf
-from huggingface_hub import HfApi, login, CommitScheduler
+from huggingface_hub import login
+
+import datasets
 from datasets import load_dataset
+
 import openai
 from openai import OpenAI
 
+
 # File Path
-# DATA_PATH = "Dr-En-space-test.csv"
-# DATA_REPO = "M-A-D/dar-en-space-test"
-DATA_REPO = "M-A-D/DarijaBridge"
+DATA_PATH = "Dr-En-space-test.csv"
+DATA_REPO = "M-A-D/dar-en-space-test"
+
+st.set_page_config(layout="wide")
 
 api = hf.HfApi()
 access_token_write = "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"
 login(token=access_token_write)
-repo_id = "M-A-D/dar-en-space-test"
-
-st.set_page_config(layout="wide")
-
-# Initialize the ParquetScheduler
-class ParquetScheduler(CommitScheduler):
-    """
-    Usage: configure the scheduler with a repo id. Once started, you can add data to be uploaded to the Hub. 1 `.append`
-    call will result in 1 row in your final dataset.
-
-    ```py
-    # Start scheduler
-    >>> scheduler = ParquetScheduler(repo_id="my-parquet-dataset")
-
-    # Append some data to be uploaded
-    >>> scheduler.append({...})
-    >>> scheduler.append({...})
-    >>> scheduler.append({...})
-    ```
-
-    The scheduler will automatically infer the schema from the data it pushes.
-    Optionally, you can manually set the schema yourself:
-
-    ```py
-    >>> scheduler = ParquetScheduler(
-    ...     repo_id="my-parquet-dataset",
-    ...     schema={
-    ...         "prompt": {"_type": "Value", "dtype": "string"},
-    ...         "negative_prompt": {"_type": "Value", "dtype": "string"},
-    ...         "guidance_scale": {"_type": "Value", "dtype": "int64"},
-    ...         "image": {"_type": "Image"},
-    ...     },
-    ... )
-
-    See https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Value for the list of
-    possible values.
-    """
-
-    def __init__(
-        self,
-        *,
-        repo_id: str,
-        schema: Optional[Dict[str, Dict[str, str]]] = None,
-        every: Union[int, float] = 5,
-        path_in_repo: Optional[str] = "data",
-        repo_type: Optional[str] = "dataset",
-        revision: Optional[str] = None,
-        private: bool = False,
-        token: Optional[str] = None,
-        allow_patterns: Union[List[str], str, None] = None,
-        ignore_patterns: Union[List[str], str, None] = None,
-        hf_api: Optional[HfApi] = None,
-    ) -> None:
-        super().__init__(
-            repo_id=repo_id,
-            folder_path="dummy",  # not used by the scheduler
-            every=every,
-            path_in_repo=path_in_repo,
-            repo_type=repo_type,
-            revision=revision,
-            private=private,
-            token=token,
-            allow_patterns=allow_patterns,
-            ignore_patterns=ignore_patterns,
-            hf_api=hf_api,
-        )
-
-        self._rows: List[Dict[str, Any]] = []
-        self._schema = schema
-
-    def append(self, row: Dict[str, Any]) -> None:
-        """Add a new item to be uploaded."""
-        with self.lock:
-            self._rows.append(row)
-
-    def push_to_hub(self):
-        # Check for new rows to push
-        with self.lock:
-            rows = self._rows
-            self._rows = []
-        if not rows:
-            return
-        print(f"Got {len(rows)} item(s) to commit.")
-
-        # Load images + create 'features' config for datasets library
-        schema: Dict[str, Dict] = self._schema or {}
-        path_to_cleanup: List[Path] = []
-        for row in rows:
-            for key, value in row.items():
-                # Infer schema (for `datasets` library)
-                if key not in schema:
-                    schema[key] = _infer_schema(key, value)
-
-                # Load binary files if necessary
-                if schema[key]["_type"] in ("Image", "Audio"):
-                    # It's an image or audio: we load the bytes and remember to cleanup the file
-                    file_path = Path(value)
-                    if file_path.is_file():
-                        row[key] = {
-                            "path": file_path.name,
-                            "bytes": file_path.read_bytes(),
-                        }
-                        path_to_cleanup.append(file_path)
-
-        # Complete rows if needed
-        for row in rows:
-            for feature in schema:
-                if feature not in row:
-                    row[feature] = None
-
-        # Export items to Arrow format
-        table = pa.Table.from_pylist(rows)
-
-        # Add metadata (used by datasets library)
-        table = table.replace_schema_metadata(
-            {"huggingface": json.dumps({"info": {"features": schema}})}
-        )
-
-        # Write to parquet file
-        archive_file = tempfile.NamedTemporaryFile()
-        pq.write_table(table, archive_file.name)
-
-        # Upload
-        self.api.upload_file(
-            repo_id=self.repo_id,
-            repo_type=self.repo_type,
-            revision=self.revision,
-            path_in_repo=f"{uuid.uuid4()}.parquet",
-            path_or_fileobj=archive_file.name,
-        )
-        print(f"Commit completed.")
-
-        # Cleanup
-        archive_file.close()
-        for path in path_to_cleanup:
-            path.unlink(missing_ok=True)
-
-
-
-# Define the ParquetScheduler instance with your repo details
-scheduler = ParquetScheduler(repo_id=repo_id)
-
-
-# Function to append new translation data to the ParquetScheduler
-def append_translation_data(original, translation, translated, corrected=False):
-    data = {
-        "original": original,
-        "translation": translation,
-        "translated": translated,
-        "corrected": corrected,
-        "timestamp": datetime.datetime.utcnow().isoformat(),
-        "id": str(uuid.uuid4())  # Unique identifier for each translation
-    }
-    scheduler.append(data)
-
 
 # Load data
 def load_data():
     return pd.DataFrame(load_dataset(DATA_REPO,download_mode="force_redownload",split='test'))
 
-#def save_data(data):
-#    data.to_csv(DATA_PATH, index=False)
-#    # to_save = datasets.Dataset.from_pandas(data)
-#    api.upload_file(
-#        path_or_fileobj="./Dr-En-space-test.csv",
-#        path_in_repo="Dr-En-space-test.csv",
-#        repo_id=DATA_REPO,
-#        repo_type="dataset",
-#    )
-#    # to_save.push_to_hub(DATA_REPO)
+def save_data(data):
+    data.to_csv(DATA_PATH, index=False)
+    # to_save = datasets.Dataset.from_pandas(data)
+    api.upload_file(
+        path_or_fileobj="./Dr-En-space-test.csv",
+        path_in_repo="Dr-En-space-test.csv",
+        repo_id=DATA_REPO,
+        repo_type="dataset",
+    )
+    # to_save.push_to_hub(DATA_REPO)
 
 def skip_correction():
     noncorrected_sentences = st.session_state.data[(st.session_state.data.translated == True) & (st.session_state.data.corrected == False)]['sentence'].tolist()
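A note on the hunk above: both sides of the diff keep the write token hardcoded in app.py. A minimal sketch of the same login step with the token read from a Space secret instead; the secret name `HF_TOKEN` is an assumption, and relies on the fact that Spaces expose secrets to the app as environment variables:

```py
import os

import streamlit as st
from huggingface_hub import login

# Assumption: a Space secret named HF_TOKEN is configured in the
# Space settings; secrets are injected as environment variables.
access_token_write = os.environ.get("HF_TOKEN")
if not access_token_write:
    st.error("HF_TOKEN is not set; add it as a Space secret.")
    st.stop()

login(token=access_token_write)
```

This keeps the rest of the file unchanged: `api = hf.HfApi()` and the later `upload_file` calls pick up the logged-in credentials as before.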
@@ -205,22 +49,7 @@ def skip_correction():
         st.session_state.orig_sentence = "No more sentences to be corrected"
         st.session_state.orig_translation = "No more sentences to be corrected"
 
-st.title("""
-Darija Translation Corpus Collection
-
-**What This Space Is For:**
-- **Translating Darija to English:** Add your translations here.
-- **Correcting Translations:** Review and correct existing translations.
-- **Using GPT-4 for Auto-Translation:** Try auto-translating Darija sentences.
-- **Helping Develop Darija Language Resources:** Your translations make a difference.
-
-**How to Contribute:**
-- **Choose a Tab:** Translation, Correction, or Auto-Translate.
-- **Add or Correct Translations:** Use text areas to enter translations.
-- **Save Your Work:** Click 'Save' to submit.
-
-**Every Contribution Counts! Let's make Darija GREAT!**
-""")
+st.title("Darija Translation Corpus Collection")
 
 if "data" not in st.session_state:
     st.session_state.data = load_data()
@@ -247,29 +76,11 @@ if "user_translation" not in st.session_state:
     st.session_state.user_translation = ""
 
 
-# with st.sidebar:
-#     st.subheader("About")
-#     st.markdown("""This is app is designed to collect Darija translation corpus.""")
-
-# with st.sidebar:
-#     st.subheader("About")
-#     st.markdown("""
-#     ### Darija Translation Corpus Collection
-
-#     **What This Space Is For:**
-#     - **Translating Darija to English:** Add your translations here.
-#     - **Correcting Translations:** Review and correct existing translations.
-#     - **Using GPT-4 for Auto-Translation:** Try auto-translating Darija sentences.
-#     - **Helping Develop Darija Language Resources:** Your translations make a difference.
-
-#     **How to Contribute:**
-#     - **Choose a Tab:** Translation, Correction, or Auto-Translate.
-#     - **Add or Correct Translations:** Use text areas to enter translations.
-#     - **Save Your Work:** Click 'Save' to submit.
-
-#     **Every Contribution Counts! Let's make Darija GREAT!**
-#     """)
+with st.sidebar:
+    st.subheader("About")
+    st.markdown("""This is app is designed to collect Darija translation corpus.""")
+
+# tab1, tab2 = st.tabs(["Translation", "Correction"])
 tab1, tab2, tab3 = st.tabs(["Translation", "Correction", "Auto-Translate"])
 
 with tab1:
@@ -284,13 +95,12 @@ with tab1:
 
     if st.button("💾 Save"):
         if st.session_state.user_translation:
-            # Append data to be saved
-            append_translation_data(
-                original=st.session_state.sentence,
-                translation=st.session_state.user_translation,
-                translated=True
-            )
-            st.session_state.user_translation = ""
+            st.session_state.data.loc[st.session_state.data['sentence'] == st.session_state.sentence, 'translation'] = st.session_state.user_translation
+            st.session_state.data.loc[st.session_state.data['sentence'] == st.session_state.sentence, 'translated'] = True
+            save_data(st.session_state.data)
+
+            st.session_state.user_translation = ""  # Reset the input value after saving
+
             # st.toast("Saved!", icon="👏")
             st.success("Saved!")
 
@@ -306,7 +116,6 @@ with tab1:
         # Rerun the app
         st.rerun()
 
-
 with tab2:
     with st.container():
         st.subheader("Original Darija Text:")
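For context on the `st.rerun()` in the hunk above (unchanged by this PR): the app advances to the next sentence by mutating `st.session_state` and then forcing a rerun so the widgets redraw. A minimal sketch of that Streamlit pattern, with hypothetical labels and a trimmed-down `skip_correction`:

```py
import streamlit as st

def skip_correction():
    # Callbacks run before the script re-executes, so state set here
    # is already in place when the widgets are redrawn.
    st.session_state.orig_sentence = "No more sentences to be corrected"

# on_click registers the callback; Streamlit reruns automatically after it.
st.button("⏩ Skip", on_click=skip_correction)

if st.button("💾 Save"):
    st.success("Saved!")
    st.rerun()  # explicit rerun so the next sentence is drawn immediately
```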
@@ -321,13 +130,11 @@ with tab2:
 
     if st.button("💾 Save Translation"):
         if corrected_translation:
-            # Append data to be saved
-            append_translation_data(
-                original=st.session_state.orig_sentence,
-                translation=corrected_translation,
-                translated=True,
-                corrected=True
-            )
+            st.session_state.data.loc[st.session_state.data['sentence'] == st.session_state.orig_sentence, 'translation'] = corrected_translation
+            st.session_state.data.loc[st.session_state.data['sentence'] == st.session_state.orig_sentence, 'correction'] = corrected_translation
+            st.session_state.data.loc[st.session_state.data['sentence'] == st.session_state.orig_sentence, 'corrected'] = True
+            save_data(st.session_state.data)
+
             st.success("Saved!")
 
             # Update the sentence for the next iteration.
@@ -349,15 +156,8 @@ with tab3:
 
     # User input for OpenAI API key
     openai_api_key = st.text_input("Paste your OpenAI API key:")
-
-    # Slider for the user to choose the number of samples to translate
-    num_samples = st.slider("Select the number of samples to translate", min_value=1, max_value=100, value=10)
-
-    # Estimated cost display
-    cost = num_samples * 0.0012
-    st.write(f"The estimated cost for translating {num_samples} samples is: ${cost:.4f}")
-
-    if st.button("Do the MAGIC with Auto-Translate ✨"):
+
+    if st.button("Auto-Translate 10 Samples"):
        if openai_api_key:
            openai.api_key = openai_api_key
 
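On the removed block: the slider value was never wired into the sampling, which stays hard-coded at `st.session_state.data.sample(10)` further down. If the slider comes back, the wiring would presumably look like this sketch; the $0.0012-per-sample figure is carried over from the removed line, not re-estimated:

```py
import streamlit as st

num_samples = st.slider(
    "Select the number of samples to translate",
    min_value=1, max_value=100, value=10,
)

# Cost figure reused from the removed line; treat it as a rough estimate.
st.write(f"The estimated cost for translating {num_samples} samples is: ${num_samples * 0.0012:.4f}")

if st.button("Do the MAGIC with Auto-Translate ✨"):
    # Pass the slider value through instead of the hard-coded 10.
    # st.session_state.data is the dataframe loaded by load_data().
    samples_to_translate = st.session_state.data.sample(num_samples)["sentence"].tolist()
```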
@@ -369,22 +169,9 @@ with tab3:
            # Get 10 samples from the dataset for translation
            samples_to_translate = st.session_state.data.sample(10)['sentence'].tolist()
 
-            # # System prompt for translation assistant
-            # translation_prompt = """
-            # You are a helpful AI-powered translation assistant designed for users seeking reliable translation assistance. Your primary function is to provide context-aware translations from Moroccan Arabic (Darija) to English.
-            # """
-
-            # auto_translations = []
-
-            # for sentence in samples_to_translate:
-            #     # Create messages for the chat model
-            #     messages = [
-            #         {"role": "system", "content": translation_prompt},
-            #         {"role": "user", "content": f"Translate the following sentence to English: '{sentence}'"}
-            #     ]
            # System prompt for translation assistant
-            translation_system_prompt = """
-            You are a native speaker of both Moroccan Arabic (Darija) and English. You are an expert of translations from Moroccan Arabic (Darija) into English.
+            translation_prompt = """
+            You are a helpful AI-powered translation assistant designed for users seeking reliable translation assistance. Your primary function is to provide context-aware translations from Moroccan Arabic (Darija) to English.
            """
 
            auto_translations = []
@@ -392,8 +179,8 @@ with tab3:
            for sentence in samples_to_translate:
                # Create messages for the chat model
                messages = [
-                    {"role": "system", "content": translation_system_prompt},
-                    {"role": "user", "content": f"Translate the following sentence from Moroccan Arabic (Darija) to English, only return the translated sentence: '{sentence}'"}
+                    {"role": "system", "content": translation_prompt},
+                    {"role": "user", "content": f"Translate the following sentence to English: '{sentence}'"}
                ]
 
                # Perform automatic translation using OpenAI GPT-3.5-turbo model
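The completion request referenced by the comment above sits outside this hunk, so it is not visible in the diff. With the `OpenAI` client that app.py imports, it presumably looks like the following sketch; the model name `gpt-3.5-turbo` is taken from the comment, everything else is an assumption:

```py
from openai import OpenAI

client = OpenAI(api_key="sk-...")  # the key pasted into the text_input

messages = [
    {"role": "system", "content": "You translate Moroccan Arabic (Darija) to English."},
    {"role": "user", "content": "Translate the following sentence to English: '...'"},
]

response = client.chat.completions.create(model="gpt-3.5-turbo", messages=messages)
translation = response.choices[0].message.content.strip()
```

Note that the app sets the legacy module-level `openai.api_key`; the v1-style client above takes the key as a constructor argument instead.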
@@ -416,17 +203,10 @@ with tab3:
                'translation'
            ] = auto_translations
 
-            # Append data to be saved
-            append_translation_data(
-                original=st.session_state.orig_sentence,
-                translation=corrected_translation,
-                translated=True,
-                corrected=True
-            )
-
+            # Save the updated dataset
+            save_data(st.session_state.data)
 
            st.success("Auto-Translations saved!")
 
        else:
            st.warning("Please paste your OpenAI API key.")
-
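One caveat on the assignment at the top of this last hunk: if the selection (hidden outside the hunk) uses a boolean mask such as `isin(samples_to_translate)`, then `df.loc[mask, 'translation'] = auto_translations` fills the matched rows in dataframe order, while `.sample(10)` returned them in random order. A sketch of an order-independent assignment, with toy data; the `isin` mask is an assumption:

```py
import pandas as pd

df = pd.DataFrame({"sentence": ["s1", "s2", "s3"], "translation": [None, None, None]})
samples = ["s3", "s1"]               # order returned by .sample()
auto_translations = ["t3", "t1"]     # one translation per sampled sentence

# Align by sentence value rather than by row position.
mapping = dict(zip(samples, auto_translations))
mask = df["sentence"].isin(samples)
df.loc[mask, "translation"] = df.loc[mask, "sentence"].map(mapping)
print(df)
```

Here the right-hand side is a Series indexed like the masked rows, so pandas aligns on the index and each sentence gets its own translation regardless of sampling order.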
 