add slider to Auto-Translate

#1
by Ali-C137 - opened
Files changed (1)
  1. app.py (+44 -264)
app.py CHANGED
@@ -1,200 +1,44 @@
 import os
 import time
-import uuid
 import random
-import datetime
 import pandas as pd
-from typing import Any, Dict, List, Optional, Union
-from pathlib import Path
-import tempfile
-import pyarrow as pa
-import pyarrow.parquet as pq
 
 import streamlit as st
+
 import huggingface_hub as hf
-from huggingface_hub import HfApi, login, CommitScheduler
+from huggingface_hub import login
+
+import datasets
 from datasets import load_dataset
+
 import openai
 from openai import OpenAI
 
+
 # File Path
-# DATA_PATH = "Dr-En-space-test.csv"
-# DATA_REPO = "M-A-D/dar-en-space-test"
-DATA_REPO = "M-A-D/DarijaBridge"
+DATA_PATH = "Dr-En-space-test.csv"
+DATA_REPO = "M-A-D/dar-en-space-test"
+
+st.set_page_config(layout="wide")
 
 api = hf.HfApi()
 access_token_write = "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"
 login(token=access_token_write)
-repo_id = "M-A-D/dar-en-space-test"
-
-st.set_page_config(layout="wide")
-
-# Initialize the ParquetScheduler
-class ParquetScheduler(CommitScheduler):
-    """
-    Usage: configure the scheduler with a repo id. Once started, you can add data to be uploaded to the Hub. 1 `.append`
-    call will result in 1 row in your final dataset.
-
-    ```py
-    # Start scheduler
-    >>> scheduler = ParquetScheduler(repo_id="my-parquet-dataset")
-
-    # Append some data to be uploaded
-    >>> scheduler.append({...})
-    >>> scheduler.append({...})
-    >>> scheduler.append({...})
-    ```
-
-    The scheduler will automatically infer the schema from the data it pushes.
-    Optionally, you can manually set the schema yourself:
-
-    ```py
-    >>> scheduler = ParquetScheduler(
-    ...     repo_id="my-parquet-dataset",
-    ...     schema={
-    ...         "prompt": {"_type": "Value", "dtype": "string"},
-    ...         "negative_prompt": {"_type": "Value", "dtype": "string"},
-    ...         "guidance_scale": {"_type": "Value", "dtype": "int64"},
-    ...         "image": {"_type": "Image"},
-    ...     },
-    ... )
-
-    See https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Value for the list of
-    possible values.
-    """
-
-    def __init__(
-        self,
-        *,
-        repo_id: str,
-        schema: Optional[Dict[str, Dict[str, str]]] = None,
-        every: Union[int, float] = 5,
-        path_in_repo: Optional[str] = "data",
-        repo_type: Optional[str] = "dataset",
-        revision: Optional[str] = None,
-        private: bool = False,
-        token: Optional[str] = None,
-        allow_patterns: Union[List[str], str, None] = None,
-        ignore_patterns: Union[List[str], str, None] = None,
-        hf_api: Optional[HfApi] = None,
-    ) -> None:
-        super().__init__(
-            repo_id=repo_id,
-            folder_path="dummy",  # not used by the scheduler
-            every=every,
-            path_in_repo=path_in_repo,
-            repo_type=repo_type,
-            revision=revision,
-            private=private,
-            token=token,
-            allow_patterns=allow_patterns,
-            ignore_patterns=ignore_patterns,
-            hf_api=hf_api,
-        )
-
-        self._rows: List[Dict[str, Any]] = []
-        self._schema = schema
-
-    def append(self, row: Dict[str, Any]) -> None:
-        """Add a new item to be uploaded."""
-        with self.lock:
-            self._rows.append(row)
-
-    def push_to_hub(self):
-        # Check for new rows to push
-        with self.lock:
-            rows = self._rows
-            self._rows = []
-        if not rows:
-            return
-        print(f"Got {len(rows)} item(s) to commit.")
-
-        # Load images + create 'features' config for datasets library
-        schema: Dict[str, Dict] = self._schema or {}
-        path_to_cleanup: List[Path] = []
-        for row in rows:
-            for key, value in row.items():
-                # Infer schema (for `datasets` library)
-                if key not in schema:
-                    schema[key] = _infer_schema(key, value)
-
-                # Load binary files if necessary
-                if schema[key]["_type"] in ("Image", "Audio"):
-                    # It's an image or audio: we load the bytes and remember to cleanup the file
-                    file_path = Path(value)
-                    if file_path.is_file():
-                        row[key] = {
-                            "path": file_path.name,
-                            "bytes": file_path.read_bytes(),
-                        }
-                        path_to_cleanup.append(file_path)
-
-        # Complete rows if needed
-        for row in rows:
-            for feature in schema:
-                if feature not in row:
-                    row[feature] = None
-
-        # Export items to Arrow format
-        table = pa.Table.from_pylist(rows)
-
-        # Add metadata (used by datasets library)
-        table = table.replace_schema_metadata(
-            {"huggingface": json.dumps({"info": {"features": schema}})}
-        )
-
-        # Write to parquet file
-        archive_file = tempfile.NamedTemporaryFile()
-        pq.write_table(table, archive_file.name)
-
-        # Upload
-        self.api.upload_file(
-            repo_id=self.repo_id,
-            repo_type=self.repo_type,
-            revision=self.revision,
-            path_in_repo=f"{uuid.uuid4()}.parquet",
-            path_or_fileobj=archive_file.name,
-        )
-        print(f"Commit completed.")
-
-        # Cleanup
-        archive_file.close()
-        for path in path_to_cleanup:
-            path.unlink(missing_ok=True)
-
-
-
-# Define the ParquetScheduler instance with your repo details
-scheduler = ParquetScheduler(repo_id=repo_id)
-
-
-# Function to append new translation data to the ParquetScheduler
-def append_translation_data(original, translation, translated, corrected=False):
-    data = {
-        "original": original,
-        "translation": translation,
-        "translated": translated,
-        "corrected": corrected,
-        "timestamp": datetime.datetime.utcnow().isoformat(),
-        "id": str(uuid.uuid4())  # Unique identifier for each translation
-    }
-    scheduler.append(data)
-
 
 # Load data
 def load_data():
     return pd.DataFrame(load_dataset(DATA_REPO,download_mode="force_redownload",split='test'))
 
-#def save_data(data):
-#    data.to_csv(DATA_PATH, index=False)
-#    # to_save = datasets.Dataset.from_pandas(data)
-#    api.upload_file(
-#        path_or_fileobj="./Dr-En-space-test.csv",
-#        path_in_repo="Dr-En-space-test.csv",
-#        repo_id=DATA_REPO,
-#        repo_type="dataset",
-#    )
-#    # to_save.push_to_hub(DATA_REPO)
+def save_data(data):
+    data.to_csv(DATA_PATH, index=False)
+    # to_save = datasets.Dataset.from_pandas(data)
+    api.upload_file(
+        path_or_fileobj="./Dr-En-space-test.csv",
+        path_in_repo="Dr-En-space-test.csv",
+        repo_id=DATA_REPO,
+        repo_type="dataset",
+    )
+    # to_save.push_to_hub(DATA_REPO)
 
 def skip_correction():
     noncorrected_sentences = st.session_state.data[(st.session_state.data.translated == True) & (st.session_state.data.corrected == False)]['sentence'].tolist()
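A note on the hunk above: both sides of the diff keep the write token hardcoded in app.py. A minimal sketch of the same login step with the token read from a Space secret instead; the secret name `HF_TOKEN` is an assumption, and relies on the fact that Spaces expose secrets to the app as environment variables:

```py
import os

import streamlit as st
from huggingface_hub import login

# Assumption: a Space secret named HF_TOKEN is configured in the
# Space settings; secrets are injected as environment variables.
access_token_write = os.environ.get("HF_TOKEN")
if not access_token_write:
    st.error("HF_TOKEN is not set; add it as a Space secret.")
    st.stop()

login(token=access_token_write)
```

This keeps the rest of the file unchanged: `api = hf.HfApi()` and the later `upload_file` calls pick up the logged-in credentials as before.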
@@ -205,22 +49,7 @@ def skip_correction():
         st.session_state.orig_sentence = "No more sentences to be corrected"
         st.session_state.orig_translation = "No more sentences to be corrected"
 
-st.title("""
-Darija Translation Corpus Collection
-
-**What This Space Is For:**
-- **Translating Darija to English:** Add your translations here.
-- **Correcting Translations:** Review and correct existing translations.
-- **Using GPT-4 for Auto-Translation:** Try auto-translating Darija sentences.
-- **Helping Develop Darija Language Resources:** Your translations make a difference.
-
-**How to Contribute:**
-- **Choose a Tab:** Translation, Correction, or Auto-Translate.
-- **Add or Correct Translations:** Use text areas to enter translations.
-- **Save Your Work:** Click 'Save' to submit.
-
-**Every Contribution Counts! Let's make Darija GREAT!**
-""")
+st.title("Darija Translation Corpus Collection")
 
 if "data" not in st.session_state:
     st.session_state.data = load_data()
@@ -247,29 +76,11 @@ if "user_translation" not in st.session_state:
     st.session_state.user_translation = ""
 
 
-# with st.sidebar:
-#     st.subheader("About")
-#     st.markdown("""This is app is designed to collect Darija translation corpus.""")
-
-# with st.sidebar:
-#     st.subheader("About")
-#     st.markdown("""
-#     ### Darija Translation Corpus Collection
-
-#     **What This Space Is For:**
-#     - **Translating Darija to English:** Add your translations here.
-#     - **Correcting Translations:** Review and correct existing translations.
-#     - **Using GPT-4 for Auto-Translation:** Try auto-translating Darija sentences.
-#     - **Helping Develop Darija Language Resources:** Your translations make a difference.
-
-#     **How to Contribute:**
-#     - **Choose a Tab:** Translation, Correction, or Auto-Translate.
-#     - **Add or Correct Translations:** Use text areas to enter translations.
-#     - **Save Your Work:** Click 'Save' to submit.
-
-#     **Every Contribution Counts! Let's make Darija GREAT!**
-#     """)
+with st.sidebar:
+    st.subheader("About")
+    st.markdown("""This is app is designed to collect Darija translation corpus.""")
+
+# tab1, tab2 = st.tabs(["Translation", "Correction"])
 tab1, tab2, tab3 = st.tabs(["Translation", "Correction", "Auto-Translate"])
 
 with tab1:
@@ -284,13 +95,12 @@ with tab1:
 
     if st.button("💾 Save"):
         if st.session_state.user_translation:
-            # Append data to be saved
-            append_translation_data(
-                original=st.session_state.sentence,
-                translation=st.session_state.user_translation,
-                translated=True
-            )
-            st.session_state.user_translation = ""
+            st.session_state.data.loc[st.session_state.data['sentence'] == st.session_state.sentence, 'translation'] = st.session_state.user_translation
+            st.session_state.data.loc[st.session_state.data['sentence'] == st.session_state.sentence, 'translated'] = True
+            save_data(st.session_state.data)
+
+            st.session_state.user_translation = ""  # Reset the input value after saving
+
             # st.toast("Saved!", icon="👏")
             st.success("Saved!")
 
@@ -306,7 +116,6 @@ with tab1:
         # Rerun the app
         st.rerun()
 
-
 with tab2:
     with st.container():
         st.subheader("Original Darija Text:")
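For context on the `st.rerun()` in the hunk above (unchanged by this PR): the app advances to the next sentence by mutating `st.session_state` and then forcing a rerun so the widgets redraw. A minimal sketch of that Streamlit pattern, with hypothetical labels and a trimmed-down `skip_correction`:

```py
import streamlit as st

def skip_correction():
    # Callbacks run before the script re-executes, so state set here
    # is already in place when the widgets are redrawn.
    st.session_state.orig_sentence = "No more sentences to be corrected"

# on_click registers the callback; Streamlit reruns automatically after it.
st.button("⏩ Skip", on_click=skip_correction)

if st.button("💾 Save"):
    st.success("Saved!")
    st.rerun()  # explicit rerun so the next sentence is drawn immediately
```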
@@ -321,13 +130,11 @@ with tab2:
 
     if st.button("💾 Save Translation"):
         if corrected_translation:
-            # Append data to be saved
-            append_translation_data(
-                original=st.session_state.orig_sentence,
-                translation=corrected_translation,
-                translated=True,
-                corrected=True
-            )
+            st.session_state.data.loc[st.session_state.data['sentence'] == st.session_state.orig_sentence, 'translation'] = corrected_translation
+            st.session_state.data.loc[st.session_state.data['sentence'] == st.session_state.orig_sentence, 'correction'] = corrected_translation
+            st.session_state.data.loc[st.session_state.data['sentence'] == st.session_state.orig_sentence, 'corrected'] = True
+            save_data(st.session_state.data)
+
             st.success("Saved!")
 
             # Update the sentence for the next iteration.
@@ -349,15 +156,8 @@ with tab3:
 
     # User input for OpenAI API key
     openai_api_key = st.text_input("Paste your OpenAI API key:")
-
-    # Slider for the user to choose the number of samples to translate
-    num_samples = st.slider("Select the number of samples to translate", min_value=1, max_value=100, value=10)
-
-    # Estimated cost display
-    cost = num_samples * 0.0012
-    st.write(f"The estimated cost for translating {num_samples} samples is: ${cost:.4f}")
-
-    if st.button("Do the MAGIC with Auto-Translate ✨"):
+
+    if st.button("Auto-Translate 10 Samples"):
        if openai_api_key:
            openai.api_key = openai_api_key
 
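On the removed block: the slider value was never wired into the sampling, which stays hard-coded at `st.session_state.data.sample(10)` further down. If the slider comes back, the wiring would presumably look like this sketch; the $0.0012-per-sample figure is carried over from the removed line, not re-estimated:

```py
import streamlit as st

num_samples = st.slider(
    "Select the number of samples to translate",
    min_value=1, max_value=100, value=10,
)

# Cost figure reused from the removed line; treat it as a rough estimate.
st.write(f"The estimated cost for translating {num_samples} samples is: ${num_samples * 0.0012:.4f}")

if st.button("Do the MAGIC with Auto-Translate ✨"):
    # Pass the slider value through instead of the hard-coded 10.
    # st.session_state.data is the dataframe loaded by load_data().
    samples_to_translate = st.session_state.data.sample(num_samples)["sentence"].tolist()
```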
@@ -369,22 +169,9 @@ with tab3:
            # Get 10 samples from the dataset for translation
            samples_to_translate = st.session_state.data.sample(10)['sentence'].tolist()
 
-            # # System prompt for translation assistant
-            # translation_prompt = """
-            # You are a helpful AI-powered translation assistant designed for users seeking reliable translation assistance. Your primary function is to provide context-aware translations from Moroccan Arabic (Darija) to English.
-            # """
-
-            # auto_translations = []
-
-            # for sentence in samples_to_translate:
-            #     # Create messages for the chat model
-            #     messages = [
-            #         {"role": "system", "content": translation_prompt},
-            #         {"role": "user", "content": f"Translate the following sentence to English: '{sentence}'"}
-            #     ]
            # System prompt for translation assistant
-            translation_system_prompt = """
-            You are a native speaker of both Moroccan Arabic (Darija) and English. You are an expert of translations from Moroccan Arabic (Darija) into English.
+            translation_prompt = """
+            You are a helpful AI-powered translation assistant designed for users seeking reliable translation assistance. Your primary function is to provide context-aware translations from Moroccan Arabic (Darija) to English.
            """
 
            auto_translations = []
@@ -392,8 +179,8 @@ with tab3:
            for sentence in samples_to_translate:
                # Create messages for the chat model
                messages = [
-                    {"role": "system", "content": translation_system_prompt},
-                    {"role": "user", "content": f"Translate the following sentence from Moroccan Arabic (Darija) to English, only return the translated sentence: '{sentence}'"}
+                    {"role": "system", "content": translation_prompt},
+                    {"role": "user", "content": f"Translate the following sentence to English: '{sentence}'"}
                ]
 
                # Perform automatic translation using OpenAI GPT-3.5-turbo model
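The completion request referenced by the comment above sits outside this hunk, so it is not visible in the diff. With the `OpenAI` client that app.py imports, it presumably looks like the following sketch; the model name `gpt-3.5-turbo` is taken from the comment, everything else is an assumption:

```py
from openai import OpenAI

client = OpenAI(api_key="sk-...")  # the key pasted into the text_input

messages = [
    {"role": "system", "content": "You translate Moroccan Arabic (Darija) to English."},
    {"role": "user", "content": "Translate the following sentence to English: '...'"},
]

response = client.chat.completions.create(model="gpt-3.5-turbo", messages=messages)
translation = response.choices[0].message.content.strip()
```

Note that the app sets the legacy module-level `openai.api_key`; the v1-style client above takes the key as a constructor argument instead.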
@@ -416,17 +203,10 @@ with tab3:
                'translation'
            ] = auto_translations
 
-            # Append data to be saved
-            append_translation_data(
-                original=st.session_state.orig_sentence,
-                translation=corrected_translation,
-                translated=True,
-                corrected=True
-            )
-
+            # Save the updated dataset
+            save_data(st.session_state.data)
 
            st.success("Auto-Translations saved!")
 
        else:
            st.warning("Please paste your OpenAI API key.")
-
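One caveat on the assignment at the top of this last hunk: if the selection (hidden outside the hunk) uses a boolean mask such as `isin(samples_to_translate)`, then `df.loc[mask, 'translation'] = auto_translations` fills the matched rows in dataframe order, while `.sample(10)` returned them in random order. A sketch of an order-independent assignment, with toy data; the `isin` mask is an assumption:

```py
import pandas as pd

df = pd.DataFrame({"sentence": ["s1", "s2", "s3"], "translation": [None, None, None]})
samples = ["s3", "s1"]               # order returned by .sample()
auto_translations = ["t3", "t1"]     # one translation per sampled sentence

# Align by sentence value rather than by row position.
mapping = dict(zip(samples, auto_translations))
mask = df["sentence"].isin(samples)
df.loc[mask, "translation"] = df.loc[mask, "sentence"].map(mapping)
print(df)
```

Here the right-hand side is a Series indexed like the masked rows, so pandas aligns on the index and each sentence gets its own translation regardless of sampling order.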
 