vietdata committed on
Commit
b001ab7
1 Parent(s): 1adb751

first update

Browse files
Files changed (1) hide show
  1. app.py +131 -54
app.py CHANGED
@@ -3,14 +3,22 @@ from datasets import load_dataset, Dataset
3
  from collections import defaultdict
4
  import random
5
  import requests
6
- import os
 
 
 
7
  # Load the source dataset
8
  source_dataset = load_dataset("vietdata/eng_echo", split="train")
9
- source_texts = source_dataset["query"]
 
10
 
11
  # Initialize variables
12
- translations = defaultdict(list)
13
- processed_data = []
 
 
 
 
14
 
15
  def authenticate(user_id):
16
 
@@ -25,73 +33,120 @@ def authenticate(user_id):
25
 
26
  return response.status_code == 200
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  # Helper function to get the next text for translation
29
- def get_next_text(user_id):
30
- # Filter texts that already have 10 translations
31
- # eligible_texts = [text for text in source_texts if len(translations[text]) < 10]
32
- # if not eligible_texts:
33
- # return "All texts are fully translated."
34
-
35
- # Select a random eligible text for translation
36
- next_text = random.choice(source_texts)
37
  return next_text
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  # Function to handle translation submission
40
- def submit_translation(user_id, original_text, translation):
41
- # Check if text already has 10 translations
42
- if len(translations[original_text]) < 10:
43
- translations[original_text].append((user_id, translation))
 
44
 
45
- # Check if 100 texts have enough translations to save
46
- if len([t for t in translations if len(translations[t]) == 10]) >= 100:
47
- save_to_translated_echo()
 
48
 
49
- return "Translation submitted successfully."
50
- else:
51
- return "This text already has 10 translations. Please request a new text."
52
 
53
- # Function to save completed translations to 'translated_echo'
54
- def save_to_translated_echo():
55
- global translations, processed_data
56
 
57
- # Gather translations with exactly 10 versions
58
- completed_translations = [
59
- {"query": text, "translations": [t[1] for t in translations[text]]}
60
- for text in translations if len(translations[text]) == 10
61
- ]
62
 
63
- # Append to processed data
64
- processed_data.extend(completed_translations)
 
65
 
66
- # Reset translations
67
- translations = {text: val for text, val in translations.items() if len(val) < 10}
 
68
 
69
- # Convert to Hugging Face dataset format
70
- translated_dataset = Dataset.from_pandas(pd.DataFrame(processed_data))
 
71
 
 
 
 
 
 
 
 
 
 
 
72
  # Append to Hugging Face dataset (dummy function call)
 
73
  translated_dataset.push_to_hub("vietdata/translated_echo", split="train")
74
 
75
- import gradio as gr
 
 
 
 
 
 
76
 
77
- # Simulated user data for demonstration
78
- user_data = {"hello": "hello"}
79
 
80
  # Sample English text to translate
81
- english_text = "Translate this text to Vietnamese."
82
 
83
  # User session dictionary to store logged-in status
84
  user_sessions = {}
85
 
86
- def login(username, state):
87
  state[0] = username
88
-
 
89
  # Authenticate user
90
  if authenticate(username):
91
  #user_sessions[username] = True
92
- return f"Welcome, {username}!", gr.update(visible=False), gr.update(visible=True), get_next_text(username)
93
  else:
94
- return "Invalid username or password.", gr.update(visible=True), gr.update(visible=False), ""
95
 
96
  def logout(username):
97
  # Log out user and reset session
@@ -99,19 +154,28 @@ def logout(username):
99
  del user_sessions[username]
100
  return "Logged out. Please log in again.", gr.update(visible=True), gr.update(visible=False)
101
 
102
- def submit_translation(translation, state, job_input):
103
  try:
104
- submit_translation(state[0], job_input, translation)
105
- origin = job_input
106
  # Save the translation and provide feedback
107
- return f"""Translation of "{origin}" submitted: {translation}""", get_next_text(state[0])
 
108
  except Exception as e:
 
 
109
  print(e)
110
- return "Error please try submit again!", job_input
 
 
 
 
 
 
111
 
112
  # Define the Gradio interface
113
  with gr.Blocks() as demo:
114
  state = gr.State([None])
 
115
  # Login section
116
  with gr.Column(visible=True) as login_section:
117
  username_input = gr.Textbox(placeholder="Enter your token", label="Token ID")
@@ -120,18 +184,31 @@ with gr.Blocks() as demo:
120
 
121
  # Translation section (initially hidden)
122
  with gr.Column(visible=False) as translation_section:
123
- job_input = gr.Textbox(value=english_text, label="English Text", interactive=False)
124
- translation_input = gr.Textbox(placeholder="Enter your translation here", label="Your Translation")
125
- submit_button = gr.Button("Submit Translation")
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  translation_output = gr.Textbox(label="Submission Status", interactive=False)
127
  logout_button = gr.Button("Logout")
128
 
129
  # Button functions
130
  login_button.click(
131
- login, inputs=[username_input, state], outputs=[login_output, login_section, translation_section, job_input]
132
  )
133
  submit_button.click(
134
- submit_translation, inputs=[translation_input, state, job_input], outputs=[translation_output, job_input]
135
  )
136
  logout_button.click(
137
  logout, inputs=[username_input], outputs=[login_output, login_section, translation_section]
 
3
  from collections import defaultdict
4
  import random
5
  import requests
6
+ import os
7
+ from langdetect import detect
8
+ import pandas as pd
9
+
10
  # Load the source dataset
11
  source_dataset = load_dataset("vietdata/eng_echo", split="train")
12
+ eng_texts = list(set(source_dataset["query"] + source_dataset["positive"] + source_dataset["negative"]))
13
+ vi_texts = []
14
 
15
  # Initialize variables
16
+ envi_translations = []
17
+ vien_translations = []
18
+
19
+ trans2score = dict()
20
+ packages = [[0, "None", "None", 0, float('inf'), float("inf")]]
21
+ num = 1000
22
 
23
  def authenticate(user_id):
24
 
 
33
 
34
  return response.status_code == 200
35
 
36
def send_score(user_id, score):
    """Report a grade for ``user_id`` to the grading API, retrying on failure.

    Posts a JSON payload to the subnet-1 grade endpoint, authenticating with
    the key read from the ``ADMIN`` environment variable. Up to ten attempts
    are made, with no delay between them.

    Args:
        user_id: Token of the user being graded (sent as ``token``).
        score: Grade value to record (sent as ``grade``).

    Returns:
        True as soon as one attempt is answered with HTTP 200, False when
        all ten attempts fail.

    Raises:
        KeyError: If the ``ADMIN`` environment variable is not set.
    """
    url = "https://intern-api.imtaedu.com/api/subnets/1/grade"

    # The request is identical on every attempt, so build it once.
    # NOTE(review): the comment and both timestamps are fixed placeholder
    # values carried over unchanged -- confirm whether the API expects real
    # submission/grading times.
    payload = {
        "token": user_id,
        "comment": "Good job!",
        "grade": score,
        "submitted_at": "2021-01-01 00:00:00",
        "graded_at": "2021-01-01 00:00:00",
    }
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "X-Public-Api-Key": os.environ["ADMIN"],
    }

    for _ in range(10):
        try:
            # A timeout keeps a stalled connection from blocking the UI
            # handler indefinitely.
            response = requests.post(url, json=payload, headers=headers, timeout=10)
        except requests.RequestException as exc:
            # A dropped connection or timeout is just another failed
            # attempt; keep retrying instead of aborting the loop.
            print(exc)
            continue
        if response.status_code == 200:
            return True
        print(response)
    return False
60
+
61
  # Helper function to get the next text for translation
62
def get_next_en_text(user_id):
    """Return a randomly chosen English source text to translate.

    ``user_id`` is accepted so the call sites stay uniform, but it does not
    influence the choice.
    """
    return random.choice(eng_texts)
65
 
66
def get_next_package(user_id):
    """Pick the next stored translation package to hand out for review.

    Entries of the module-level ``packages`` list are lists whose
    second-to-last element is decremented here each time the package is
    handed out, and whose last element is compared against zero to decide
    whether the package is finished. ``packages[0]`` is the placeholder entry
    and is returned whenever no real package is available.

    Scanning starts at index 1. The first package that can still be handed
    out is returned after its hand-out counter is decremented. Packages whose
    last two elements are both zero are counted as finished; once ``num``
    finished packages have been seen before an available one,
    ``save_to_translated_echo()`` is called and the placeholder is returned.

    Args:
        user_id: Token of the requesting user. Currently unused (a filter
            that skipped the user's own packages is disabled in the original).

    Returns:
        A package list (the same object stored in ``packages``), the
        placeholder ``packages[0]``, or None when ``packages`` is empty.
    """
    if not packages:
        return None

    finished = 0
    for candidate in packages[1:]:
        if finished >= num:
            save_to_translated_echo()
            return packages[0]
        if candidate[-2] > 0:
            # Decrement the counter of the package actually handed out. The
            # original decremented packages[0], whose counter is infinite, so
            # no package ever ran out of hand-outs and nothing was ever saved.
            candidate[-2] -= 1
            return candidate
        if candidate[-2] == 0 and candidate[-1] == 0:
            finished += 1
    return packages[0]
82
+
83
  # Function to handle translation submission
84
def submit_translation(user_id, package, vi_translation, en_text, en_translation, vi_text):
    """Validate, score and record one submission from the translation form.

    The user's Vietnamese translation of ``en_text`` is stored as a new entry
    in the module-level ``packages`` list. The user's English translation is
    scored against the package under review, whose score and last counter are
    updated in place. Both resulting grades are then reported through
    ``send_score``.

    Args:
        user_id: Token of the submitting user.
        package: Single-element list; ``package[0]`` is the package under
            review. This function reads its elements 0 (author id), 1 and 2
            (the texts passed to the scorers), adds to element 3 and
            decrements the last element.
        vi_translation: The user's Vietnamese translation; must be non-empty
            and detected as Vietnamese.
        en_text: English text the user translated into Vietnamese.
        en_translation: The user's English translation. May be empty, in
            which case the language check is skipped.
        vi_text: Vietnamese text shown to the user; not used here.

    Raises:
        ValueError: If ``vi_translation`` is empty or not detected as
            Vietnamese, or ``en_translation`` is non-empty and not detected
            as English.
        RuntimeError: If reporting either grade fails.

    NOTE(review): ``gg_score`` and ``miner_score`` are not defined in the
    visible part of this file -- confirm they exist before relying on this.
    """
    # Validate with explicit raises: ``assert`` is stripped when Python runs
    # with -O, which would silently disable these checks.
    if vi_translation == "":
        raise ValueError("Vietnamese translation must not be empty")
    if detect(vi_translation) != "vi":
        gr.Warning("Bản dịch không phải tiếng Việt", duration=5)
        raise ValueError("Vietnamese translation was not detected as Vietnamese")

    if en_translation != "":
        # Detect once and reuse the result for both the log line and the check.
        detected_lang = detect(en_translation)
        if detected_lang != "en":
            print(en_translation, detected_lang)
            gr.Warning("Bản dịch không phải tiếng Anh", duration=5)
            raise ValueError("English translation was not detected as English")

    reviewed = package[0]

    first_score = gg_score(en_text, vi_translation, target="vi")
    second_score = miner_score(reviewed[1], en_translation)
    ref_score = gg_score(reviewed[2], en_translation, target="en")
    # Relative disagreement between the two scores of the English
    # translation; the 0.1 floor on the denominator avoids dividing by ~0.
    trust_score = 1 - abs(second_score - ref_score) / max((second_score + ref_score) / 2, 0.1)

    submitter_grade = first_score * trust_score * 0.5
    author_grade = second_score * trust_score * 0.05

    packages.append([user_id, en_text, vi_translation, submitter_grade, 10, 10])
    reviewed[3] += author_grade
    reviewed[-1] -= 1

    if not send_score(user_id, submitter_grade):
        raise RuntimeError("Failed to report the submitter's grade")
    # Author id 0 marks the placeholder package, which has nobody to grade.
    if reviewed[0] != 0 and not send_score(reviewed[0], author_grade):
        raise RuntimeError("Failed to report the package author's grade")
110
 
111
+ # Function to save completed translations to 'translated_echo'
112
def save_to_translated_echo():
    """Append the oldest ``num`` packages to the hub dataset and drop them.

    Loads the current ``vietdata/translated_echo`` train split (falling back
    to an empty frame when loading fails), appends the first four fields of
    the packages at positions ``1..num`` of the module-level ``packages``
    list, pushes the combined table back to the hub, and then removes those
    packages from the list. ``packages[0]`` (the placeholder entry) is neither
    saved nor removed.
    """
    import gc

    columns = ["user_id", "source", "target", "score"]

    try:
        old_dataset = load_dataset("vietdata/translated_echo", split="train").to_pandas()
    except Exception as exc:
        # NOTE(review): any load failure is treated as "no previous data", so
        # a transient error here makes the push below replace the existing
        # split -- consider narrowing this to the "dataset not found" case.
        print(exc)
        old_dataset = pd.DataFrame([], columns=columns)

    # Save exactly the entries that are removed below. The original saved
    # packages[:num], which included the placeholder row and missed the last
    # package that was then popped.
    batch = packages[1:num + 1]
    new_dataset = pd.DataFrame([entry[:4] for entry in batch], columns=columns)
    # ignore_index gives a clean RangeIndex; otherwise Dataset.from_pandas
    # stores the duplicated index as an extra "__index_level_0__" column.
    new_dataset = pd.concat([old_dataset, new_dataset], ignore_index=True)

    translated_dataset = Dataset.from_pandas(new_dataset)
    translated_dataset.push_to_hub("vietdata/translated_echo", split="train")

    # Release the (potentially large) tables before continuing.
    del new_dataset
    del old_dataset
    del translated_dataset
    gc.collect()

    # Slice deletion also copes with fewer than ``num`` stored packages,
    # where repeated pop(1) would raise IndexError.
    del packages[1:num + 1]
132
 
 
 
133
 
134
  # Sample English text to translate
135
+ english_text = None
136
 
137
  # User session dictionary to store logged-in status
138
  user_sessions = {}
139
 
140
def login(username, state, package):
    """Handle the login button: authenticate and load the first job.

    Args:
        username: Token typed into the login box.
        state: Single-element session list; ``state[0]`` is set to
            ``username``.
        package: Single-element session list; on successful authentication
            ``package[0]`` is set to the package returned by
            ``get_next_package``.

    Returns:
        A 5-tuple for the outputs wired to the login button: status message,
        update for the login section, update for the translation section,
        English text to translate, and the text taken from element 2 of the
        assigned package. The two texts are empty strings on failure.
    """
    state[0] = username

    if not authenticate(username):
        return "Invalid username or password.", gr.update(visible=True), gr.update(visible=False), "", ""

    # Draw a package only after authentication succeeds, so a rejected token
    # cannot use up a hand-out of a stored package.
    package[0] = get_next_package(user_id=username)
    # get_next_package returns None when nothing is stored at all.
    vi_text = package[0][2] if package[0] is not None else ""

    #user_sessions[username] = True
    return (
        f"Welcome, {username}!",
        gr.update(visible=False),
        gr.update(visible=True),
        get_next_en_text(username),
        vi_text,
    )
150
 
151
  def logout(username):
152
  # Log out user and reset session
 
154
  del user_sessions[username]
155
  return "Logged out. Please log in again.", gr.update(visible=True), gr.update(visible=False)
156
 
157
def press_submit_translation( state, package, vi_translation, en_input, en_translation, vi_input):
    """Handle the submit button: record the submission, then load a new job.

    Args:
        state: Single-element session list holding the user's token.
        package: Single-element session list holding the package under
            review; replaced with the next package on success.
        vi_translation: The user's Vietnamese translation of ``en_input``.
        en_input: English text currently shown to the user.
        en_translation: The user's English translation of ``vi_input``.
        vi_input: Vietnamese text currently shown to the user.

    Returns:
        A 5-tuple for the outputs wired to the submit button: status message,
        English text to show, Vietnamese text to show, and two empty strings
        that clear both translation boxes. When submitting or loading the
        next job fails, the texts currently shown are kept.
    """
    import traceback

    try:
        submit_translation(state[0], package, vi_translation, en_input, en_translation, vi_input)
        # Save the translation and provide feedback
        gr.Info("Submitted Succesfully")
    except Exception as e:
        print(traceback.format_exc())
        print(e)
        return "Error please try submit again!", en_input, vi_input, "", ""

    try:
        package[0] = get_next_package(user_id=state[0])
        return f"""Submitted Succesfully""", get_next_en_text(state[0]), package[0][2], "", ""
    except Exception:
        # Log the failure instead of swallowing it silently; a bare except
        # would also trap KeyboardInterrupt and SystemExit.
        print(traceback.format_exc())
        return "Failed to load new job, please reload page!", en_input, vi_input, "", ""
174
 
175
  # Define the Gradio interface
176
  with gr.Blocks() as demo:
177
  state = gr.State([None])
178
+ package = gr.State([None])
179
  # Login section
180
  with gr.Column(visible=True) as login_section:
181
  username_input = gr.Textbox(placeholder="Enter your token", label="Token ID")
 
184
 
185
  # Translation section (initially hidden)
186
  with gr.Column(visible=False) as translation_section:
187
+ with gr.Column() as en2vi:
188
+ gr.Markdown("### Dịch từ tiếng Anh sang tiếng Việt")
189
+ en_input = gr.Textbox(value=english_text, label="Văn bản tiếng Anh", interactive=False)
190
+ vi_translation_input = gr.Textbox(placeholder="Nhập bản dịch", label="Nhập bản dịch tiếng Việt")
191
+
192
+ with gr.Column() as en2vi:
193
+ gr.Markdown("### Dịch từ tiếng Việt sang tiếng Anh")
194
+ vi_input = gr.Textbox(value=english_text, label="Văn bản tiếng Việt", interactive=False)
195
+ en_translation_input = gr.Textbox(placeholder="Nhập bản dịch", label="Nhập bản dịch tiếng Anh")
196
+
197
+ # gr.Markdown("### Đây là văn bản máy dịch hay người dịch (kiểm tra độ tự nhiên của văn bản)")
198
+ # with gr.Row():
199
+ # eval_document = gr.Textbox(label="Văn bản", placeholder="Văn bản cần đánh giá", interactive=False)
200
+ # choice = gr.Radio(["Human-Written", "Machine-Translated"], label="How would you classify this response?")
201
+
202
+ submit_button = gr.Button("Submit")
203
  translation_output = gr.Textbox(label="Submission Status", interactive=False)
204
  logout_button = gr.Button("Logout")
205
 
206
  # Button functions
207
  login_button.click(
208
+ login, inputs=[username_input, state, package], outputs=[login_output, login_section, translation_section, en_input, vi_input]
209
  )
210
  submit_button.click(
211
+ press_submit_translation, inputs=[state, package, vi_translation_input, en_input, en_translation_input, vi_input], outputs=[translation_output, en_input, vi_input, vi_translation_input, en_translation_input]
212
  )
213
  logout_button.click(
214
  logout, inputs=[username_input], outputs=[login_output, login_section, translation_section]