Yurii Paniv committed on
Commit
d36cdc2
1 Parent(s): cb99192

Add logging

Browse files
Files changed (2) hide show
  1. app.py +49 -2
  2. data_logger.py +41 -0
app.py CHANGED
@@ -13,6 +13,40 @@ from threading import Thread
13
  from torch import float16
14
  import spaces
15
  import huggingface_hub
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
 
18
  config = PeftConfig.from_pretrained("lang-uk/dragoman")
@@ -37,9 +71,12 @@ tokenizer = AutoTokenizer.from_pretrained(
37
 
38
  @spaces.GPU(duration=30)
39
  def translate(input_text):
40
- generated_text = ""
41
  input_text = input_text.strip()
42
 
 
 
 
43
  input_text = f"[INST] {input_text} [/INST]"
44
  inputs = tokenizer([input_text], return_tensors="pt").to(model.device)
45
 
@@ -78,6 +115,10 @@ desc_file = huggingface_hub.hf_hub_download("lang-uk/dragoman", "README.md")
78
  with open(desc_file, "r") as f:
79
  model_description = f.read()
80
  model_description = model_description[model_description.find("---", 1) + 5 :]
 
 
 
 
81
 
82
 
83
  iface = gr.Interface(
@@ -91,10 +132,16 @@ iface = gr.Interface(
91
  label="Translated sentence",
92
  ),
93
  examples=[
 
 
 
94
  [
95
  "ChatGPT (Chat Generative Pre-trained Transformer) is a chatbot developed by OpenAI and launched on November 30, 2022. Based on a large language model, it enables users to refine and steer a conversation towards a desired length, format, style, level of detail, and language. Successive prompts and replies, known as prompt engineering, are considered at each conversation stage as a context.[2] ",
 
 
96
  "who holds this neighborhood?",
97
- ]
 
98
  ],
99
  title="Dragoman: SOTA English-Ukrainian translation model",
100
  description='This demo contains a model from paper "Setting up the Data Printer with Improved English to Ukrainian Machine Translation", accepted to UNLP 2024 workshop at the LREC-COLING 2024 conference.',
 
13
  from torch import float16
14
  import spaces
15
  import huggingface_hub
16
+ from threading import Thread
17
+ from queue import Queue
18
+ from time import sleep
19
+ from os import getenv
20
+ from data_logger import log_data
21
+
22
+
23
def check_thread(logging_queue: Queue):
    """Background worker that batches and uploads logged user inputs.

    Builds the HF dataset logging callback from environment configuration,
    then loops forever: every 60 seconds it drains *logging_queue*, and if
    anything was collected, pushes the batch to the Hub. On failure the
    batch is re-queued so no samples are lost.
    """
    logging_callback = log_data(
        hf_token=getenv("HF_API_TOKEN"),
        dataset_name=getenv("OUTPUT_DATASET"),
        private=True,
    )
    while True:
        sleep(60)  # batch window: collect up to a minute of inputs per push
        batch = []
        while not logging_queue.empty():
            batch.append(logging_queue.get())

        if batch:
            try:
                logging_callback(batch)
            except Exception:
                # A bare `except:` would also swallow SystemExit/KeyboardInterrupt;
                # catch Exception only, and put items back so they retry next cycle.
                print(
                    "Error happened while pushing data to HF. Putting items back in queue..."
                )
                for item in batch:
                    logging_queue.put(item)


# Only start the logging thread when a token is configured
# (e.g. local development without logging credentials skips it).
if getenv("HF_API_TOKEN") is not None:
    log_queue = Queue()
    # daemon=True: this thread loops forever, so it must not block shutdown.
    t = Thread(target=check_thread, args=(log_queue,), daemon=True)
    t.start()
50
 
51
 
52
  config = PeftConfig.from_pretrained("lang-uk/dragoman")
 
71
 
72
  @spaces.GPU(duration=30)
73
  def translate(input_text):
74
+ # generated_text = ""
75
  input_text = input_text.strip()
76
 
77
+ if getenv("HF_API_TOKEN") is not None:
78
+ log_queue.put([input_text])
79
+
80
  input_text = f"[INST] {input_text} [/INST]"
81
  inputs = tokenizer([input_text], return_tensors="pt").to(model.device)
82
 
 
115
  with open(desc_file, "r") as f:
116
  model_description = f.read()
117
  model_description = model_description[model_description.find("---", 1) + 5 :]
118
+ model_description = (
119
+ """### By using this service, users are required to agree to the following terms: you agree that user input will be collected for future research and model improvements. \n\n"""
120
+ + model_description
121
+ )
122
 
123
 
124
  iface = gr.Interface(
 
132
  label="Translated sentence",
133
  ),
134
  examples=[
135
+ [
136
+ "How many leaves would it drop in a month of February in a non-leap year?",
137
+ ],
138
  [
139
  "ChatGPT (Chat Generative Pre-trained Transformer) is a chatbot developed by OpenAI and launched on November 30, 2022. Based on a large language model, it enables users to refine and steer a conversation towards a desired length, format, style, level of detail, and language. Successive prompts and replies, known as prompt engineering, are considered at each conversation stage as a context.[2] ",
140
+ ],
141
+ [
142
  "who holds this neighborhood?",
143
+ ],
144
+
145
  ],
146
  title="Dragoman: SOTA English-Ukrainian translation model",
147
  description='This demo contains a model from paper "Setting up the Data Printer with Improved English to Ukrainian Machine Translation", accepted to UNLP 2024 workshop at the LREC-COLING 2024 conference.',
data_logger.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from gradio import utils
2
+ import os
3
+ import csv
4
+ import huggingface_hub
5
+
6
+
7
def log_data(hf_token: str, dataset_name: str, private=True):
    """Prepare an HF dataset repo for collected samples and return a logger.

    Creates (or reuses) the dataset repository, clones it locally under
    ``flagged/``, and returns a callback that appends rows to a CSV file
    and pushes a commit for each batch.
    """
    repo_url = huggingface_hub.create_repo(
        repo_id=dataset_name,
        token=hf_token,
        private=private,
        repo_type="dataset",
        exist_ok=True,
    )
    local_dir = os.path.join("flagged", dataset_name)
    repo = huggingface_hub.Repository(
        local_dir=local_dir,
        clone_from=repo_url,
        use_auth_token=hf_token,
    )
    repo.git_pull(lfs=True)
    csv_path = os.path.join(local_dir, "dragoman_logs.csv")

    def log_function(data):
        """Append each row of *data* to the CSV, push, and return the row count."""
        # Sync with the remote first so the append lands on the latest state.
        repo.git_pull(lfs=True)

        with open(csv_path, "a", newline="", encoding="utf-8") as out:
            rows = csv.writer(out)
            for entry in data:
                rows.writerow(utils.sanitize_list_for_csv(entry))

        # Count data rows; the `- 1` mirrors the original accounting
        # (presumably allowing for a header line — verify against the CSV).
        with open(csv_path, "r", encoding="utf-8") as readback:
            line_count = sum(1 for _ in csv.reader(readback)) - 1

        repo.push_to_hub(commit_message="Flagged sample #{}".format(line_count))

        return line_count

    return log_function