Yurii Paniv committed on
Commit
d36cdc2
1 Parent(s): cb99192

Add logging

Browse files
Files changed (2) hide show
  1. app.py +49 -2
  2. data_logger.py +41 -0
app.py CHANGED
@@ -13,6 +13,40 @@ from threading import Thread
13
  from torch import float16
14
  import spaces
15
  import huggingface_hub
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
 
18
  config = PeftConfig.from_pretrained("lang-uk/dragoman")
@@ -37,9 +71,12 @@ tokenizer = AutoTokenizer.from_pretrained(
37
 
38
  @spaces.GPU(duration=30)
39
  def translate(input_text):
40
- generated_text = ""
41
  input_text = input_text.strip()
42
 
 
 
 
43
  input_text = f"[INST] {input_text} [/INST]"
44
  inputs = tokenizer([input_text], return_tensors="pt").to(model.device)
45
 
@@ -78,6 +115,10 @@ desc_file = huggingface_hub.hf_hub_download("lang-uk/dragoman", "README.md")
78
  with open(desc_file, "r") as f:
79
  model_description = f.read()
80
  model_description = model_description[model_description.find("---", 1) + 5 :]
 
 
 
 
81
 
82
 
83
  iface = gr.Interface(
@@ -91,10 +132,16 @@ iface = gr.Interface(
91
  label="Translated sentence",
92
  ),
93
  examples=[
 
 
 
94
  [
95
  "ChatGPT (Chat Generative Pre-trained Transformer) is a chatbot developed by OpenAI and launched on November 30, 2022. Based on a large language model, it enables users to refine and steer a conversation towards a desired length, format, style, level of detail, and language. Successive prompts and replies, known as prompt engineering, are considered at each conversation stage as a context.[2] ",
 
 
96
  "who holds this neighborhood?",
97
- ]
 
98
  ],
99
  title="Dragoman: SOTA English-Ukrainian translation model",
100
  description='This demo contains a model from paper "Setting up the Data Printer with Improved English to Ukrainian Machine Translation", accepted to UNLP 2024 workshop at the LREC-COLING 2024 conference.',
 
13
  from torch import float16
14
  import spaces
15
  import huggingface_hub
16
+ from threading import Thread
17
+ from queue import Queue
18
+ from time import sleep
19
+ from os import getenv
20
+ from data_logger import log_data
21
+
22
+
23
def check_thread(logging_queue: Queue):
    """Background worker that batches and uploads logged user inputs.

    Builds the HF dataset logging callback from environment configuration,
    then loops forever: every 60 seconds it drains *logging_queue*, and if
    anything was collected, pushes the batch to the Hub. On failure the
    batch is re-queued so no samples are lost.
    """
    logging_callback = log_data(
        hf_token=getenv("HF_API_TOKEN"),
        dataset_name=getenv("OUTPUT_DATASET"),
        private=True,
    )
    while True:
        sleep(60)  # batch window: collect up to a minute of inputs per push
        batch = []
        while not logging_queue.empty():
            batch.append(logging_queue.get())

        if batch:
            try:
                logging_callback(batch)
            except Exception:
                # A bare `except:` would also swallow SystemExit/KeyboardInterrupt;
                # catch Exception only, and put items back so they retry next cycle.
                print(
                    "Error happened while pushing data to HF. Putting items back in queue..."
                )
                for item in batch:
                    logging_queue.put(item)


# Only start the logging thread when a token is configured
# (e.g. local development without logging credentials skips it).
if getenv("HF_API_TOKEN") is not None:
    log_queue = Queue()
    # daemon=True: this thread loops forever, so it must not block shutdown.
    t = Thread(target=check_thread, args=(log_queue,), daemon=True)
    t.start()
50
 
51
 
52
  config = PeftConfig.from_pretrained("lang-uk/dragoman")
 
71
 
72
  @spaces.GPU(duration=30)
73
  def translate(input_text):
74
+ # generated_text = ""
75
  input_text = input_text.strip()
76
 
77
+ if getenv("HF_API_TOKEN") is not None:
78
+ log_queue.put([input_text])
79
+
80
  input_text = f"[INST] {input_text} [/INST]"
81
  inputs = tokenizer([input_text], return_tensors="pt").to(model.device)
82
 
 
115
  with open(desc_file, "r") as f:
116
  model_description = f.read()
117
  model_description = model_description[model_description.find("---", 1) + 5 :]
118
+ model_description = (
119
+ """### By using this service, users are required to agree to the following terms: you agree that user input will be collected for future research and model improvements. \n\n"""
120
+ + model_description
121
+ )
122
 
123
 
124
  iface = gr.Interface(
 
132
  label="Translated sentence",
133
  ),
134
  examples=[
135
+ [
136
+ "How many leaves would it drop in a month of February in a non-leap year?",
137
+ ],
138
  [
139
  "ChatGPT (Chat Generative Pre-trained Transformer) is a chatbot developed by OpenAI and launched on November 30, 2022. Based on a large language model, it enables users to refine and steer a conversation towards a desired length, format, style, level of detail, and language. Successive prompts and replies, known as prompt engineering, are considered at each conversation stage as a context.[2] ",
140
+ ],
141
+ [
142
  "who holds this neighborhood?",
143
+ ],
144
+
145
  ],
146
  title="Dragoman: SOTA English-Ukrainian translation model",
147
  description='This demo contains a model from paper "Setting up the Data Printer with Improved English to Ukrainian Machine Translation", accepted to UNLP 2024 workshop at the LREC-COLING 2024 conference.',
data_logger.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from gradio import utils
2
+ import os
3
+ import csv
4
+ import huggingface_hub
5
+
6
+
7
def log_data(hf_token: str, dataset_name: str, private=True):
    """Prepare an HF dataset repo for collected samples and return a logger.

    Creates (or reuses) the dataset repository, clones it locally under
    ``flagged/``, and returns a callback that appends rows to a CSV file
    and pushes a commit for each batch.
    """
    repo_url = huggingface_hub.create_repo(
        repo_id=dataset_name,
        token=hf_token,
        private=private,
        repo_type="dataset",
        exist_ok=True,
    )
    local_dir = os.path.join("flagged", dataset_name)
    repo = huggingface_hub.Repository(
        local_dir=local_dir,
        clone_from=repo_url,
        use_auth_token=hf_token,
    )
    repo.git_pull(lfs=True)
    csv_path = os.path.join(local_dir, "dragoman_logs.csv")

    def log_function(data):
        """Append each row of *data* to the CSV, push, and return the row count."""
        # Sync with the remote first so the append lands on the latest state.
        repo.git_pull(lfs=True)

        with open(csv_path, "a", newline="", encoding="utf-8") as out:
            rows = csv.writer(out)
            for entry in data:
                rows.writerow(utils.sanitize_list_for_csv(entry))

        # Count data rows; the `- 1` mirrors the original accounting
        # (presumably allowing for a header line — verify against the CSV).
        with open(csv_path, "r", encoding="utf-8") as readback:
            line_count = sum(1 for _ in csv.reader(readback)) - 1

        repo.push_to_hub(commit_message="Flagged sample #{}".format(line_count))

        return line_count

    return log_function