Kirili4ik committed
Commit 21a5dba
Parent: 5246f84

clean and make 6ep model

Files changed (2)
  1. app.py +18 -136
  2. util_funcs.py +114 -0
app.py CHANGED

@@ -1,118 +1,7 @@
 import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
-
-
-def get_length_param(text: str, tokenizer) -> str:
-    """Maps text to 1 of 4 buckets based on its length after encoding.
-
-    Parameters
-    ----------
-    text: str
-        The text to be given 1 of 4 length parameters.
-
-    tokenizer: HuggingFace tokenizer
-        Tokenizer used to compute the length of the text after encoding.
-        For more info see https://huggingface.co/transformers/main_classes/tokenizer.html
-
-    Returns
-    -------
-    len_param: str
-        One of four buckets:
-        '1' for short, '2' for medium, '3' for long texts and '-' for all others.
-    """
-    tokens_count = len(tokenizer.encode(text))
-    if tokens_count <= 15:
-        len_param = '1'
-    elif tokens_count <= 50:
-        len_param = '2'
-    elif tokens_count <= 256:
-        len_param = '3'
-    else:
-        len_param = '-'
-    return len_param
-
-
-def get_user_param(text: dict, machine_name_in_chat: str) -> str:
-    """Maps a message to '1'/'0' depending on whether the machine or a person sent it.
-
-    Parameters
-    ----------
-    text: Dict[..., 'from', ...]
-        Dict containing the field 'from' with the name of the user who sent the message
-
-    machine_name_in_chat: str
-        Name of the machine in the chat - its messages are the ones to be predicted
-    """
-    if text['from'] == machine_name_in_chat:
-        return '1'  # machine
-    else:
-        return '0'  # human
-
-
-def build_text_file(data_json: dict, dest_path: str,
-                    tokenizer, machine_name_in_chat='Кирилл Гельван'):
-    """Creates a text file for training in the special format for ruDialoGPT-3.
-
-    Parameters
-    ----------
-    data_json: dict
-        Dict containing 'text' (message) and 'from' (user who sent the message)
-
-    dest_path: str
-        String containing the path to write the data to
-
-    tokenizer: HuggingFace tokenizer
-        Tokenizer used to compute the length of the text after encoding.
-        For more info see https://huggingface.co/transformers/main_classes/tokenizer.html
-    """
-    f = open(dest_path, 'w')
-    new_data = ''
-    for i in range(len(data_json) - 1):
-        message, next_message = data_json[i], data_json[i+1]
-        if message['text'] == '' or type(message['text']) != str:
-            continue
-        if next_message['text'] == '' or type(next_message['text']) != str:
-            continue
-
-        user = get_user_param(message, machine_name_in_chat=machine_name_in_chat)
-        length = get_length_param(data_json[i+1]['text'], tokenizer)
-        message_text = re.sub(r"\n", ". ", message['text'])
-        new_data += f"|{user}|{length}|{message_text}{tokenizer.eos_token}" + "\n"
-
-    f.write(new_data)
-
-
-def load_dataset(train_path, test_path, tokenizer):
-    """Creates train and test PyTorch datasets and collate_fn using HuggingFace.
-
-    Parameters
-    ----------
-    train_path: str
-        String containing the path to the train data
-
-    test_path: str
-        String containing the path to the test data
-
-    tokenizer: HuggingFace tokenizer
-        Tokenizer used to compute the length of the text after encoding.
-        For more info see https://huggingface.co/transformers/main_classes/tokenizer.html
-    """
-    train_dataset = TextDataset(
-        tokenizer=tokenizer,
-        file_path=train_path,
-        block_size=256)
-
-    test_dataset = TextDataset(
-        tokenizer=tokenizer,
-        file_path=test_path,
-        block_size=256)
-
-    data_collator = DataCollatorForLanguageModeling(
-        tokenizer=tokenizer, mlm=False
-    )
-    return train_dataset, test_dataset, data_collator
-
+from util_funcs import get_length_param
 
 def chat_function(message, length_of_the_answer, who_is_next, creativity):  # model, tokenizer
 
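The helpers removed here were moved into util_funcs.py (shown below), and app.py now imports `get_length_param` back; it is the function that turns a message's token count into the length control symbol used in the prompts. A minimal sketch of the mapping, assuming the checkpoint's own tokenizer (the bucket boundaries are <=15, <=50 and <=256 tokens; which bucket a string hits depends on tokenization):

    from transformers import AutoTokenizer
    from util_funcs import get_length_param

    tokenizer = AutoTokenizer.from_pretrained("Kirili4ik/ruDialoGpt3-medium-finetuned-telegram-6ep")

    print(get_length_param("Привет!", tokenizer))                  # few tokens -> likely '1' (short)
    print(get_length_param("Привет, как дела? " * 12, tokenizer))  # more tokens -> likely '2' or '3'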
@@ -138,12 +27,6 @@ def chat_function(message, length_of_the_answer, who_is_next, creativity): # m
     history = gr.get_state() or []
     chat_history_ids = torch.zeros((1, 0), dtype=torch.int) if history == [] else torch.tensor(history[-1][2], dtype=torch.long)
 
-    ######### next_who = input("Who's phrase?\t") #input("H / G?") # Human or GPT
-
-    # In case Human
-    ##### if next_who == "H":
-
-    ######## input_user = input("===> Human: ")
     # encode the new user input, add parameters and return a tensor in Pytorch
     if len(input_user) != 0:
 
@@ -156,7 +39,6 @@ def chat_function(message, length_of_the_answer, who_is_next, creativity): # m
 
     if next_who == "G":
 
-        ######## next_len = input("Phrase len? 1/2/3/-\t") #input("Exp. len?(-/1/2/3): ")
         # encode the new user input, add parameters and return a tensor in Pytorch
         new_user_input_ids = tokenizer.encode(f"|1|{next_len}|", return_tensors="pt")
         # append the new user input tokens to the chat history
@@ -198,45 +80,45 @@ def chat_function(message, length_of_the_answer, who_is_next, creativity): # m
         html += f"<div class='resp_msg'>{resp_msg}</div>"
     html += "</div>"
     return html
-
+
+
+
 
 
 # Download checkpoint:
-checkpoint = "Kirili4ik/ruDialoGpt3-medium-finetuned-telegram"
+checkpoint = "Kirili4ik/ruDialoGpt3-medium-finetuned-telegram-6ep"
 tokenizer = AutoTokenizer.from_pretrained(checkpoint)
 model = AutoModelForCausalLM.from_pretrained(checkpoint)
 model = model.eval()
 
-
+# Gradio
 checkbox_group = gr.inputs.CheckboxGroup(['Kirill', 'Me'], default=['Kirill'], type="value", label=None)
-
-inputs = gr.inputs.Textbox(lines=1, label="???")
-outputs = gr.outputs.Textbox(label="Kirill (GPT-2):")
 title = "Chat with Kirill (in Russian)"
 description = "Тут можно поболтать со мной. Но вместо меня бот. Оставь message пустым, чтобы Кирилл продолжил говорить. Подробнее о технике по ссылке внизу."
 article = "<p style='text-align: center'><a href='https://github.com/Kirili4ik/ruDialoGpt3-finetune-colab'>Github with fine-tuning GPT-2 on your chat</a></p>"
 examples = [
-    ["Привет, как дела?", 'medium', 'Kirill', 0.6],
+    ["Привет, как дела?", 'medium', 'Kirill', 0.5],
     ["Сколько тебе лет?", 'medium', 'Kirill', 0.3],
 ]
 
-iface = gr.Interface(chat_function,
-                     [
-                         "text",
-                         gr.inputs.Radio(["short", "medium", "long"], default='medium'),
+iface = gr.Interface(chat_function,
+                     [
+                         "text",
+                         gr.inputs.Radio(["short", "medium", "long"], default='medium'),
                          gr.inputs.Radio(["Kirill", "Me"], default='Kirill'),
-                         gr.inputs.Slider(0, 1, default=0.6)
-                     ],
-                     "html",
+                         gr.inputs.Slider(0, 1, default=0.5)
+                     ],
+                     "html",
                      title=title, description=description, article=article, examples=examples,
                      css="""
                          .chatbox {display:flex;flex-direction:column}
                          .user_msg, .resp_msg {padding:4px;margin-bottom:4px;border-radius:4px;width:80%}
                          .user_msg {background-color:cornflowerblue;color:white;align-self:start}
                          .resp_msg {background-color:lightgray;align-self:self-end}
-                     """,
-                     allow_screenshot=True,
+                     """,
+                     allow_screenshot=True,
                      allow_flagging=False
                      )
 
-iface.launch()
+if __name__ == "__main__":
+    iface.launch()
 
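Switching to the 6-epoch checkpoint changes only the weights; the `|speaker|length|` prompting scheme used by `chat_function` stays the same ('0'/'1' for human/machine, '1'/'2'/'3' for the expected reply length). Below is a minimal standalone sketch of one generation step under that scheme; the sampling parameters are illustrative, not the app's exact values:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    checkpoint = "Kirili4ik/ruDialoGpt3-medium-finetuned-telegram-6ep"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForCausalLM.from_pretrained(checkpoint).eval()

    # human turn: speaker '0', medium expected length '2'
    user_ids = tokenizer.encode("|0|2|Привет, как дела?" + tokenizer.eos_token, return_tensors="pt")
    # machine turn prompt: speaker '1', ask for a medium-length reply
    bot_prefix = tokenizer.encode("|1|2|", return_tensors="pt")
    input_ids = torch.cat([user_ids, bot_prefix], dim=-1)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            max_length=512,
            do_sample=True,
            top_k=50,
            top_p=0.9,
            temperature=0.6,  # roughly what the 'creativity' slider controls
            pad_token_id=tokenizer.eos_token_id,
        )
    # decode only the newly generated reply, not the prompt
    print(tokenizer.decode(output_ids[0, input_ids.shape[-1]:], skip_special_tokens=True))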
util_funcs.py ADDED

@@ -0,0 +1,114 @@
+import re
+
+from transformers import TextDataset, DataCollatorForLanguageModeling
+
+
+def get_length_param(text: str, tokenizer) -> str:
+    """Maps text to 1 of 4 buckets based on its length after encoding.
+
+    Parameters
+    ----------
+    text: str
+        The text to be given 1 of 4 length parameters.
+
+    tokenizer: HuggingFace tokenizer
+        Tokenizer used to compute the length of the text after encoding.
+        For more info see https://huggingface.co/transformers/main_classes/tokenizer.html
+
+    Returns
+    -------
+    len_param: str
+        One of four buckets:
+        '1' for short, '2' for medium, '3' for long texts and '-' for all others.
+    """
+    tokens_count = len(tokenizer.encode(text))
+    if tokens_count <= 15:
+        len_param = '1'
+    elif tokens_count <= 50:
+        len_param = '2'
+    elif tokens_count <= 256:
+        len_param = '3'
+    else:
+        len_param = '-'
+    return len_param
+
+
+def get_user_param(text: dict, machine_name_in_chat: str) -> str:
+    """Maps a message to '1'/'0' depending on whether the machine or a person sent it.
+
+    Parameters
+    ----------
+    text: Dict[..., 'from', ...]
+        Dict containing the field 'from' with the name of the user who sent the message
+
+    machine_name_in_chat: str
+        Name of the machine in the chat - its messages are the ones to be predicted
+    """
+    if text['from'] == machine_name_in_chat:
+        return '1'  # machine
+    else:
+        return '0'  # human
+
+
+def build_text_file(data_json: dict, dest_path: str,
+                    tokenizer, machine_name_in_chat='Кирилл Гельван'):
+    """Creates a text file for training in the special format for ruDialoGPT-3.
+
+    Parameters
+    ----------
+    data_json: dict
+        Dict containing 'text' (message) and 'from' (user who sent the message)
+
+    dest_path: str
+        String containing the path to write the data to
+
+    tokenizer: HuggingFace tokenizer
+        Tokenizer used to compute the length of the text after encoding.
+        For more info see https://huggingface.co/transformers/main_classes/tokenizer.html
+    """
+    f = open(dest_path, 'w')
+    new_data = ''
+    for i in range(len(data_json) - 1):
+        message, next_message = data_json[i], data_json[i+1]
+        if message['text'] == '' or type(message['text']) != str:
+            continue
+        if next_message['text'] == '' or type(next_message['text']) != str:
+            continue
+
+        user = get_user_param(message, machine_name_in_chat=machine_name_in_chat)
+        length = get_length_param(data_json[i+1]['text'], tokenizer)
+        message_text = re.sub(r"\n", ". ", message['text'])
+        new_data += f"|{user}|{length}|{message_text}{tokenizer.eos_token}" + "\n"
+
+    f.write(new_data)
+
+
+def load_dataset(train_path, test_path, tokenizer):
+    """Creates train and test PyTorch datasets and collate_fn using HuggingFace.
+
+    Parameters
+    ----------
+    train_path: str
+        String containing the path to the train data
+
+    test_path: str
+        String containing the path to the test data
+
+    tokenizer: HuggingFace tokenizer
+        Tokenizer used to compute the length of the text after encoding.
+        For more info see https://huggingface.co/transformers/main_classes/tokenizer.html
+    """
+    train_dataset = TextDataset(
+        tokenizer=tokenizer,
+        file_path=train_path,
+        block_size=256)
+
+    test_dataset = TextDataset(
+        tokenizer=tokenizer,
+        file_path=test_path,
+        block_size=256)
+
+    data_collator = DataCollatorForLanguageModeling(
+        tokenizer=tokenizer, mlm=False
+    )
+    return train_dataset, test_dataset, data_collator
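Together these helpers form the data-preparation pipeline from the linked fine-tuning repo: `build_text_file` serializes consecutive chat messages into `|speaker|length|text<eos>` lines, and `load_dataset` wraps the resulting files for causal-LM fine-tuning with a HuggingFace `Trainer`. A rough usage sketch, assuming a Telegram-style JSON export with 'text' and 'from' fields (the file names and the 90/10 split are placeholders):

    import json
    from transformers import AutoTokenizer
    from util_funcs import build_text_file, load_dataset

    tokenizer = AutoTokenizer.from_pretrained("Kirili4ik/ruDialoGpt3-medium-finetuned-telegram")

    with open("chat_export.json") as f:        # placeholder path
        messages = json.load(f)["messages"]    # list of dicts with 'text' and 'from'

    split = int(len(messages) * 0.9)
    build_text_file(messages[:split], "train.txt", tokenizer)
    build_text_file(messages[split:], "test.txt", tokenizer)

    # TextDataset blocks of 256 tokens plus a collator for causal-LM (mlm=False) training
    train_dataset, test_dataset, data_collator = load_dataset("train.txt", "test.txt", tokenizer)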