AlexanderK committed
Commit 7c04653
1 Parent(s): 983571f

new huggingface space

main_app.py ADDED
@@ -0,0 +1,50 @@
+ import gradio as gr
+ from transformers import (
+     AutoModelForSeq2SeqLM,
+     AutoTokenizer,
+     AutoConfig,
+     pipeline,
+ )
+ import torch
+ from translator import translate_text  # import the translator function
+
+
+ model_name = "sagard21/python-code-explainer"
+ tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True)
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+ config = AutoConfig.from_pretrained(model_name)
+
+ if torch.cuda.is_available():
+     model = model.to('cuda')  # run the model on the GPU when one is available
+ model.eval()
+
+ pipe = pipeline("summarization", model=model, config=config, tokenizer=tokenizer)  # pass the loaded model so the GPU placement above is actually used
+
+
+ def generate_text(text_prompt):
+     response = pipe(text_prompt)
+     english_explanation = response[0]['summary_text']
+     russian_explanation = translate_text(english_explanation)  # translate the explanation from English into Russian
+     return english_explanation, russian_explanation
+
+
+ textbox1 = gr.Textbox(value="""
+ class Solution(object):
+     def isValid(self, s):
+         stack = []
+         mapping = {")": "(", "}": "{", "]": "["}
+         for char in s:
+             if char in mapping:
+                 top_element = stack.pop() if stack else '#'
+                 if mapping[char] != top_element:
+                     return False
+             else:
+                 stack.append(char)
+         return not stack""")
+ textbox2 = gr.Textbox()
+ textbox3 = gr.Textbox()
+
+ if __name__ == "__main__":
+     with gr.Blocks() as demo:
+         gr.Interface(fn=generate_text, inputs=textbox1, outputs=[textbox2, textbox3])
+     demo.launch()  # launch the Gradio interface
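
The app chains the two models: the summarization pipeline produces an English explanation of the submitted code, and `translate_text` renders it in Russian. A minimal sketch of exercising that flow without the Gradio UI, assuming `main_app.py` and `translator.py` are importable from the working directory (illustrative only):

```python
# Headless smoke test of the two-stage flow (illustrative; assumes main_app.py
# and translator.py sit in the current directory, as laid out in this repo).
# Importing main_app loads both models but does not launch Gradio, because
# demo.launch() is guarded by `if __name__ == "__main__"`.
from main_app import generate_text

snippet = "def add(a, b):\n    return a + b"
english, russian = generate_text(snippet)
print(english)  # explanation from sagard21/python-code-explainer
print(russian)  # translation from facebook/wmt19-en-ru
```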
python-code-explainer/README.md ADDED
@@ -0,0 +1,77 @@
+ ---
+ tags:
+ - autotrain
+ - summarization
+ language:
+ - en
+ widget:
+ - text: >
+     def preprocess(text: str) -> str:
+         text = str(text)
+         text = text.replace('\\n', ' ')
+         tokenized_text = text.split(' ')
+         preprocessed_text = " ".join([token for token in tokenized_text if token])
+
+         return preprocessed_text
+ datasets:
+ - sagard21/autotrain-data-code-explainer
+ co2_eq_emissions:
+   emissions: 5.393079045128973
+ license: mit
+ pipeline_tag: summarization
+ ---
+
+ # Model Trained Using AutoTrain
+
+ - Problem type: Summarization
+ - Model ID: 2745581349
+ - CO2 Emissions (in grams): 5.3931
+
+ # Model Description
+
+ This model is an attempt to simplify code understanding by generating a line-by-line explanation of source code. It was fine-tuned from the Salesforce/codet5-large model and is currently trained on a small subset of Python snippets.
+
+ # Model Usage
+
+ ```py
+ from transformers import (
+     AutoModelForSeq2SeqLM,
+     AutoTokenizer,
+     AutoConfig,
+     pipeline,
+ )
+
+ model_name = "sagard21/python-code-explainer"
+
+ tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True)
+
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+ config = AutoConfig.from_pretrained(model_name)
+
+ model.eval()
+
+ pipe = pipeline("summarization", model=model_name, config=config, tokenizer=tokenizer)
+
+ raw_code = """
+ def preprocess(text: str) -> str:
+     text = str(text)
+     text = text.replace("\n", " ")
+     tokenized_text = text.split(" ")
+     preprocessed_text = " ".join([token for token in tokenized_text if token])
+
+     return preprocessed_text
+ """
+
+ print(pipe(raw_code)[0]["summary_text"])
+
+ ```
+
+ ## Validation Metrics
+
+ - Loss: 2.156
+ - Rouge1: 29.375
+ - Rouge2: 18.128
+ - RougeL: 25.445
+ - RougeLsum: 28.084
+ - Gen Len: 19.000
python-code-explainer/config.json ADDED
@@ -0,0 +1,61 @@
+ {
+   "_name_or_path": "AutoTrain",
+   "architectures": [
+     "T5ForConditionalGeneration"
+   ],
+   "bos_token_id": 1,
+   "d_ff": 4096,
+   "d_kv": 64,
+   "d_model": 1024,
+   "decoder_start_token_id": 0,
+   "dense_act_fn": "relu",
+   "dropout_rate": 0.1,
+   "eos_token_id": 2,
+   "feed_forward_proj": "relu",
+   "initializer_factor": 1.0,
+   "is_encoder_decoder": true,
+   "is_gated_act": false,
+   "layer_norm_epsilon": 1e-06,
+   "model_type": "t5",
+   "n_positions": 512,
+   "num_decoder_layers": 24,
+   "num_heads": 16,
+   "num_layers": 24,
+   "output_past": true,
+   "pad_token_id": 0,
+   "relative_attention_max_distance": 128,
+   "relative_attention_num_buckets": 32,
+   "task_specific_params": {
+     "summarization": {
+       "early_stopping": true,
+       "length_penalty": 2.0,
+       "max_length": 200,
+       "min_length": 30,
+       "no_repeat_ngram_size": 3,
+       "num_beams": 4,
+       "prefix": "summarize: "
+     },
+     "translation_en_to_de": {
+       "early_stopping": true,
+       "max_length": 300,
+       "num_beams": 4,
+       "prefix": "translate English to German: "
+     },
+     "translation_en_to_fr": {
+       "early_stopping": true,
+       "max_length": 300,
+       "num_beams": 4,
+       "prefix": "translate English to French: "
+     },
+     "translation_en_to_ro": {
+       "early_stopping": true,
+       "max_length": 300,
+       "num_beams": 4,
+       "prefix": "translate English to Romanian: "
+     }
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.25.1",
+   "use_cache": true,
+   "vocab_size": 32100
+ }
python-code-explainer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
python-code-explainer/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8b8a32ab413be2b42ac6a21ac09453c1193a621a6b9a270d6be67f16e58ec00c
+ size 2950619850
python-code-explainer/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e8eaf24cf06354f8adf76f137f0319cfb618cfd2abb19496c94fd3d3a9e08404
+ size 2950733825
python-code-explainer/special_tokens_map.json ADDED
@@ -0,0 +1,753 @@
+ {
+   "additional_special_tokens": [
+     {
+       "content": "<extra_id_99>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_98>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_97>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_96>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_95>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_94>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_93>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_92>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_91>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_90>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_89>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_88>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_87>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_86>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_85>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_84>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_83>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_82>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_81>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_80>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_79>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_78>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_77>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_76>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_75>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_74>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_73>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_72>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_71>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_70>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_69>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_68>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_67>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_66>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_65>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_64>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_63>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_62>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_61>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_60>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_59>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_58>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_57>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_56>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_55>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_54>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_53>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_52>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_51>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_50>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_49>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_48>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_47>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_46>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_45>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_44>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_43>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_42>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_41>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_40>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_39>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_38>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_37>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_36>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_35>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_34>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_33>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_32>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_31>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_30>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_29>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_28>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_27>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_26>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_25>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_24>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_23>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_22>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_21>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_20>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_19>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_18>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_17>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_16>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_15>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_14>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_13>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_12>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_11>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_10>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_9>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_8>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_7>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_6>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_5>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_4>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_3>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_2>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_1>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<extra_id_0>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     }
+   ],
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
python-code-explainer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
python-code-explainer/tokenizer_config.json ADDED
@@ -0,0 +1,65 @@
+ {
+   "add_prefix_space": false,
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "__type": "AddedToken",
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "errors": "replace",
+   "mask_token": {
+     "__type": "AddedToken",
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "model_max_length": 512,
+   "name_or_path": "AutoTrain",
+   "pad_token": {
+     "__type": "AddedToken",
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "__type": "AddedToken",
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "special_tokens_map_file": "/export/home/cache/model/5941df5e4315c5ab63b7b2ac791fb0bf0f209744a055c06b43b5274849137cdd.b9905d0575bde443a20834122b6e2d48e853b2e36444ce98ddeb43c38097eb3f",
+   "tokenizer_class": "RobertaTokenizer",
+   "trim_offsets": true,
+   "unk_token": {
+     "__type": "AddedToken",
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
python-code-explainer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ torch
+ transformers
+ gradio
+ sacremoses
+ flake8
translator.py ADDED
@@ -0,0 +1,17 @@
+ from transformers import FSMTForConditionalGeneration, FSMTTokenizer
+
+ MODEL_NAME = "facebook/wmt19-en-ru"
+
+ # initialize the tokenizer
+ tokenizer = FSMTTokenizer.from_pretrained(MODEL_NAME)
+
+ # initialize the conditional generation (translation) model
+ model = FSMTForConditionalGeneration.from_pretrained(MODEL_NAME)
+
+
+ # define a function that translates text from English to Russian
+ def translate_text(input_text):
+     input_ids = tokenizer.encode(input_text, return_tensors="pt")
+     outputs = model.generate(input_ids)
+     decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     return decoded
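
A quick usage sketch for the helper above (illustrative; assumes `translator.py` is importable), mirroring how `main_app.py` calls it:

```python
# Translate a single English sentence to Russian using the module-level
# facebook/wmt19-en-ru model loaded in translator.py.
from translator import translate_text

print(translate_text("This method checks whether the brackets in a string are balanced."))
```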
wmt19-en-ru/README.md ADDED
@@ -0,0 +1,109 @@
+ ---
+ language:
+ - en
+ - ru
+ tags:
+ - translation
+ - wmt19
+ - facebook
+ license: apache-2.0
+ datasets:
+ - wmt19
+ metrics:
+ - bleu
+ thumbnail: https://huggingface.co/front/thumbnails/facebook.png
+ ---
+
+ # FSMT
+
+ ## Model description
+
+ This is a ported version of the [fairseq wmt19 transformer](https://github.com/pytorch/fairseq/blob/master/examples/wmt19/README.md) for en-ru.
+
+ For more details, please see [Facebook FAIR's WMT19 News Translation Task Submission](https://arxiv.org/abs/1907.06616).
+
+ The abbreviation FSMT stands for FairSeqMachineTranslation.
+
+ All four models are available:
+
+ * [wmt19-en-ru](https://huggingface.co/facebook/wmt19-en-ru)
+ * [wmt19-ru-en](https://huggingface.co/facebook/wmt19-ru-en)
+ * [wmt19-en-de](https://huggingface.co/facebook/wmt19-en-de)
+ * [wmt19-de-en](https://huggingface.co/facebook/wmt19-de-en)
+
+ ## Intended uses & limitations
+
+ #### How to use
+
+ ```python
+ from transformers import FSMTForConditionalGeneration, FSMTTokenizer
+ mname = "facebook/wmt19-en-ru"
+ tokenizer = FSMTTokenizer.from_pretrained(mname)
+ model = FSMTForConditionalGeneration.from_pretrained(mname)
+
+ input = "Machine learning is great, isn't it?"
+ input_ids = tokenizer.encode(input, return_tensors="pt")
+ outputs = model.generate(input_ids)
+ decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
+ print(decoded)  # Машинное обучение - это здорово, не так ли?
+
+ ```
+
+ #### Limitations and bias
+
+ - The original model (and this ported version) does not seem to handle inputs with repeated sub-phrases well; [content gets truncated](https://discuss.huggingface.co/t/issues-with-translating-inputs-containing-repeated-phrases/981)
+
+ ## Training data
+
+ Pretrained weights were left identical to the original model released by fairseq. For more details, please see the [paper](https://arxiv.org/abs/1907.06616).
+
+ ## Eval results
+
+ pair   | fairseq | transformers
+ -------|---------|----------
+ en-ru  | [36.4](http://matrix.statmt.org/matrix/output/1914?run_id=6724) | 33.47
+
+ The score is slightly below the score reported by `fairseq`, since `transformers` currently doesn't support:
+ - model ensembles, therefore only the best performing checkpoint was ported (`model4.pt`)
+ - re-ranking
+
+ The score was calculated using this code:
+
+ ```bash
+ git clone https://github.com/huggingface/transformers
+ cd transformers
+ export PAIR=en-ru
+ export DATA_DIR=data/$PAIR
+ export SAVE_DIR=data/$PAIR
+ export BS=8
+ export NUM_BEAMS=15
+ mkdir -p $DATA_DIR
+ sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
+ sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
+ echo $PAIR
+ PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
+ ```
+ Note: fairseq reports using a beam of 50, so you should get a slightly higher score if you re-run with `--num_beams 50`.
+
+ ## Data Sources
+
+ - [training, etc.](http://www.statmt.org/wmt19/)
+ - [test set](http://matrix.statmt.org/test_sets/newstest2019.tgz?1556572561)
+
+
+ ### BibTeX entry and citation info
+
+ ```bibtex
+ @inproceedings{...,
+   year={2020},
+   title={Facebook FAIR's WMT19 News Translation Task Submission},
+   author={Ng, Nathan and Yee, Kyra and Baevski, Alexei and Ott, Myle and Auli, Michael and Edunov, Sergey},
+   booktitle={Proc. of WMT},
+ }
+ ```
+
+
+ ## TODO
+
+ - port model ensemble (fairseq uses 4 model checkpoints)
+
wmt19-en-ru/config.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "architectures": [
+     "FSMTForConditionalGeneration"
+   ],
+   "model_type": "fsmt",
+   "activation_dropout": 0.0,
+   "activation_function": "relu",
+   "attention_dropout": 0.1,
+   "d_model": 1024,
+   "dropout": 0.2,
+   "init_std": 0.02,
+   "max_position_embeddings": 1024,
+   "num_hidden_layers": 6,
+   "src_vocab_size": 31640,
+   "tgt_vocab_size": 31232,
+   "langs": [
+     "en",
+     "ru"
+   ],
+   "encoder_attention_heads": 16,
+   "encoder_ffn_dim": 8192,
+   "encoder_layerdrop": 0,
+   "encoder_layers": 6,
+   "decoder_attention_heads": 16,
+   "decoder_ffn_dim": 4096,
+   "decoder_layerdrop": 0,
+   "decoder_layers": 6,
+   "bos_token_id": 0,
+   "pad_token_id": 1,
+   "eos_token_id": 2,
+   "is_encoder_decoder": true,
+   "scale_embedding": true,
+   "tie_word_embeddings": false,
+   "num_beams": 5,
+   "early_stopping": false,
+   "length_penalty": 1.15
+ }
wmt19-en-ru/generation_config.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 0,
+   "decoder_start_token_id": 2,
+   "eos_token_id": 2,
+   "forced_eos_token_id": 2,
+   "length_penalty": 1.15,
+   "max_length": 200,
+   "num_beams": 5,
+   "pad_token_id": 1,
+   "transformers_version": "4.27.0.dev0"
+ }
wmt19-en-ru/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
wmt19-en-ru/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a0600405208c25794cb81244e3c2ded3d288981d7181b3616834f3e05713f940
+ size 1164465181
wmt19-en-ru/tokenizer_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "langs": [
+     "en",
+     "ru"
+   ],
+   "model_max_length": 1024
+ }
wmt19-en-ru/vocab-src.json ADDED
The diff for this file is too large to render. See raw diff
 
wmt19-en-ru/vocab-tgt.json ADDED
The diff for this file is too large to render. See raw diff