Spaces:

rusteam
/

Tglang_programming_langugage_detection

Runtime error

App Files Files Community

rusteam committed on Nov 21, 2023

Commit

88fb14c

•

1 Parent(s): 360fa3b

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

README.md +78 -0
app.py +50 -21

README.md CHANGED Viewed

@@ -4,3 +4,81 @@ app_file: app.py
 sdk: gradio
 sdk_version: 4.5.0
 ---

 sdk: gradio
 sdk_version: 4.5.0
 ---
+# Tglang - identify the programming language of a code snippet
+[github repo](https://github.com/Rusteam/tglang)
+This is a solution for [Telegram hackathon](https://contest.com/docs/ML-Competition-2023-r2).
+The list of supported languages:
+```markdown
+  TGLANG_LANGUAGE_C
+  TGLANG_LANGUAGE_CPLUSPLUS
+  TGLANG_LANGUAGE_CSHARP
+  TGLANG_LANGUAGE_CSS
+  TGLANG_LANGUAGE_DART
+  TGLANG_LANGUAGE_DOCKER
+  TGLANG_LANGUAGE_FUNC
+  TGLANG_LANGUAGE_GO
+  TGLANG_LANGUAGE_HTML
+  TGLANG_LANGUAGE_JAVA
+  TGLANG_LANGUAGE_JAVASCRIPT
+  TGLANG_LANGUAGE_JSON
+  TGLANG_LANGUAGE_KOTLIN
+  TGLANG_LANGUAGE_LUA
+  TGLANG_LANGUAGE_NGINX
+  TGLANG_LANGUAGE_OBJECTIVE_C
+  TGLANG_LANGUAGE_PHP
+  TGLANG_LANGUAGE_POWERSHELL
+  TGLANG_LANGUAGE_PYTHON
+  TGLANG_LANGUAGE_RUBY
+  TGLANG_LANGUAGE_RUST
+  TGLANG_LANGUAGE_SHELL
+  TGLANG_LANGUAGE_SOLIDITY
+  TGLANG_LANGUAGE_SQL
+  TGLANG_LANGUAGE_SWIFT
+  TGLANG_LANGUAGE_TL
+  TGLANG_LANGUAGE_TYPESCRIPT
+  TGLANG_LANGUAGE_XML
+```
+Other programming languages and non-code text are identified
+as `TGLANG_LANGUAGE_OTHER` (index 0).
+## Model development
+### Data
+- Training data consisted of 3.7k+ files with 220k+ lines of code.
+It was made up of files from the [Stack dataset](https://huggingface.co/datasets/bigcode/the-stack/viewer/default/train)
+and files manually collected from GitHub.
+- The test set was manually labelled from [Telegram r1 files](https://data-static.usercontent.dev/ml2023-r1-dataset.tar.gz).
+It consisted of 493 files and 7404 lines of code. Not all classes are present in the test set.
+- Train files were split into shorter sequences of lines to
+match the test files' length.
+- OTHER files from the Telegram dataset were added to the train set
+to make up 20% of the data and to the test set to make up 50% of the data.
+### Model
+1. Tokenizer - a simple text tokenizer is used to extract
+keywords and special characters from the code. Numbers,
+comments and docstrings are removed.
+2. Text embedding - a TfIdf vectorizer is used to extract
+features from the train set. TfIdf params are:
+```python
+    max_features=1000,
+    binary=True,
+    ngram_range=(1,1),
+    tokenizer=tokenize_text,
+    lowercase=False,
+```
+3. Classifier - a simple multinomial naive Bayes classifier is trained on
+the vectorizer output.
+### Results
+- Accuracy on the test set: 0.82
+- Accuracy on the validation set: 0.83

app.py CHANGED Viewed

@@ -9,19 +9,39 @@ tokenizer = None
 lang_enum = None
 TITLE = "Tglang: Programming Language Detection"
-DESCRIPTION = ("<h5 style=\"text-align:center\">"
-               "Enter a code snippet and the model will predict the programming language it is written in.\n\n"
-               "Alternatively, it's possible to select one example from the dropdown menu to see how the model works.<h5>")
-FOOTER = ("This is a solution for the "
-            "[Telegram ML competition 2023, Round 2](https://contest.com/docs/ML-Competition-2023-r2).\n\n"
-          "For more details, read [this article]() or check out [this repo]()")
 EXAMPLES = [
     ["def foo():\n    print('Hello, world!')", "TGLANG_LANGUAGE_PYTHON"],
-    ["int main() {\n    printf(\"Hello, world!\");\n    return 0;\n}", "TGLANG_LANGUAGE_C"],
-    ["function foo() {\n    console.log('Hello, world!');\n}", "TGLANG_LANGUAGE_JAVASCRIPT"],
-    ["public class HelloWorld {\n    public static void main(String[] args) {\n        System.out.println(\"Hello, world!\");\n    }\n}", "TGLANG_LANGUAGE_JAVA"],
-    ["#include <iostream>\n\nint main() {\n    std::cout << \"Hello, world!\" << std::endl;\n}", "TGLANG_LANGUAGE_CPP"],
-    ["using System;\n\npublic class Program\n{\n    public static void Main()\n    {\n        Console.WriteLine(\"Hello, world!\");\n    }\n}", "TGLANG_LANGUAGE_CSHARP"],
 ]
@@ -29,7 +49,13 @@ def init_model():
     global model, tokenizer, lang_enum
     tokenizer = pyonmttok.Tokenizer("conservative")
     model = torch.jit.load(Path(__file__).with_name("tglang.pt"))
-    lang_enum = Path(__file__).with_name("langs_enum_r2.txt").read_text().strip().split("\n")
     lang_enum = [l.strip() for l in lang_enum if bool(l)]
@@ -42,15 +68,18 @@ def predict(text):
 def create_demo():
     init_model()
-    demo = gr.Interface(fn=predict,
-                        inputs=gr.Textbox(label="Code snippet", placeholder="Enter code here..."),
-                        outputs=gr.Textbox(label="Model prediction"),
-                        title=TITLE,
-                        description=DESCRIPTION,
-                        examples=EXAMPLES,
-                    theme=gr.themes.Monochrome(),
-                        article=FOOTER,
-                        )
     return demo

 lang_enum = None
 TITLE = "Tglang: Programming Language Detection"
+DESCRIPTION = (
+    '<h5 style="text-align:center">'
+    "Enter a code snippet and the model will predict the programming language it is written in.\n\n"
+    "Alternatively, it's possible to select one example from the dropdown menu to see how the model works.<h5>"
+)
+FOOTER = (
+    "This is a solution for the "
+    "[Telegram ML competition 2023, Round 2](https://contest.com/docs/ML-Competition-2023-r2).\n\n"
+    "For more details, read [this article]()"
+    "or check out [this repo](https://github.com/Rusteam/tglang)"
+)
 EXAMPLES = [
     ["def foo():\n    print('Hello, world!')", "TGLANG_LANGUAGE_PYTHON"],
+    [
+        'int main() {\n    printf("Hello, world!");\n    return 0;\n}',
+        "TGLANG_LANGUAGE_C",
+    ],
+    [
+        "function foo() {\n    console.log('Hello, world!');\n}",
+        "TGLANG_LANGUAGE_JAVASCRIPT",
+    ],
+    [
+        'public class HelloWorld {\n    public static void main(String[] args) {\n        System.out.println("Hello, world!");\n    }\n}',
+        "TGLANG_LANGUAGE_JAVA",
+    ],
+    [
+        '#include <iostream>\n\nint main() {\n    std::cout << "Hello, world!" << std::endl;\n}',
+        "TGLANG_LANGUAGE_CPP",
+    ],
+    [
+        'using System;\n\npublic class Program\n{\n    public static void Main()\n    {\n        Console.WriteLine("Hello, world!");\n    }\n}',
+        "TGLANG_LANGUAGE_CSHARP",
+    ],
 ]
     global model, tokenizer, lang_enum
     tokenizer = pyonmttok.Tokenizer("conservative")
     model = torch.jit.load(Path(__file__).with_name("tglang.pt"))
+    lang_enum = (
+        Path(__file__)
+        .with_name("langs_enum_r2.txt")
+        .read_text()
+        .strip()
+        .split("\n")
+    )
     lang_enum = [l.strip() for l in lang_enum if bool(l)]
 def create_demo():
     init_model()
+    demo = gr.Interface(
+        fn=predict,
+        inputs=gr.Textbox(
+            label="Code snippet", placeholder="Enter code here..."
+        ),
+        outputs=gr.Textbox(label="Model prediction"),
+        title=TITLE,
+        description=DESCRIPTION,
+        examples=EXAMPLES,
+        theme=gr.themes.Monochrome(),
+        article=FOOTER,
+    )
     return demo