rusteam committed on
Commit
88fb14c
1 Parent(s): 360fa3b

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. README.md +78 -0
  2. app.py +50 -21
README.md CHANGED
@@ -4,3 +4,81 @@ app_file: app.py
4
  sdk: gradio
5
  sdk_version: 4.5.0
6
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  sdk: gradio
5
  sdk_version: 4.5.0
6
  ---
7
+ # Tglang - identify the programming language of a code snippet
8
+
9
+ [github repo](https://github.com/Rusteam/tglang)
10
+
11
+ This is a solution for the [Telegram hackathon](https://contest.com/docs/ML-Competition-2023-r2).
12
+
13
+ The list of supported languages:
14
+ ```markdown
15
+ TGLANG_LANGUAGE_C
16
+ TGLANG_LANGUAGE_CPLUSPLUS
17
+ TGLANG_LANGUAGE_CSHARP
18
+ TGLANG_LANGUAGE_CSS
19
+ TGLANG_LANGUAGE_DART
20
+ TGLANG_LANGUAGE_DOCKER
21
+ TGLANG_LANGUAGE_FUNC
22
+ TGLANG_LANGUAGE_GO
23
+ TGLANG_LANGUAGE_HTML
24
+ TGLANG_LANGUAGE_JAVA
25
+ TGLANG_LANGUAGE_JAVASCRIPT
26
+ TGLANG_LANGUAGE_JSON
27
+ TGLANG_LANGUAGE_KOTLIN
28
+ TGLANG_LANGUAGE_LUA
29
+ TGLANG_LANGUAGE_NGINX
30
+ TGLANG_LANGUAGE_OBJECTIVE_C
31
+ TGLANG_LANGUAGE_PHP
32
+ TGLANG_LANGUAGE_POWERSHELL
33
+ TGLANG_LANGUAGE_PYTHON
34
+ TGLANG_LANGUAGE_RUBY
35
+ TGLANG_LANGUAGE_RUST
36
+ TGLANG_LANGUAGE_SHELL
37
+ TGLANG_LANGUAGE_SOLIDITY
38
+ TGLANG_LANGUAGE_SQL
39
+ TGLANG_LANGUAGE_SWIFT
40
+ TGLANG_LANGUAGE_TL
41
+ TGLANG_LANGUAGE_TYPESCRIPT
42
+ TGLANG_LANGUAGE_XML
43
+ ```
44
+
45
+ Other programming languages and non-code text are identified
46
+ as `TGLANG_LANGUAGE_OTHER` (index 0).
47
+
48
+ ## Model development
49
+
50
+ ### Data
51
+
52
+ - Training data consisted of 3.7k+ files with 220k+ lines of code.
53
+ It consisted of files from the [Stack dataset](https://huggingface.co/datasets/bigcode/the-stack/viewer/default/train)
54
+ and files manually collected from GitHub.
55
+ - The test set was manually labelled from [Telegram r1 files](https://data-static.usercontent.dev/ml2023-r1-dataset.tar.gz).
56
+ It consisted of 493 files and 7404 lines of code. Not all classes are present in the test set.
57
+ - Train files were split into shorter sequences of lines to
58
+ match the test files' length.
59
+ - OTHER files from the Telegram files were added to the train set
60
+ to make up 20% of the data and to the test set to make up 50% of the data.
61
+
62
+ ### Model
63
+
64
+
65
+ 1. Tokenizer - a simple text tokenizer is used to extract
66
+ keywords and special characters from the code. Numbers,
67
+ comments and docstrings are removed.
68
+ 2. Text embedding - a TfIdf vectorizer is used to extract
69
+ features from the train set. TfIdf params are:
70
+ ```python
71
+ max_features=1000,
72
+ binary=True,
73
+ ngram_range=(1,1),
74
+ tokenizer=tokenize_text,
75
+ lowercase=False,
76
+ ```
77
+ 3. Classifier - a simple multinomial naive Bayes model is trained on
78
+ the vectorizer output.
79
+
80
+ ### Results
81
+
82
+ - Accuracy on the test set: 0.82
83
+ - Accuracy on the validation set: 0.83
84
+
app.py CHANGED
@@ -9,19 +9,39 @@ tokenizer = None
9
  lang_enum = None
10
 
11
  TITLE = "Tglang: Programming Language Detection"
12
- DESCRIPTION = ("<h5 style=\"text-align:center\">"
13
- "Enter a code snippet and the model will predict the programming language it is written in.\n\n"
14
- "Alternatively, it's possible to select one example from the dropdown menu to see how the model works.<h5>")
15
- FOOTER = ("This is a solution for the "
16
- "[Telegram ML competition 2023, Round 2](https://contest.com/docs/ML-Competition-2023-r2).\n\n"
17
- "For more details, read [this article]() or check out [this repo]()")
 
 
 
 
 
18
  EXAMPLES = [
19
  ["def foo():\n print('Hello, world!')", "TGLANG_LANGUAGE_PYTHON"],
20
- ["int main() {\n printf(\"Hello, world!\");\n return 0;\n}", "TGLANG_LANGUAGE_C"],
21
- ["function foo() {\n console.log('Hello, world!');\n}", "TGLANG_LANGUAGE_JAVASCRIPT"],
22
- ["public class HelloWorld {\n public static void main(String[] args) {\n System.out.println(\"Hello, world!\");\n }\n}", "TGLANG_LANGUAGE_JAVA"],
23
- ["#include <iostream>\n\nint main() {\n std::cout << \"Hello, world!\" << std::endl;\n}", "TGLANG_LANGUAGE_CPP"],
24
- ["using System;\n\npublic class Program\n{\n public static void Main()\n {\n Console.WriteLine(\"Hello, world!\");\n }\n}", "TGLANG_LANGUAGE_CSHARP"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  ]
26
 
27
 
@@ -29,7 +49,13 @@ def init_model():
29
  global model, tokenizer, lang_enum
30
  tokenizer = pyonmttok.Tokenizer("conservative")
31
  model = torch.jit.load(Path(__file__).with_name("tglang.pt"))
32
- lang_enum = Path(__file__).with_name("langs_enum_r2.txt").read_text().strip().split("\n")
 
 
 
 
 
 
33
  lang_enum = [l.strip() for l in lang_enum if bool(l)]
34
 
35
 
@@ -42,15 +68,18 @@ def predict(text):
42
 
43
  def create_demo():
44
  init_model()
45
- demo = gr.Interface(fn=predict,
46
- inputs=gr.Textbox(label="Code snippet", placeholder="Enter code here..."),
47
- outputs=gr.Textbox(label="Model prediction"),
48
- title=TITLE,
49
- description=DESCRIPTION,
50
- examples=EXAMPLES,
51
- theme=gr.themes.Monochrome(),
52
- article=FOOTER,
53
- )
 
 
 
54
  return demo
55
 
56
 
 
9
  lang_enum = None
10
 
11
  TITLE = "Tglang: Programming Language Detection"
12
+ DESCRIPTION = (
13
+ '<h5 style="text-align:center">'
14
+ "Enter a code snippet and the model will predict the programming language it is written in.\n\n"
15
+ "Alternatively, it's possible to select one example from the dropdown menu to see how the model works.<h5>"
16
+ )
17
+ FOOTER = (
18
+ "This is a solution for the "
19
+ "[Telegram ML competition 2023, Round 2](https://contest.com/docs/ML-Competition-2023-r2).\n\n"
20
+ "For more details, read [this article]()"
21
+ "or check out [this repo](https://github.com/Rusteam/tglang)"
22
+ )
23
  EXAMPLES = [
24
  ["def foo():\n print('Hello, world!')", "TGLANG_LANGUAGE_PYTHON"],
25
+ [
26
+ 'int main() {\n printf("Hello, world!");\n return 0;\n}',
27
+ "TGLANG_LANGUAGE_C",
28
+ ],
29
+ [
30
+ "function foo() {\n console.log('Hello, world!');\n}",
31
+ "TGLANG_LANGUAGE_JAVASCRIPT",
32
+ ],
33
+ [
34
+ 'public class HelloWorld {\n public static void main(String[] args) {\n System.out.println("Hello, world!");\n }\n}',
35
+ "TGLANG_LANGUAGE_JAVA",
36
+ ],
37
+ [
38
+ '#include <iostream>\n\nint main() {\n std::cout << "Hello, world!" << std::endl;\n}',
39
+ "TGLANG_LANGUAGE_CPP",
40
+ ],
41
+ [
42
+ 'using System;\n\npublic class Program\n{\n public static void Main()\n {\n Console.WriteLine("Hello, world!");\n }\n}',
43
+ "TGLANG_LANGUAGE_CSHARP",
44
+ ],
45
  ]
46
 
47
 
 
49
  global model, tokenizer, lang_enum
50
  tokenizer = pyonmttok.Tokenizer("conservative")
51
  model = torch.jit.load(Path(__file__).with_name("tglang.pt"))
52
+ lang_enum = (
53
+ Path(__file__)
54
+ .with_name("langs_enum_r2.txt")
55
+ .read_text()
56
+ .strip()
57
+ .split("\n")
58
+ )
59
  lang_enum = [l.strip() for l in lang_enum if bool(l)]
60
 
61
 
 
68
 
69
  def create_demo():
70
  init_model()
71
+ demo = gr.Interface(
72
+ fn=predict,
73
+ inputs=gr.Textbox(
74
+ label="Code snippet", placeholder="Enter code here..."
75
+ ),
76
+ outputs=gr.Textbox(label="Model prediction"),
77
+ title=TITLE,
78
+ description=DESCRIPTION,
79
+ examples=EXAMPLES,
80
+ theme=gr.themes.Monochrome(),
81
+ article=FOOTER,
82
+ )
83
  return demo
84
 
85