Spaces:
Runtime error
Runtime error
Upload folder using huggingface_hub
Browse files
README.md
CHANGED
@@ -4,3 +4,81 @@ app_file: app.py
|
|
4 |
sdk: gradio
|
5 |
sdk_version: 4.5.0
|
6 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
sdk: gradio
|
5 |
sdk_version: 4.5.0
|
6 |
---
|
7 |
+
# Tglang - identify a programming language of a code snippet
|
8 |
+
|
9 |
+
[github repo](https://github.com/Rusteam/tglang)
|
10 |
+
|
11 |
+
This is a solution for the [Telegram hackathon](https://contest.com/docs/ML-Competition-2023-r2).
|
12 |
+
|
13 |
+
The list of supported languages:
|
14 |
+
```markdown
|
15 |
+
TGLANG_LANGUAGE_C
|
16 |
+
TGLANG_LANGUAGE_CPLUSPLUS
|
17 |
+
TGLANG_LANGUAGE_CSHARP
|
18 |
+
TGLANG_LANGUAGE_CSS
|
19 |
+
TGLANG_LANGUAGE_DART
|
20 |
+
TGLANG_LANGUAGE_DOCKER
|
21 |
+
TGLANG_LANGUAGE_FUNC
|
22 |
+
TGLANG_LANGUAGE_GO
|
23 |
+
TGLANG_LANGUAGE_HTML
|
24 |
+
TGLANG_LANGUAGE_JAVA
|
25 |
+
TGLANG_LANGUAGE_JAVASCRIPT
|
26 |
+
TGLANG_LANGUAGE_JSON
|
27 |
+
TGLANG_LANGUAGE_KOTLIN
|
28 |
+
TGLANG_LANGUAGE_LUA
|
29 |
+
TGLANG_LANGUAGE_NGINX
|
30 |
+
TGLANG_LANGUAGE_OBJECTIVE_C
|
31 |
+
TGLANG_LANGUAGE_PHP
|
32 |
+
TGLANG_LANGUAGE_POWERSHELL
|
33 |
+
TGLANG_LANGUAGE_PYTHON
|
34 |
+
TGLANG_LANGUAGE_RUBY
|
35 |
+
TGLANG_LANGUAGE_RUST
|
36 |
+
TGLANG_LANGUAGE_SHELL
|
37 |
+
TGLANG_LANGUAGE_SOLIDITY
|
38 |
+
TGLANG_LANGUAGE_SQL
|
39 |
+
TGLANG_LANGUAGE_SWIFT
|
40 |
+
TGLANG_LANGUAGE_TL
|
41 |
+
TGLANG_LANGUAGE_TYPESCRIPT
|
42 |
+
TGLANG_LANGUAGE_XML
|
43 |
+
```
|
44 |
+
|
45 |
+
Other programming languages and non-code text are identified
|
46 |
+
as `TGLANG_LANGUAGE_OTHER` (index 0).
|
47 |
+
|
48 |
+
## Model development
|
49 |
+
|
50 |
+
### Data
|
51 |
+
|
52 |
+
- Training data consisted of 3.7k+ files with 220k+ lines of code.
|
53 |
+
It consisted of files from the [Stack dataset](https://huggingface.co/datasets/bigcode/the-stack/viewer/default/train)
|
54 |
+
and of files manually collected from GitHub.
|
55 |
+
- The test set was manually labelled from [Telegram r1 files](https://data-static.usercontent.dev/ml2023-r1-dataset.tar.gz).
|
56 |
+
It consisted of 493 files and 7404 lines of code. Not all classes are present in the test set.
|
57 |
+
- Train files were split into shorter sequences of lines to
|
58 |
+
match the test files' length.
|
59 |
+
- OTHER files from the Telegram files were added to the train set
|
60 |
+
to make up 20% of the data and to the test set to make up 50% of the data.
|
61 |
+
|
62 |
+
### Model
|
63 |
+
|
64 |
+
|
65 |
+
1. Tokenizer - a simple text tokenizer is used to extract
|
66 |
+
keywords and special characters from the code. Numbers,
|
67 |
+
comments and docstrings are removed.
|
68 |
+
2. Text embedding - a TfIdf vectorizer is used to extract
|
69 |
+
features from the train set. TfIdf params are:
|
70 |
+
```python
|
71 |
+
max_features=1000,
|
72 |
+
binary=True,
|
73 |
+
ngram_range=(1,1),
|
74 |
+
tokenizer=tokenize_text,
|
75 |
+
lowercase=False,
|
76 |
+
```
|
77 |
+
3. Classifier - a simple multinomial naive bayes is trained on
|
78 |
+
vectorizer output.
|
79 |
+
|
80 |
+
### Results
|
81 |
+
|
82 |
+
- Accuracy on the test set: 0.82
|
83 |
+
- Accuracy on the validation set: 0.83
|
84 |
+
|
app.py
CHANGED
@@ -9,19 +9,39 @@ tokenizer = None
|
|
9 |
lang_enum = None
|
10 |
|
11 |
TITLE = "Tglang: Programming Language Detection"
|
12 |
-
DESCRIPTION = (
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
18 |
EXAMPLES = [
|
19 |
["def foo():\n print('Hello, world!')", "TGLANG_LANGUAGE_PYTHON"],
|
20 |
-
[
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
]
|
26 |
|
27 |
|
@@ -29,7 +49,13 @@ def init_model():
|
|
29 |
global model, tokenizer, lang_enum
|
30 |
tokenizer = pyonmttok.Tokenizer("conservative")
|
31 |
model = torch.jit.load(Path(__file__).with_name("tglang.pt"))
|
32 |
-
lang_enum =
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
lang_enum = [l.strip() for l in lang_enum if bool(l)]
|
34 |
|
35 |
|
@@ -42,15 +68,18 @@ def predict(text):
|
|
42 |
|
43 |
def create_demo():
|
44 |
init_model()
|
45 |
-
demo = gr.Interface(
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
|
|
|
|
|
|
54 |
return demo
|
55 |
|
56 |
|
|
|
9 |
lang_enum = None
|
10 |
|
11 |
TITLE = "Tglang: Programming Language Detection"
|
12 |
+
# HTML blurb passed to gr.Interface as `description`.
# The whole text is wrapped in a single centered <h5> element, so the string
# must end with the matching closing tag.
DESCRIPTION = (
    '<h5 style="text-align:center">'
    "Enter a code snippet and the model will predict the programming language it is written in.\n\n"
    "Alternatively, it's possible to select one example from the dropdown menu to see how the model works.</h5>"
)
|
17 |
+
# Markdown passed to gr.Interface as `article`.
# Adjacent string literals are concatenated verbatim, so each fragment must
# carry its own separating whitespace.
# TODO: the "this article" link target is empty; fill in the article URL.
FOOTER = (
    "This is a solution for the "
    "[Telegram ML competition 2023, Round 2](https://contest.com/docs/ML-Competition-2023-r2).\n\n"
    "For more details, read [this article]() "
    "or check out [this repo](https://github.com/Rusteam/tglang)"
)
|
23 |
# Sample rows passed to gr.Interface as `examples`.
# Each row is [code snippet, label]; labels use the TGLANG_LANGUAGE_* names
# from the project's supported-language list (C++ is spelled CPLUSPLUS there).
EXAMPLES = [
    ["def foo():\n print('Hello, world!')", "TGLANG_LANGUAGE_PYTHON"],
    [
        'int main() {\n printf("Hello, world!");\n return 0;\n}',
        "TGLANG_LANGUAGE_C",
    ],
    [
        "function foo() {\n console.log('Hello, world!');\n}",
        "TGLANG_LANGUAGE_JAVASCRIPT",
    ],
    [
        'public class HelloWorld {\n public static void main(String[] args) {\n System.out.println("Hello, world!");\n }\n}',
        "TGLANG_LANGUAGE_JAVA",
    ],
    [
        '#include <iostream>\n\nint main() {\n std::cout << "Hello, world!" << std::endl;\n}',
        "TGLANG_LANGUAGE_CPLUSPLUS",
    ],
    [
        'using System;\n\npublic class Program\n{\n public static void Main()\n {\n Console.WriteLine("Hello, world!");\n }\n}',
        "TGLANG_LANGUAGE_CSHARP",
    ],
]
|
46 |
|
47 |
|
|
|
49 |
global model, tokenizer, lang_enum
|
50 |
tokenizer = pyonmttok.Tokenizer("conservative")
|
51 |
model = torch.jit.load(Path(__file__).with_name("tglang.pt"))
|
52 |
+
lang_enum = (
|
53 |
+
Path(__file__)
|
54 |
+
.with_name("langs_enum_r2.txt")
|
55 |
+
.read_text()
|
56 |
+
.strip()
|
57 |
+
.split("\n")
|
58 |
+
)
|
59 |
lang_enum = [l.strip() for l in lang_enum if bool(l)]
|
60 |
|
61 |
|
|
|
68 |
|
69 |
def create_demo():
    """Initialise the model and assemble the Gradio demo.

    Returns:
        The ``gr.Interface`` that feeds a code-snippet textbox to ``predict``
        and shows the result in a second textbox.
    """
    # Populate the module-level model state before the interface is built.
    init_model()

    snippet_box = gr.Textbox(
        label="Code snippet",
        placeholder="Enter code here...",
    )
    prediction_box = gr.Textbox(label="Model prediction")

    return gr.Interface(
        fn=predict,
        inputs=snippet_box,
        outputs=prediction_box,
        title=TITLE,
        description=DESCRIPTION,
        examples=EXAMPLES,
        theme=gr.themes.Monochrome(),
        article=FOOTER,
    )
|
84 |
|
85 |
|