Spaces:
Build error
Build error
Commit
·
a13a8c8
1
Parent(s):
dd4f105
updated with new model
Browse files- app.py +47 -18
- cr_tokenizer.json +0 -0
- crv3.keras +2 -2
- requirements.txt +2 -0
app.py
CHANGED
|
@@ -1,23 +1,56 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import numpy as np
|
| 3 |
import tensorflow as tf
|
|
|
|
|
|
|
|
|
|
| 4 |
from tokenizers import Tokenizer
|
| 5 |
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
| 6 |
-
import re
|
| 7 |
|
| 8 |
-
#
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
def replace_java_comments(code: str) -> str:
|
| 13 |
"""Replaces Java comments with placeholders."""
|
| 14 |
-
code = re.sub(r"//.*", "
|
| 15 |
-
code = re.sub(r"/\*[\s\S]*?\*/", "
|
| 16 |
-
return code.strip() #
|
| 17 |
|
| 18 |
def tokenize_java_code(code: str, max_length=100):
|
| 19 |
-
"""
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
padded_sequence = pad_sequences([encoded], maxlen=max_length, padding="post")[0]
|
| 22 |
return np.array(padded_sequence).reshape(1, -1) # Ensure correct shape for model
|
| 23 |
|
|
@@ -32,18 +65,14 @@ def classify_code(input_text, input_file):
|
|
| 32 |
if not code.strip(): # Ensure input is not empty
|
| 33 |
return "Please provide a Java code snippet."
|
| 34 |
|
| 35 |
-
# Replace comments before tokenization
|
| 36 |
-
cleaned_code = replace_java_comments(code)
|
| 37 |
-
|
| 38 |
# Tokenize and predict
|
| 39 |
-
tokenized_code = tokenize_java_code(
|
| 40 |
prediction = model.predict(tokenized_code)[0][0]
|
| 41 |
|
| 42 |
-
threshold = 0.52
|
| 43 |
-
prediction = (prediction > threshold).astype(int) # Convert
|
| 44 |
|
| 45 |
-
|
| 46 |
-
return "Readable" if prediction > 0.5 else "Unreadable"
|
| 47 |
|
| 48 |
gr.Interface(
|
| 49 |
fn=classify_code,
|
|
@@ -52,7 +81,7 @@ gr.Interface(
|
|
| 52 |
gr.File(type="binary", label="Upload Java File (.java)")
|
| 53 |
],
|
| 54 |
outputs=gr.Text(label="Readability Classification"),
|
| 55 |
-
title="Java Code Readability Classifier",
|
| 56 |
description="Upload a Java file or paste a Java code snippet to check if it's readable or unreadable.",
|
| 57 |
allow_flagging="never"
|
| 58 |
).launch()
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import numpy as np
|
| 3 |
import tensorflow as tf
|
| 4 |
+
import re
|
| 5 |
+
from tree_sitter import Language, Parser
|
| 6 |
+
import tree_sitter_languages # Pre-built parsers for multiple languages
|
| 7 |
from tokenizers import Tokenizer
|
| 8 |
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
|
|
|
| 9 |
|
| 10 |
+
# Inference artifacts are loaded once, at import time, from paths relative to
# the working directory.
# NOTE(review): confirm "syntax_bpe_tokenizer.json" is actually present in the
# repository; verify the filename against the tokenizer file that is shipped.
tokenizer = Tokenizer.from_file("syntax_bpe_tokenizer.json")
model = tf.keras.models.load_model("crv3.keras")

# Shared Tree-Sitter parser configured for Java. get_parser() builds a Parser
# and sets its language in one call.
# NOTE(review): tree_sitter_languages presumably requires an older tree_sitter
# release (set_language-style API); confirm the installed versions match.
parser = tree_sitter_languages.get_parser("java")
|
| 15 |
+
|
| 16 |
+
def syntax_aware_tokenize(code):
    """Tokenize Java code into the text of its syntax-tree leaf nodes.

    The code is parsed with the module-level Tree-Sitter ``parser`` and the
    tree is walked depth-first, left to right. The decoded text of every
    node without children is collected in that order.

    Args:
        code (str): Java source code.

    Returns:
        list[str]: Text of each leaf node, in source order.
    """
    tree = parser.parse(bytes(code, "utf8"))
    tokens = []

    # Use an explicit stack rather than recursion: deeply nested input (long
    # operator chains, heavily nested blocks) can produce a tree deeper than
    # Python's recursion limit and would raise RecursionError.
    pending = [tree.root_node]
    while pending:
        node = pending.pop()
        if node.child_count == 0:  # Leaf node
            tokens.append(node.text.decode("utf-8"))
        else:
            # Push children reversed so they are popped left to right.
            pending.extend(reversed(node.children))

    return tokens
|
| 31 |
|
| 32 |
# Matches, in priority order: text blocks, string literals, char literals,
# line comments and block comments. Literals are matched only so that
# comment-like text inside them is skipped instead of being rewritten.
_JAVA_COMMENT_OR_LITERAL = re.compile(
    r'(?P<text_block>"""(?:\\[\s\S]|[^\\])*?""")'
    r'|(?P<string>"(?:\\.|[^"\\\n])*")'
    r"|(?P<char>'(?:\\.|[^'\\\n])*')"
    r"|(?P<line>//.*)"
    r"|(?P<block>/\*[\s\S]*?\*/)"
)


def replace_java_comments(code: str) -> str:
    """Replace Java comments with fixed placeholders.

    Line comments become ``" // "`` and block comments become ``" /**/ "``.
    The scan is a single left-to-right pass, so ``//`` or ``/* */`` inside a
    string, char or text-block literal is left untouched, and ``//`` inside a
    block comment cannot swallow the closing ``*/``.

    Args:
        code: Java source code.

    Returns:
        The code with comment bodies replaced by placeholders and leading and
        trailing whitespace of the whole text stripped.
    """

    def _placeholder(match):
        kind = match.lastgroup
        if kind == "line":
            return " // "
        if kind == "block":
            return " /**/ "
        return match.group(0)  # Literal: keep exactly as written

    return _JAVA_COMMENT_OR_LITERAL.sub(_placeholder, code).strip()
|
| 37 |
|
| 38 |
def tokenize_java_code(code: str, max_length=100):
    """Convert a Java snippet into a padded batch of BPE token ids.

    Steps: replace comments with placeholders, extract syntax-tree leaf
    tokens, BPE-encode the space-joined tokens, then post-pad to
    ``max_length``.

    Args:
        code (str): Java code snippet.
        max_length (int): Maximum sequence length passed to ``pad_sequences``.

    Returns:
        np.array: The padded id sequence reshaped to a single row.
    """
    leaf_tokens = syntax_aware_tokenize(replace_java_comments(code))
    token_ids = tokenizer.encode(" ".join(leaf_tokens)).ids

    # pad_sequences works on a batch, so wrap the single sequence in a list
    # and take the only row back out.
    batch = pad_sequences([token_ids], maxlen=max_length, padding="post")
    return np.array(batch[0]).reshape(1, -1)  # One row, as a batch of one
|
| 56 |
|
|
|
|
| 65 |
if not code.strip(): # Ensure input is not empty
|
| 66 |
return "Please provide a Java code snippet."
|
| 67 |
|
|
|
|
|
|
|
|
|
|
| 68 |
# Tokenize and predict
|
| 69 |
+
tokenized_code = tokenize_java_code(code)
|
| 70 |
prediction = model.predict(tokenized_code)[0][0]
|
| 71 |
|
| 72 |
+
threshold = 0.52 # Adjust threshold for classification
|
| 73 |
+
prediction = (prediction > threshold).astype(int) # Convert probability to binary
|
| 74 |
|
| 75 |
+
return "Readable" if prediction == 1 else "Unreadable"
|
|
|
|
| 76 |
|
| 77 |
gr.Interface(
|
| 78 |
fn=classify_code,
|
|
|
|
| 81 |
gr.File(type="binary", label="Upload Java File (.java)")
|
| 82 |
],
|
| 83 |
outputs=gr.Text(label="Readability Classification"),
|
| 84 |
+
title="Java Code Readability Classifier (AST + BPE)",
|
| 85 |
description="Upload a Java file or paste a Java code snippet to check if it's readable or unreadable.",
|
| 86 |
allow_flagging="never"
|
| 87 |
).launch()
|
cr_tokenizer.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
crv3.keras
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b8ad1fb4d7eee878b3ce7282bdb8a5b428b2b940cdb615e662c649df4685f0e9
|
| 3 |
+
size 2357365
|
requirements.txt
CHANGED
|
@@ -1,3 +1,5 @@
|
|
| 1 |
gradio
|
| 2 |
tensorflow
|
| 3 |
tokenizers
|
|
|
|
|
|
|
|
|
| 1 |
gradio
|
| 2 |
tensorflow
|
| 3 |
tokenizers
|
| 4 |
+
tree_sitter
|
| 5 |
+
tree_sitter_languages
|