hmnshudhmn24 committed
Commit b19c92c · verified · 1 Parent(s): 7e00db6

Upload 40 files

Files changed (40)
  1. MODEL_CARD.md +2 -0
  2. README.md +2 -3
  3. config/refactor_rules.yaml +12 -0
  4. datasets/README.md +1 -0
  5. datasets/code_repo_raw/README.txt +1 -0
  6. datasets/metadata.json +1 -0
  7. datasets/multilingual_code_clean/README.txt +1 -0
  8. deployment/Dockerfile +6 -0
  9. deployment/huggingface_spaces/README.md +1 -0
  10. deployment/huggingface_spaces/app.py +2 -0
  11. deployment/huggingface_spaces/requirements.txt +1 -0
  12. evaluation/evaluate.py +12 -0
  13. inference/api_server.py +0 -0
  14. inference/cli.py +18 -0
  15. inference/gradio_app.py +2 -0
  16. notes.md +1 -0
  17. requirements.txt +15 -0
  18. setup.cfg +2 -0
  19. src/universal_refactor/__init__.py +1 -0
  20. src/universal_refactor/bug_detector.py +37 -0
  21. src/universal_refactor/code_converter.py +25 -0
  22. src/universal_refactor/embeddings/ast_embeddings.py +1 -0
  23. src/universal_refactor/embeddings/code_embeddings.py +1 -0
  24. src/universal_refactor/long_context_manager.py +1 -0
  25. src/universal_refactor/model.py +41 -0
  26. src/universal_refactor/patch_generator.py +8 -0
  27. src/universal_refactor/pipelines.py +21 -0
  28. src/universal_refactor/refactor_engine.py +47 -0
  29. src/universal_refactor/tokenizer.py +7 -0
  30. src/universal_refactor/utils.py +12 -0
  31. tests/test_python_refactor.py +2 -0
  32. training/distributed/run_deepspeed.sh +1 -0
  33. training/distributed/slurm_job.sh +1 -0
  34. training/distributed/zero3_config.json +1 -0
  35. training/finetune_bugfix.py +1 -0
  36. training/finetune_convert.py +1 -0
  37. training/finetune_refactor.py +1 -0
  38. training/long_context_training.py +1 -0
  39. training/pretrain.py +1 -0
  40. training/tokenizer_training.py +1 -0
MODEL_CARD.md ADDED
@@ -0,0 +1,2 @@
+ # Model Card
+ ...
README.md CHANGED
@@ -1,3 +1,2 @@
- ---
- license: apache-2.0
- ---
+ # Universal-Code-Refactor-32B (Compact Full Implementation)
+ This repo contains a compact yet real implementation...
 
config/refactor_rules.yaml ADDED
@@ -0,0 +1,12 @@
+ python:
+   format: true
+   remove_unused_imports: true
+   inline_simple_functions: true
+
+ java:
+   format: true
+   convert_for_each: true
+
+ javascript:
+   format: false
+   convert_var_to_let: true
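For reference, a minimal sketch of how these per-language flags can be loaded with PyYAML; this mirrors what `RefactorEngine` does further down, and the path is the one added in this commit:

```python
import yaml

# Load the rule table defined above: each top-level key is a language,
# each value a dict of boolean feature flags.
with open("config/refactor_rules.yaml") as f:
    rules = yaml.safe_load(f)

py_rules = rules.get("python", {})
print(py_rules.get("remove_unused_imports"))  # True with the shipped config
```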
datasets/README.md ADDED
@@ -0,0 +1 @@
+ # dataset readme
datasets/code_repo_raw/README.txt ADDED
@@ -0,0 +1 @@
+ raw
datasets/metadata.json ADDED
@@ -0,0 +1 @@
+ {"description":"meta"}
datasets/multilingual_code_clean/README.txt ADDED
@@ -0,0 +1 @@
+ clean
deployment/Dockerfile ADDED
@@ -0,0 +1,6 @@
+ FROM python:3.10-slim
+ WORKDIR /app
+ COPY . /app
+ RUN pip install -r requirements.txt
+ EXPOSE 8000
+ CMD ["uvicorn", "inference.api_server:app", "--host", "0.0.0.0", "--port", "8000"]
deployment/huggingface_spaces/README.md ADDED
@@ -0,0 +1 @@
+ HF space readme
deployment/huggingface_spaces/app.py ADDED
@@ -0,0 +1,2 @@
+ import gradio as gr
+ ...
deployment/huggingface_spaces/requirements.txt ADDED
@@ -0,0 +1 @@
+ gradio
evaluation/evaluate.py ADDED
@@ -0,0 +1,12 @@
+ from radon.complexity import cc_visit
+ from src.universal_refactor.patch_generator import PatchGenerator
+ def compute_complexity(c):
+     try: return sum(b.complexity for b in cc_visit(c))
+     except Exception: return 0
+ def evaluate_pair(a, b):
+     diff = PatchGenerator.unified_diff(a, b)
+     oc, nc = compute_complexity(a), compute_complexity(b)
+     red = (oc - nc) / oc if oc else 0
+     ch = sum(1 for l in diff.splitlines() if l.startswith(('+', '-')))
+     clean = max(0, 1 - min(ch / 400, 1))
+     return {'complexity_reduction': red, 'patch_cleanliness': clean, 'diff': diff}
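A quick usage sketch for `evaluate_pair`; the two code strings below are invented for illustration, and the script assumes it is run from the repo root so the `src` import inside `evaluation/evaluate.py` resolves:

```python
from evaluation.evaluate import evaluate_pair

before = "def f():\n    x = 1\n    return x + 1\n"
after = "def f():\n    return 2\n"

metrics = evaluate_pair(before, after)
# complexity_reduction: fraction of cyclomatic complexity removed (0 when equal)
# patch_cleanliness: 1 minus a penalty that grows with the number of +/- diff lines
print(metrics["complexity_reduction"], metrics["patch_cleanliness"])
```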
inference/api_server.py ADDED
Binary file (1.65 kB)
 
inference/cli.py ADDED
@@ -0,0 +1,18 @@
+ import argparse
+ from src.universal_refactor.pipelines import InferencePipeline
+ from src.universal_refactor.utils import read_text
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--mode', choices=['refactor', 'convert'], default='refactor')
+ parser.add_argument('--file'); parser.add_argument('--lang', default='python')
+ parser.add_argument('--src'); parser.add_argument('--tgt')
+
+ def main():
+     a = parser.parse_args(); pipe = InferencePipeline()
+     code = read_text(a.file)
+     if a.mode == 'refactor':
+         out = pipe.analyze_and_refactor(code, a.lang); print(out)
+     else:
+         print(pipe.convert(code, a.src, a.tgt))
+
+ if __name__ == '__main__': main()
inference/gradio_app.py ADDED
@@ -0,0 +1,2 @@
+ import gradio as gr
+ ...
notes.md ADDED
@@ -0,0 +1 @@
+ # Developer Notes
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ fastapi
+ uvicorn[standard]
+ transformers
+ tokenizers
+ torch>=1.13
+ javalang
+ astor
+ black
+ isort
+ radon
+ pyyaml
+ gitpython
+ python-Levenshtein
+ gradio
+ requests
setup.cfg ADDED
@@ -0,0 +1,2 @@
+ [flake8]
+ max-line-length=120
src/universal_refactor/__init__.py ADDED
@@ -0,0 +1 @@
+ __all__ = ['RefactorEngine','BugDetector','CodeConverter','PatchGenerator','InferencePipeline','LongContextManager']
src/universal_refactor/bug_detector.py ADDED
@@ -0,0 +1,37 @@
+ import ast, re
+ class BugDetector:
+     def detect_none_deref_python(self, code):
+         issues = []
+         try:
+             t = ast.parse(code)
+             for n in ast.walk(t):
+                 if isinstance(n, ast.Attribute) and isinstance(n.value, ast.Name):
+                     issues.append({'line': n.lineno, 'msg': f"Possible None dereference: {n.value.id}"})
+         except Exception: pass
+         return issues
+     def detect_unused_vars_python(self, code):
+         issues = []
+         try:
+             t = ast.parse(code); assigned = set(); used = set()
+             for n in ast.walk(t):
+                 if isinstance(n, ast.Assign):
+                     for t2 in n.targets:
+                         if isinstance(t2, ast.Name): assigned.add(t2.id)
+                 if isinstance(n, ast.Name) and isinstance(n.ctx, ast.Load): used.add(n.id)
+             for v in assigned - used: issues.append({'msg': f"Unused variable {v}"})
+         except Exception: pass
+         return issues
+     def detect_null_checks_java(self, code):
+         issues = []
+         try:
+             for m in re.finditer(r"(\w+)\.\w+\(", code):
+                 v = m.group(1); ctx = code[max(0, m.start()-200):m.end()+200]
+                 if not re.search(rf"if *\( *{v} *!= *null", ctx):
+                     issues.append({'msg': f"Possible null dereference: {v}"})
+         except Exception: pass
+         return issues
+     def analyze(self, code, lang):
+         lang = lang.lower()
+         if lang == 'python': return self.detect_none_deref_python(code) + self.detect_unused_vars_python(code)
+         if lang == 'java': return self.detect_null_checks_java(code)
+         return []
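A minimal sketch of calling the detector; the sample snippet is made up, and both checks are heuristics, so expect false positives:

```python
from src.universal_refactor.bug_detector import BugDetector

sample = "import os\nunused = 1\nos.path\n"
for issue in BugDetector().analyze(sample, "python"):
    # Each issue is a dict with a 'msg' key ('line' is set for deref warnings).
    print(issue)
```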
src/universal_refactor/code_converter.py ADDED
@@ -0,0 +1,25 @@
+ import ast, astor, javalang
+ class CodeConverter:
+     def python_to_java(self, code):
+         try: t = ast.parse(code)
+         except Exception: return "// parse error\n" + code
+         out = ["public class Converted {", "    public static void main(String[] args) {"]
+
+         for n in t.body:
+             if isinstance(n, ast.Expr) and isinstance(n.value, ast.Call) and getattr(n.value.func, 'id', '') == 'print':
+                 args = ", ".join(astor.to_source(a).strip() for a in n.value.args)
+                 out.append(f"        System.out.println({args});")
+         out.append("    }"); out.append("}")
+         return "\n".join(out)
+     def java_to_python(self, code):
+         out = ["# Converted from Java"]
+         try: tree = javalang.parse.parse(code)
+         except Exception: return "# parse error\n" + code
+         for _, m in tree.filter(javalang.tree.MethodDeclaration):
+             out.append(f"def {m.name}():\n    pass")
+         return "\n".join(out)
+     def convert(self, code, src, tgt):
+         s, t = src.lower(), tgt.lower()
+         if s == 'python' and t == 'java': return self.python_to_java(code)
+         if s == 'java' and t == 'python': return self.java_to_python(code)
+         return code
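A usage sketch; note the converter is intentionally shallow (it only maps top-level `print` calls to `System.out.println`), so the comment below shows what the code above actually produces, quoting quirks included:

```python
from src.universal_refactor.code_converter import CodeConverter

print(CodeConverter().convert("print('hello')", "python", "java"))
# public class Converted {
#     public static void main(String[] args) {
#         System.out.println('hello');
#     }
# }
```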
src/universal_refactor/embeddings/ast_embeddings.py ADDED
@@ -0,0 +1 @@
+ def ast_features(c): return {}
src/universal_refactor/embeddings/code_embeddings.py ADDED
@@ -0,0 +1 @@
+ class CodeEmbedder: pass
src/universal_refactor/long_context_manager.py ADDED
@@ -0,0 +1 @@
+ class LongContextManager: pass
src/universal_refactor/model.py ADDED
@@ -0,0 +1,41 @@
+ import torch
+ import torch.nn as nn
+ import math
+
+ class PositionalEncoding(nn.Module):
+     def __init__(self, d_model, max_len=8192):
+         super().__init__()
+         pe = torch.zeros(max_len, d_model)
+         position = torch.arange(0, max_len).unsqueeze(1).float()
+         div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
+         pe[:, 0::2] = torch.sin(position * div_term)
+         pe[:, 1::2] = torch.cos(position * div_term)
+         self.register_buffer("pe", pe)
+
+     def forward(self, x):
+         t = x.size(1)
+         return x + self.pe[:t].unsqueeze(0)
+
+ class SmallCodeTransformer(nn.Module):
+     def __init__(self, vocab_size, d_model=512, nhead=8, nlayers=6, dim_feed=2048, max_len=8192):
+         super().__init__()
+         self.token_emb = nn.Embedding(vocab_size, d_model)
+         self.pos = PositionalEncoding(d_model, max_len)
+         encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feed, dropout=0.1, activation="gelu")
+         self.encoder = nn.TransformerEncoder(encoder_layer, nlayers)
+         self.ln = nn.LayerNorm(d_model)
+         self.head = nn.Linear(d_model, vocab_size, bias=False)
+         self._init_weights()
+
+     def _init_weights(self):
+         nn.init.normal_(self.token_emb.weight, mean=0.0, std=0.02)
+         nn.init.normal_(self.head.weight, mean=0.0, std=0.02)
+
+     def forward(self, input_ids, attention_mask=None):
+         x = self.token_emb(input_ids)
+         x = self.pos(x)
+         x = x.permute(1, 0, 2)
+         x = self.encoder(x, src_key_padding_mask=(attention_mask == 0) if attention_mask is not None else None)
+         x = x.permute(1, 0, 2)
+         x = self.ln(x)
+         return self.head(x)
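A quick shape check for the encoder above (the sizes here are arbitrary, not the shipped config):

```python
import torch
from src.universal_refactor.model import SmallCodeTransformer

model = SmallCodeTransformer(vocab_size=1000)
ids = torch.randint(0, 1000, (2, 16))        # (batch, seq_len)
mask = torch.ones(2, 16, dtype=torch.long)   # 1 = real token, 0 = padding
logits = model(ids, attention_mask=mask)
print(logits.shape)  # torch.Size([2, 16, 1000])
```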
src/universal_refactor/patch_generator.py ADDED
@@ -0,0 +1,8 @@
+ import difflib
+ class PatchGenerator:
+     @staticmethod
+     def unified_diff(a, b, filename='file'):
+         return ''.join(difflib.unified_diff(a.splitlines(True), b.splitlines(True), fromfile=filename, tofile=filename + '.refactored'))
+     @staticmethod
+     def summarize_patch(d, maxl=20):
+         return '\n'.join(d.splitlines()[:maxl])
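Example, with invented strings:

```python
from src.universal_refactor.patch_generator import PatchGenerator

old, new = "x = 1\ny = 2\n", "x = 1\nz = 3\n"
patch = PatchGenerator.unified_diff(old, new, filename="demo.py")
print(PatchGenerator.summarize_patch(patch))  # first 20 lines of the diff
```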
src/universal_refactor/pipelines.py ADDED
@@ -0,0 +1,21 @@
+ from .refactor_engine import RefactorEngine
+ from .bug_detector import BugDetector
+ from .code_converter import CodeConverter
+ from .patch_generator import PatchGenerator
+ from .tokenizer import get_tokenizer
+ from .model import SmallCodeTransformer
+ class InferencePipeline:
+     def __init__(self):
+         self.refactor = RefactorEngine()
+         self.bugs = BugDetector()
+         self.convert_engine = CodeConverter()
+         self.patch = PatchGenerator()
+         self.tokenizer = get_tokenizer()
+         self.model = SmallCodeTransformer(vocab_size=self.tokenizer.vocab_size)
+     def analyze_and_refactor(self, code, lang):
+         issues = self.bugs.analyze(code, lang)
+         ref = self.refactor.refactor(code, lang)
+         diff = self.patch.unified_diff(code, ref, f"code.{lang}")
+         return {'issues': issues, 'refactored': ref, 'diff': diff}
+     def convert(self, code, src, tgt):
+         return {'converted': self.convert_engine.convert(code, src, tgt)}
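Putting it together, an end-to-end sketch; run it from the repo root so the rules YAML resolves, and note that constructing the pipeline may download a tokenizer on first use:

```python
from src.universal_refactor.pipelines import InferencePipeline

pipe = InferencePipeline()
result = pipe.analyze_and_refactor("import os\nprint('hi')\n", "python")
print(result["issues"])      # heuristic findings from BugDetector
print(result["refactored"])  # rule-driven rewrite from RefactorEngine
print(result["diff"])        # unified diff between input and rewrite
```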
src/universal_refactor/refactor_engine.py ADDED
@@ -0,0 +1,47 @@
+ import ast, astor, javalang, re, yaml, black, isort
+ from .utils import run_cmd
+ class RefactorEngine:
+     def __init__(self, rules_path='config/refactor_rules.yaml'):
+         with open(rules_path) as f: self.rules = yaml.safe_load(f)
+     def format_python(self, code):
+         try: return isort.code(black.format_str(code, mode=black.FileMode()))
+         except Exception: return code
+     def remove_unused_imports_python(self, code):
+         try:
+             t = ast.parse(code); im = [n for n in t.body if isinstance(n, (ast.Import, ast.ImportFrom))]
+             used = {n.id for n in ast.walk(t) if isinstance(n, ast.Name)}
+             keep = [i for i in im if any((a.asname or a.name.split('.')[0]) in used for a in i.names)]
+             t.body = keep + [n for n in t.body if n not in im]; return astor.to_source(t)
+         except Exception: return code
+     def inline_simple_functions_python(self, code):
+         try:
+             t = ast.parse(code); funcs = {}
+             for n in t.body:
+                 if isinstance(n, ast.FunctionDef) and len(n.body) == 1 and isinstance(n.body[0], ast.Return):
+                     funcs[n.name] = astor.to_source(n.body[0].value).strip()
+             out = code
+             for f, b in funcs.items(): out = re.sub(rf'(?<!def )\b{f}\(\)', b, out)
+             return out
+         except Exception: return code
+     def refactor_python(self, code):
+         r = self.rules.get('python', {})
+         if r.get('remove_unused_imports'): code = self.remove_unused_imports_python(code)
+         if r.get('inline_simple_functions'): code = self.inline_simple_functions_python(code)
+         if r.get('format'): code = self.format_python(code)
+         return code
+     def convert_java_for_each(self, code):
+         return re.sub(r'for \(int (\w+)=0; \1 < (\w+)\.size\(\); \1\+\+\)', r'for (var x : \2)', code)
+     def refactor_java(self, code):
+         r = self.rules.get('java', {})
+         if r.get('convert_for_each'): code = self.convert_java_for_each(code)
+         return code
+     def refactor_javascript(self, code):
+         r = self.rules.get('javascript', {})
+         if r.get('convert_var_to_let'): code = code.replace('var ', 'let ')
+         return code
+     def refactor(self, code, lang):
+         lang = lang.lower()
+         if lang == 'python': return self.refactor_python(code)
+         if lang == 'java': return self.refactor_java(code)
+         if lang in ('js', 'javascript'): return self.refactor_javascript(code)
+         return code
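A short demonstration of the Python path with the shipped rules; the sample string is invented, and the engine must be constructed from the repo root so the YAML path resolves:

```python
from src.universal_refactor.refactor_engine import RefactorEngine

messy = "import os, sys\n\ndef answer():\n    return 42\n\nprint(answer())\n"
engine = RefactorEngine()
print(engine.refactor(messy, "python"))
# The unused os/sys import is dropped, the answer() call site is replaced
# by its return expression, and the result is formatted with black + isort.
```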
src/universal_refactor/tokenizer.py ADDED
@@ -0,0 +1,7 @@
+ from transformers import AutoTokenizer
+
+ def get_tokenizer(name='Salesforce/codegen-350M-multi'):
+     try:
+         return AutoTokenizer.from_pretrained(name)
+     except Exception:
+         return AutoTokenizer.from_pretrained('gpt2')
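Usage note: the helper falls back to the gpt2 tokenizer when the CodeGen one cannot be fetched, so the sketch below works either way, though the first call may download files:

```python
from src.universal_refactor.tokenizer import get_tokenizer

tok = get_tokenizer()
ids = tok("def add(a, b): return a + b")["input_ids"]
print(len(ids), tok.vocab_size)
```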
src/universal_refactor/utils.py ADDED
@@ -0,0 +1,12 @@
+ import os, subprocess
+
+ def read_text(path):
+     with open(path, 'r', encoding='utf-8', errors='ignore') as f: return f.read()
+
+ def write_text(path, text):
+     os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
+     with open(path, 'w', encoding='utf-8') as f: f.write(text)
+
+ def run_cmd(cmd, cwd=None):
+     p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=cwd)
+     o, e = p.communicate(); return p.returncode, o.decode(), e.decode()
tests/test_python_refactor.py ADDED
@@ -0,0 +1,2 @@
+ from src.universal_refactor.refactor_engine import RefactorEngine
+ ...
training/distributed/run_deepspeed.sh ADDED
@@ -0,0 +1 @@
+ echo run ds
training/distributed/slurm_job.sh ADDED
@@ -0,0 +1 @@
+ echo slurm
training/distributed/zero3_config.json ADDED
@@ -0,0 +1 @@
+ {}
training/finetune_bugfix.py ADDED
@@ -0,0 +1 @@
+ # bugfix placeholder
training/finetune_convert.py ADDED
@@ -0,0 +1 @@
+ # convert placeholder
training/finetune_refactor.py ADDED
@@ -0,0 +1 @@
+ # finetune placeholder
training/long_context_training.py ADDED
@@ -0,0 +1 @@
+ # long context placeholder
training/pretrain.py ADDED
@@ -0,0 +1 @@
+ # pretrain placeholder
training/tokenizer_training.py ADDED
@@ -0,0 +1 @@
+ # tokenizer placeholder