hmnshudhmn24 committed
Commit b19c92c · verified · 1 Parent(s): 7e00db6

Upload 40 files

Files changed (40)
  1. MODEL_CARD.md +2 -0
  2. README.md +2 -3
  3. config/refactor_rules.yaml +12 -0
  4. datasets/README.md +1 -0
  5. datasets/code_repo_raw/README.txt +1 -0
  6. datasets/metadata.json +1 -0
  7. datasets/multilingual_code_clean/README.txt +1 -0
  8. deployment/Dockerfile +6 -0
  9. deployment/huggingface_spaces/README.md +1 -0
  10. deployment/huggingface_spaces/app.py +2 -0
  11. deployment/huggingface_spaces/requirements.txt +1 -0
  12. evaluation/evaluate.py +12 -0
  13. inference/api_server.py +0 -0
  14. inference/cli.py +18 -0
  15. inference/gradio_app.py +2 -0
  16. notes.md +1 -0
  17. requirements.txt +15 -0
  18. setup.cfg +2 -0
  19. src/universal_refactor/__init__.py +1 -0
  20. src/universal_refactor/bug_detector.py +37 -0
  21. src/universal_refactor/code_converter.py +25 -0
  22. src/universal_refactor/embeddings/ast_embeddings.py +1 -0
  23. src/universal_refactor/embeddings/code_embeddings.py +1 -0
  24. src/universal_refactor/long_context_manager.py +1 -0
  25. src/universal_refactor/model.py +41 -0
  26. src/universal_refactor/patch_generator.py +8 -0
  27. src/universal_refactor/pipelines.py +21 -0
  28. src/universal_refactor/refactor_engine.py +47 -0
  29. src/universal_refactor/tokenizer.py +7 -0
  30. src/universal_refactor/utils.py +12 -0
  31. tests/test_python_refactor.py +2 -0
  32. training/distributed/run_deepspeed.sh +1 -0
  33. training/distributed/slurm_job.sh +1 -0
  34. training/distributed/zero3_config.json +1 -0
  35. training/finetune_bugfix.py +1 -0
  36. training/finetune_convert.py +1 -0
  37. training/finetune_refactor.py +1 -0
  38. training/long_context_training.py +1 -0
  39. training/pretrain.py +1 -0
  40. training/tokenizer_training.py +1 -0
MODEL_CARD.md ADDED
@@ -0,0 +1,2 @@
+ # Model Card
+ ...
README.md CHANGED
@@ -1,3 +1,2 @@
- ---
- license: apache-2.0
- ---
+ # Universal-Code-Refactor-32B (Compact Full Implementation)
+ This repo contains a compact yet real implementation...
 
config/refactor_rules.yaml ADDED
@@ -0,0 +1,12 @@
+ python:
+   format: true
+   remove_unused_imports: true
+   inline_simple_functions: true
+
+ java:
+   format: true
+   convert_for_each: true
+
+ javascript:
+   format: false
+   convert_var_to_let: true
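For reference, a minimal sketch of how these per-language flags can be loaded with PyYAML; this mirrors what `RefactorEngine` does further down, and the path is the one added in this commit:

```python
import yaml

# Load the rule table defined above: each top-level key is a language,
# each value a dict of boolean feature flags.
with open("config/refactor_rules.yaml") as f:
    rules = yaml.safe_load(f)

py_rules = rules.get("python", {})
print(py_rules.get("remove_unused_imports"))  # True with the shipped config
```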
datasets/README.md ADDED
@@ -0,0 +1 @@
+ # dataset readme
datasets/code_repo_raw/README.txt ADDED
@@ -0,0 +1 @@
+ raw
datasets/metadata.json ADDED
@@ -0,0 +1 @@
+ {"description":"meta"}
datasets/multilingual_code_clean/README.txt ADDED
@@ -0,0 +1 @@
+ clean
deployment/Dockerfile ADDED
@@ -0,0 +1,6 @@
+ FROM python:3.10-slim
+ WORKDIR /app
+ COPY . /app
+ RUN pip install -r requirements.txt
+ EXPOSE 8000
+ CMD ["uvicorn", "inference.api_server:app", "--host", "0.0.0.0", "--port", "8000"]
deployment/huggingface_spaces/README.md ADDED
@@ -0,0 +1 @@
+ HF space readme
deployment/huggingface_spaces/app.py ADDED
@@ -0,0 +1,2 @@
+ import gradio as gr
+ ...
deployment/huggingface_spaces/requirements.txt ADDED
@@ -0,0 +1 @@
+ gradio
evaluation/evaluate.py ADDED
@@ -0,0 +1,12 @@
+ from radon.complexity import cc_visit
+ from src.universal_refactor.patch_generator import PatchGenerator
+ def compute_complexity(c):
+     try: return sum(b.complexity for b in cc_visit(c))
+     except Exception: return 0
+ def evaluate_pair(a, b):
+     diff = PatchGenerator.unified_diff(a, b)
+     oc, nc = compute_complexity(a), compute_complexity(b)
+     red = (oc - nc) / oc if oc else 0
+     ch = sum(1 for l in diff.splitlines() if l.startswith(('+', '-')))
+     clean = max(0, 1 - min(ch / 400, 1))
+     return {'complexity_reduction': red, 'patch_cleanliness': clean, 'diff': diff}
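A quick usage sketch for `evaluate_pair`; the two code strings below are invented for illustration, and the script assumes it is run from the repo root so the `src` import inside `evaluation/evaluate.py` resolves:

```python
from evaluation.evaluate import evaluate_pair

before = "def f():\n    x = 1\n    return x + 1\n"
after = "def f():\n    return 2\n"

metrics = evaluate_pair(before, after)
# complexity_reduction: fraction of cyclomatic complexity removed (0 when equal)
# patch_cleanliness: 1 minus a penalty that grows with the number of +/- diff lines
print(metrics["complexity_reduction"], metrics["patch_cleanliness"])
```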
inference/api_server.py ADDED
Binary file (1.65 kB)
 
inference/cli.py ADDED
@@ -0,0 +1,18 @@
+ import argparse
+ from src.universal_refactor.pipelines import InferencePipeline
+ from src.universal_refactor.utils import read_text
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--mode', choices=['refactor', 'convert'], default='refactor')
+ parser.add_argument('--file'); parser.add_argument('--lang', default='python')
+ parser.add_argument('--src'); parser.add_argument('--tgt')
+
+ def main():
+     a = parser.parse_args(); pipe = InferencePipeline()
+     code = read_text(a.file)
+     if a.mode == 'refactor':
+         out = pipe.analyze_and_refactor(code, a.lang); print(out)
+     else:
+         print(pipe.convert(code, a.src, a.tgt))
+
+ if __name__ == '__main__': main()
inference/gradio_app.py ADDED
@@ -0,0 +1,2 @@
+ import gradio as gr
+ ...
notes.md ADDED
@@ -0,0 +1 @@
+ # Developer Notes
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ fastapi
+ uvicorn[standard]
+ transformers
+ tokenizers
+ torch>=1.13
+ javalang
+ astor
+ black
+ isort
+ radon
+ pyyaml
+ gitpython
+ python-Levenshtein
+ gradio
+ requests
setup.cfg ADDED
@@ -0,0 +1,2 @@
+ [flake8]
+ max-line-length=120
src/universal_refactor/__init__.py ADDED
@@ -0,0 +1 @@
+ __all__ = ['RefactorEngine','BugDetector','CodeConverter','PatchGenerator','InferencePipeline','LongContextManager']
src/universal_refactor/bug_detector.py ADDED
@@ -0,0 +1,37 @@
+ import ast, re
+ class BugDetector:
+     def detect_none_deref_python(self, code):
+         issues = []
+         try:
+             t = ast.parse(code)
+             for n in ast.walk(t):
+                 if isinstance(n, ast.Attribute) and isinstance(n.value, ast.Name):
+                     issues.append({'line': n.lineno, 'msg': f"Possible None dereference: {n.value.id}"})
+         except Exception: pass
+         return issues
+     def detect_unused_vars_python(self, code):
+         issues = []
+         try:
+             t = ast.parse(code); assigned = set(); used = set()
+             for n in ast.walk(t):
+                 if isinstance(n, ast.Assign):
+                     for t2 in n.targets:
+                         if isinstance(t2, ast.Name): assigned.add(t2.id)
+                 if isinstance(n, ast.Name) and isinstance(n.ctx, ast.Load): used.add(n.id)
+             for v in assigned - used: issues.append({'msg': f"Unused variable {v}"})
+         except Exception: pass
+         return issues
+     def detect_null_checks_java(self, code):
+         issues = []
+         try:
+             for m in re.finditer(r"(\w+)\.\w+\(", code):
+                 v = m.group(1); ctx = code[max(0, m.start()-200):m.end()+200]
+                 if not re.search(rf"if *\( *{v} *!= *null", ctx):
+                     issues.append({'msg': f"Possible null dereference: {v}"})
+         except Exception: pass
+         return issues
+     def analyze(self, code, lang):
+         lang = lang.lower()
+         if lang == 'python': return self.detect_none_deref_python(code) + self.detect_unused_vars_python(code)
+         if lang == 'java': return self.detect_null_checks_java(code)
+         return []
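A minimal sketch of calling the detector; the sample snippet is made up, and both checks are heuristics, so expect false positives:

```python
from src.universal_refactor.bug_detector import BugDetector

sample = "import os\nunused = 1\nos.path\n"
for issue in BugDetector().analyze(sample, "python"):
    # Each issue is a dict with a 'msg' key ('line' is set for deref warnings).
    print(issue)
```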
src/universal_refactor/code_converter.py ADDED
@@ -0,0 +1,25 @@
+ import ast, astor, javalang
+ class CodeConverter:
+     def python_to_java(self, code):
+         try: t = ast.parse(code)
+         except Exception: return "// parse error\n" + code
+         out = ["public class Converted {", "    public static void main(String[] args) {"]
+
+         for n in t.body:
+             if isinstance(n, ast.Expr) and isinstance(n.value, ast.Call) and getattr(n.value.func, 'id', '') == 'print':
+                 args = ", ".join(astor.to_source(a).strip() for a in n.value.args)
+                 out.append(f"        System.out.println({args});")
+         out.append("    }"); out.append("}")
+         return "\n".join(out)
+     def java_to_python(self, code):
+         out = ["# Converted from Java"]
+         try: tree = javalang.parse.parse(code)
+         except Exception: return "# parse error\n" + code
+         for _, m in tree.filter(javalang.tree.MethodDeclaration):
+             out.append(f"def {m.name}():\n    pass")
+         return "\n".join(out)
+     def convert(self, code, src, tgt):
+         s, t = src.lower(), tgt.lower()
+         if s == 'python' and t == 'java': return self.python_to_java(code)
+         if s == 'java' and t == 'python': return self.java_to_python(code)
+         return code
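A usage sketch; note the converter is intentionally shallow (it only maps top-level `print` calls to `System.out.println`), so the comment below shows what the code above actually produces, quoting quirks included:

```python
from src.universal_refactor.code_converter import CodeConverter

print(CodeConverter().convert("print('hello')", "python", "java"))
# public class Converted {
#     public static void main(String[] args) {
#         System.out.println('hello');
#     }
# }
```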
src/universal_refactor/embeddings/ast_embeddings.py ADDED
@@ -0,0 +1 @@
+ def ast_features(c): return {}
src/universal_refactor/embeddings/code_embeddings.py ADDED
@@ -0,0 +1 @@
+ class CodeEmbedder: pass
src/universal_refactor/long_context_manager.py ADDED
@@ -0,0 +1 @@
+ class LongContextManager: pass
src/universal_refactor/model.py ADDED
@@ -0,0 +1,41 @@
+ import torch
+ import torch.nn as nn
+ import math
+
+ class PositionalEncoding(nn.Module):
+     def __init__(self, d_model, max_len=8192):
+         super().__init__()
+         pe = torch.zeros(max_len, d_model)
+         position = torch.arange(0, max_len).unsqueeze(1).float()
+         div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
+         pe[:, 0::2] = torch.sin(position * div_term)
+         pe[:, 1::2] = torch.cos(position * div_term)
+         self.register_buffer("pe", pe)
+
+     def forward(self, x):
+         t = x.size(1)
+         return x + self.pe[:t].unsqueeze(0)
+
+ class SmallCodeTransformer(nn.Module):
+     def __init__(self, vocab_size, d_model=512, nhead=8, nlayers=6, dim_feed=2048, max_len=8192):
+         super().__init__()
+         self.token_emb = nn.Embedding(vocab_size, d_model)
+         self.pos = PositionalEncoding(d_model, max_len)
+         encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feed, dropout=0.1, activation="gelu")
+         self.encoder = nn.TransformerEncoder(encoder_layer, nlayers)
+         self.ln = nn.LayerNorm(d_model)
+         self.head = nn.Linear(d_model, vocab_size, bias=False)
+         self._init_weights()
+
+     def _init_weights(self):
+         nn.init.normal_(self.token_emb.weight, mean=0.0, std=0.02)
+         nn.init.normal_(self.head.weight, mean=0.0, std=0.02)
+
+     def forward(self, input_ids, attention_mask=None):
+         x = self.token_emb(input_ids)
+         x = self.pos(x)
+         x = x.permute(1, 0, 2)
+         x = self.encoder(x, src_key_padding_mask=(attention_mask == 0) if attention_mask is not None else None)
+         x = x.permute(1, 0, 2)
+         x = self.ln(x)
+         return self.head(x)
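A quick shape check for the encoder above (the sizes here are arbitrary, not the shipped config):

```python
import torch
from src.universal_refactor.model import SmallCodeTransformer

model = SmallCodeTransformer(vocab_size=1000)
ids = torch.randint(0, 1000, (2, 16))        # (batch, seq_len)
mask = torch.ones(2, 16, dtype=torch.long)   # 1 = real token, 0 = padding
logits = model(ids, attention_mask=mask)
print(logits.shape)  # torch.Size([2, 16, 1000])
```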
src/universal_refactor/patch_generator.py ADDED
@@ -0,0 +1,8 @@
+ import difflib
+ class PatchGenerator:
+     @staticmethod
+     def unified_diff(a, b, filename='file'):
+         return ''.join(difflib.unified_diff(a.splitlines(True), b.splitlines(True), fromfile=filename, tofile=filename + '.refactored'))
+     @staticmethod
+     def summarize_patch(d, maxl=20):
+         return '\n'.join(d.splitlines()[:maxl])
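Example, with invented strings:

```python
from src.universal_refactor.patch_generator import PatchGenerator

old, new = "x = 1\ny = 2\n", "x = 1\nz = 3\n"
patch = PatchGenerator.unified_diff(old, new, filename="demo.py")
print(PatchGenerator.summarize_patch(patch))  # first 20 lines of the diff
```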
src/universal_refactor/pipelines.py ADDED
@@ -0,0 +1,21 @@
+ from .refactor_engine import RefactorEngine
+ from .bug_detector import BugDetector
+ from .code_converter import CodeConverter
+ from .patch_generator import PatchGenerator
+ from .tokenizer import get_tokenizer
+ from .model import SmallCodeTransformer
+ class InferencePipeline:
+     def __init__(self):
+         self.refactor = RefactorEngine()
+         self.bugs = BugDetector()
+         self.convert_engine = CodeConverter()
+         self.patch = PatchGenerator()
+         self.tokenizer = get_tokenizer()
+         self.model = SmallCodeTransformer(vocab_size=self.tokenizer.vocab_size)
+     def analyze_and_refactor(self, code, lang):
+         issues = self.bugs.analyze(code, lang)
+         ref = self.refactor.refactor(code, lang)
+         diff = self.patch.unified_diff(code, ref, f"code.{lang}")
+         return {'issues': issues, 'refactored': ref, 'diff': diff}
+     def convert(self, code, src, tgt):
+         return {'converted': self.convert_engine.convert(code, src, tgt)}
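Putting it together, an end-to-end sketch; run it from the repo root so the rules YAML resolves, and note that constructing the pipeline may download a tokenizer on first use:

```python
from src.universal_refactor.pipelines import InferencePipeline

pipe = InferencePipeline()
result = pipe.analyze_and_refactor("import os\nprint('hi')\n", "python")
print(result["issues"])      # heuristic findings from BugDetector
print(result["refactored"])  # rule-driven rewrite from RefactorEngine
print(result["diff"])        # unified diff between input and rewrite
```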
src/universal_refactor/refactor_engine.py ADDED
@@ -0,0 +1,47 @@
+ import ast, astor, javalang, re, yaml, black, isort
+ from .utils import run_cmd
+ class RefactorEngine:
+     def __init__(self, rules_path='config/refactor_rules.yaml'):
+         with open(rules_path) as f: self.rules = yaml.safe_load(f)
+     def format_python(self, code):
+         try: return isort.code(black.format_str(code, mode=black.FileMode()))
+         except Exception: return code
+     def remove_unused_imports_python(self, code):
+         try:
+             t = ast.parse(code); im = [n for n in t.body if isinstance(n, (ast.Import, ast.ImportFrom))]
+             used = {n.id for n in ast.walk(t) if isinstance(n, ast.Name)}
+             keep = [i for i in im if any((a.asname or a.name.split('.')[0]) in used for a in i.names)]
+             t.body = keep + [n for n in t.body if n not in im]; return astor.to_source(t)
+         except Exception: return code
+     def inline_simple_functions_python(self, code):
+         try:
+             t = ast.parse(code); funcs = {}
+             for n in t.body:
+                 if isinstance(n, ast.FunctionDef) and len(n.body) == 1 and isinstance(n.body[0], ast.Return):
+                     funcs[n.name] = astor.to_source(n.body[0].value).strip()
+             out = code
+             for f, b in funcs.items(): out = re.sub(rf'(?<!def )\b{f}\(\)', b, out)
+             return out
+         except Exception: return code
+     def refactor_python(self, code):
+         r = self.rules.get('python', {})
+         if r.get('remove_unused_imports'): code = self.remove_unused_imports_python(code)
+         if r.get('inline_simple_functions'): code = self.inline_simple_functions_python(code)
+         if r.get('format'): code = self.format_python(code)
+         return code
+     def convert_java_for_each(self, code):
+         return re.sub(r'for \(int (\w+)=0; \1 < (\w+)\.size\(\); \1\+\+\)', r'for (var x : \2)', code)
+     def refactor_java(self, code):
+         r = self.rules.get('java', {})
+         if r.get('convert_for_each'): code = self.convert_java_for_each(code)
+         return code
+     def refactor_javascript(self, code):
+         r = self.rules.get('javascript', {})
+         if r.get('convert_var_to_let'): code = code.replace('var ', 'let ')
+         return code
+     def refactor(self, code, lang):
+         lang = lang.lower()
+         if lang == 'python': return self.refactor_python(code)
+         if lang == 'java': return self.refactor_java(code)
+         if lang in ('js', 'javascript'): return self.refactor_javascript(code)
+         return code
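A short demonstration of the Python path with the shipped rules; the sample string is invented, and the engine must be constructed from the repo root so the YAML path resolves:

```python
from src.universal_refactor.refactor_engine import RefactorEngine

messy = "import os, sys\n\ndef answer():\n    return 42\n\nprint(answer())\n"
engine = RefactorEngine()
print(engine.refactor(messy, "python"))
# The unused os/sys import is dropped, the answer() call site is replaced
# by its return expression, and the result is formatted with black + isort.
```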
src/universal_refactor/tokenizer.py ADDED
@@ -0,0 +1,7 @@
+ from transformers import AutoTokenizer
+
+ def get_tokenizer(name='Salesforce/codegen-350M-multi'):
+     try:
+         return AutoTokenizer.from_pretrained(name)
+     except Exception:
+         return AutoTokenizer.from_pretrained('gpt2')
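Usage note: the helper falls back to the gpt2 tokenizer when the CodeGen one cannot be fetched, so the sketch below works either way, though the first call may download files:

```python
from src.universal_refactor.tokenizer import get_tokenizer

tok = get_tokenizer()
ids = tok("def add(a, b): return a + b")["input_ids"]
print(len(ids), tok.vocab_size)
```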
src/universal_refactor/utils.py ADDED
@@ -0,0 +1,12 @@
+ import os, subprocess
+
+ def read_text(path):
+     with open(path, 'r', encoding='utf-8', errors='ignore') as f: return f.read()
+
+ def write_text(path, text):
+     os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
+     with open(path, 'w', encoding='utf-8') as f: f.write(text)
+
+ def run_cmd(cmd, cwd=None):
+     p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=cwd)
+     o, e = p.communicate(); return p.returncode, o.decode(), e.decode()
tests/test_python_refactor.py ADDED
@@ -0,0 +1,2 @@
+ from src.universal_refactor.refactor_engine import RefactorEngine
+ ...
training/distributed/run_deepspeed.sh ADDED
@@ -0,0 +1 @@
+ echo run ds
training/distributed/slurm_job.sh ADDED
@@ -0,0 +1 @@
+ echo slurm
training/distributed/zero3_config.json ADDED
@@ -0,0 +1 @@
+ {}
training/finetune_bugfix.py ADDED
@@ -0,0 +1 @@
+ # bugfix placeholder
training/finetune_convert.py ADDED
@@ -0,0 +1 @@
+ # convert placeholder
training/finetune_refactor.py ADDED
@@ -0,0 +1 @@
+ # finetune placeholder
training/long_context_training.py ADDED
@@ -0,0 +1 @@
+ # long context placeholder
training/pretrain.py ADDED
@@ -0,0 +1 @@
+ # pretrain placeholder
training/tokenizer_training.py ADDED
@@ -0,0 +1 @@
+ # tokenizer placeholder