Upload 40 files
Browse files- MODEL_CARD.md +2 -0
- README.md +2 -3
- config/refactor_rules.yaml +12 -0
- datasets/README.md +1 -0
- datasets/code_repo_raw/README.txt +1 -0
- datasets/metadata.json +1 -0
- datasets/multilingual_code_clean/README.txt +1 -0
- deployment/Dockerfile +6 -0
- deployment/huggingface_spaces/README.md +1 -0
- deployment/huggingface_spaces/app.py +2 -0
- deployment/huggingface_spaces/requirements.txt +1 -0
- evaluation/evaluate.py +12 -0
- inference/api_server.py +0 -0
- inference/cli.py +18 -0
- inference/gradio_app.py +2 -0
- notes.md +1 -0
- requirements.txt +14 -0
- setup.cfg +2 -0
- src/universal_refactor/__init__.py +1 -0
- src/universal_refactor/bug_detector.py +37 -0
- src/universal_refactor/code_converter.py +25 -0
- src/universal_refactor/embeddings/ast_embeddings.py +1 -0
- src/universal_refactor/embeddings/code_embeddings.py +1 -0
- src/universal_refactor/long_context_manager.py +1 -0
- src/universal_refactor/model.py +41 -0
- src/universal_refactor/patch_generator.py +8 -0
- src/universal_refactor/pipelines.py +21 -0
- src/universal_refactor/refactor_engine.py +47 -0
- src/universal_refactor/tokenizer.py +7 -0
- src/universal_refactor/utils.py +12 -0
- tests/test_python_refactor.py +2 -0
- training/distributed/run_deepspeed.sh +1 -0
- training/distributed/slurm_job.sh +1 -0
- training/distributed/zero3_config.json +1 -0
- training/finetune_bugfix.py +1 -0
- training/finetune_convert.py +1 -0
- training/finetune_refactor.py +1 -0
- training/long_context_training.py +1 -0
- training/pretrain.py +1 -0
- training/tokenizer_training.py +1 -0
MODEL_CARD.md
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Model Card
|
| 2 |
+
...
|
README.md
CHANGED
|
@@ -1,3 +1,2 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
---
|
|
|
|
| 1 |
+
# Universal-Code-Refactor-32B (Compact Full Implementation)
|
| 2 |
+
This repo contains a compact yet real implementation...
|
|
|
config/refactor_rules.yaml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
python:
|
| 2 |
+
format: true
|
| 3 |
+
remove_unused_imports: true
|
| 4 |
+
inline_simple_functions: true
|
| 5 |
+
|
| 6 |
+
java:
|
| 7 |
+
format: true
|
| 8 |
+
convert_for_each: true
|
| 9 |
+
|
| 10 |
+
javascript:
|
| 11 |
+
format: false
|
| 12 |
+
convert_var_to_let: true
|
datasets/README.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# dataset readme
|
datasets/code_repo_raw/README.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
raw
|
datasets/metadata.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"description":"meta"}
|
datasets/multilingual_code_clean/README.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
clean
|
deployment/Dockerfile
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
WORKDIR /app
|
| 3 |
+
COPY . /app
|
| 4 |
+
RUN pip install -r requirements.txt
|
| 5 |
+
EXPOSE 8000
|
| 6 |
+
CMD ['uvicorn','inference.api_server:app','--host','0.0.0.0','--port','8000']
|
deployment/huggingface_spaces/README.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
HF space readme
|
deployment/huggingface_spaces/app.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
...
|
deployment/huggingface_spaces/requirements.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
gradio
|
evaluation/evaluate.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from radon.complexity import cc_visit
|
| 2 |
+
from src.universal_refactor.patch_generator import PatchGenerator
|
| 3 |
+
def compute_complexity(c):
|
| 4 |
+
try: return sum(b.complexity for b in cc_visit(c))
|
| 5 |
+
except: return 0
|
| 6 |
+
def evaluate_pair(a,b):
|
| 7 |
+
diff=PatchGenerator.unified_diff(a,b)
|
| 8 |
+
oc,nc=compute_complexity(a),compute_complexity(b)
|
| 9 |
+
red=(oc-nc)/oc if oc else 0
|
| 10 |
+
ch=sum(1 for l in diff.splitlines() if l.startswith('+') or l.startswith('-'))
|
| 11 |
+
clean=max(0,1-min(ch/400,1))
|
| 12 |
+
return {'complexity_reduction':red,'patch_cleanliness':clean,'diff':diff}
|
inference/api_server.py
ADDED
|
Binary file (1.65 kB). View file
|
|
|
inference/cli.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
from src.universal_refactor.pipelines import InferencePipeline
|
| 3 |
+
from src.universal_refactor.utils import read_text
|
| 4 |
+
|
| 5 |
+
parser=argparse.ArgumentParser()
|
| 6 |
+
parser.add_argument('--mode',choices=['refactor','convert'],default='refactor')
|
| 7 |
+
parser.add_argument('--file');parser.add_argument('--lang',default='python')
|
| 8 |
+
parser.add_argument('--src');parser.add_argument('--tgt')
|
| 9 |
+
|
| 10 |
+
def main():
|
| 11 |
+
a=parser.parse_args();pipe=InferencePipeline()
|
| 12 |
+
code=read_text(a.file)
|
| 13 |
+
if a.mode=='refactor':
|
| 14 |
+
out=pipe.analyze_and_refactor(code,a.lang);print(out)
|
| 15 |
+
else:
|
| 16 |
+
print(pipe.convert(code,a.src,a.tgt))
|
| 17 |
+
|
| 18 |
+
if __name__=='__main__': main()
|
inference/gradio_app.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
...
|
notes.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Developer Notes
|
requirements.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn[standard]
|
| 3 |
+
transformers
|
| 4 |
+
tokenizers
|
| 5 |
+
torch>=1.13
|
| 6 |
+
javalang
|
| 7 |
+
black
|
| 8 |
+
isort
|
| 9 |
+
radon
|
| 10 |
+
pyyaml
|
| 11 |
+
gitpython
|
| 12 |
+
python-Levenshtein
|
| 13 |
+
gradio
|
| 14 |
+
requests
|
setup.cfg
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[flake8]
|
| 2 |
+
max-line-length=120
|
src/universal_refactor/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
__all__ = ['RefactorEngine','BugDetector','CodeConverter','PatchGenerator','InferencePipeline','LongContextManager']
|
src/universal_refactor/bug_detector.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import ast, javalang, re
|
| 2 |
+
class BugDetector:
|
| 3 |
+
def detect_none_deref_python(self,code):
|
| 4 |
+
issues=[];
|
| 5 |
+
try:
|
| 6 |
+
t=ast.parse(code)
|
| 7 |
+
for n in ast.walk(t):
|
| 8 |
+
if isinstance(n,ast.Attribute) and isinstance(n.value,ast.Name):
|
| 9 |
+
issues.append({'line':n.lineno,'msg':f"Possible None dereference: {n.value.id}"})
|
| 10 |
+
except: pass
|
| 11 |
+
return issues
|
| 12 |
+
def detect_unused_vars_python(self,code):
|
| 13 |
+
issues=[]
|
| 14 |
+
try:
|
| 15 |
+
t=ast.parse(code);assigned=set();used=set()
|
| 16 |
+
for n in ast.walk(t):
|
| 17 |
+
if isinstance(n,ast.Assign):
|
| 18 |
+
for t2 in n.targets:
|
| 19 |
+
if isinstance(t2,ast.Name): assigned.add(t2.id)
|
| 20 |
+
if isinstance(n,ast.Name) and isinstance(n.ctx,ast.Load): used.add(n.id)
|
| 21 |
+
for v in assigned-used: issues.append({'msg':f"Unused variable {v}"})
|
| 22 |
+
except: pass
|
| 23 |
+
return issues
|
| 24 |
+
def detect_null_checks_java(self,code):
|
| 25 |
+
issues=[]
|
| 26 |
+
try:
|
| 27 |
+
for m in re.finditer(r"(\w+)\.\w+\(",code):
|
| 28 |
+
v=m.group(1);ctx=code[max(0,m.start()-200):m.end()+200]
|
| 29 |
+
if not re.search(rf"if *\( *{v} *!= *null",ctx):
|
| 30 |
+
issues.append({'msg':f"Possible null dereference: {v}"})
|
| 31 |
+
except: pass
|
| 32 |
+
return issues
|
| 33 |
+
def analyze(self,code,lang):
|
| 34 |
+
lang=lang.lower()
|
| 35 |
+
if lang=='python': return self.detect_none_deref_python(code)+self.detect_unused_vars_python(code)
|
| 36 |
+
if lang=='java': return self.detect_null_checks_java(code)
|
| 37 |
+
return []
|
src/universal_refactor/code_converter.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import ast, astor, javalang
|
| 2 |
+
class CodeConverter:
|
| 3 |
+
def python_to_java(self,code):
|
| 4 |
+
try: t=ast.parse(code)
|
| 5 |
+
except: return "// parse error\n"+code
|
| 6 |
+
out=["public class Converted {"," public static void main(String[] args) {"]
|
| 7 |
+
|
| 8 |
+
for n in t.body:
|
| 9 |
+
if isinstance(n,ast.Expr) and isinstance(n.value,ast.Call) and getattr(n.value.func,'id','')=='print':
|
| 10 |
+
args=", ".join(astor.to_source(a).strip() for a in n.value.args)
|
| 11 |
+
out.append(f" System.out.println({args});")
|
| 12 |
+
out.append(" }"); out.append("}")
|
| 13 |
+
return "\n".join(out)
|
| 14 |
+
def java_to_python(self,code):
|
| 15 |
+
out=["# Converted from Java"]
|
| 16 |
+
try: tree=javalang.parse.parse(code)
|
| 17 |
+
except: return "# parse error\n"+code
|
| 18 |
+
for _,m in tree.filter(javalang.tree.MethodDeclaration):
|
| 19 |
+
out.append(f"def {m.name}():\n pass")
|
| 20 |
+
return "\n".join(out)
|
| 21 |
+
def convert(self,code,src,tgt):
|
| 22 |
+
s,t=src.lower(),tgt.lower()
|
| 23 |
+
if s=='python' and t=='java': return self.python_to_java(code)
|
| 24 |
+
if s=='java' and t=='python': return self.java_to_python(code)
|
| 25 |
+
return code
|
src/universal_refactor/embeddings/ast_embeddings.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
def ast_features(c): return {}
|
src/universal_refactor/embeddings/code_embeddings.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
class CodeEmbedder: pass
|
src/universal_refactor/long_context_manager.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
class LongContextManager: pass
|
src/universal_refactor/model.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import math
|
| 4 |
+
|
| 5 |
+
class PositionalEncoding(nn.Module):
|
| 6 |
+
def __init__(self, d_model, max_len=8192):
|
| 7 |
+
super().__init__()
|
| 8 |
+
pe = torch.zeros(max_len, d_model)
|
| 9 |
+
position = torch.arange(0, max_len).unsqueeze(1).float()
|
| 10 |
+
div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
|
| 11 |
+
pe[:, 0::2] = torch.sin(position * div_term)
|
| 12 |
+
pe[:, 1::2] = torch.cos(position * div_term)
|
| 13 |
+
self.register_buffer("pe", pe)
|
| 14 |
+
|
| 15 |
+
def forward(self, x):
|
| 16 |
+
t = x.size(1)
|
| 17 |
+
return x + self.pe[:t].unsqueeze(0)
|
| 18 |
+
|
| 19 |
+
class SmallCodeTransformer(nn.Module):
|
| 20 |
+
def __init__(self, vocab_size, d_model=512, nhead=8, nlayers=6, dim_feed=2048, max_len=8192):
|
| 21 |
+
super().__init__()
|
| 22 |
+
self.token_emb = nn.Embedding(vocab_size, d_model)
|
| 23 |
+
self.pos = PositionalEncoding(d_model, max_len)
|
| 24 |
+
encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feed, dropout=0.1, activation="gelu")
|
| 25 |
+
self.encoder = nn.TransformerEncoder(encoder_layer, nlayers)
|
| 26 |
+
self.ln = nn.LayerNorm(d_model)
|
| 27 |
+
self.head = nn.Linear(d_model, vocab_size, bias=False)
|
| 28 |
+
self._init_weights()
|
| 29 |
+
|
| 30 |
+
def _init_weights(self):
|
| 31 |
+
nn.init.normal_(self.token_emb.weight, mean=0.0, std=0.02)
|
| 32 |
+
nn.init.normal_(self.head.weight, mean=0.0, std=0.02)
|
| 33 |
+
|
| 34 |
+
def forward(self, input_ids, attention_mask=None):
|
| 35 |
+
x = self.token_emb(input_ids)
|
| 36 |
+
x = self.pos(x)
|
| 37 |
+
x = x.permute(1,0,2)
|
| 38 |
+
x = self.encoder(x, src_key_padding_mask=(attention_mask==0) if attention_mask is not None else None)
|
| 39 |
+
x = x.permute(1,0,2)
|
| 40 |
+
x = self.ln(x)
|
| 41 |
+
return self.head(x)
|
src/universal_refactor/patch_generator.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import difflib
|
| 2 |
+
class PatchGenerator:
|
| 3 |
+
@staticmethod
|
| 4 |
+
def unified_diff(a,b,filename='file'):
|
| 5 |
+
return ''.join(difflib.unified_diff(a.splitlines(True),b.splitlines(True),fromfile=filename,tofile=filename+'.refactored'))
|
| 6 |
+
@staticmethod
|
| 7 |
+
def summarize_patch(d,maxl=20):
|
| 8 |
+
return '\n'.join(d.splitlines()[:maxl])
|
src/universal_refactor/pipelines.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .refactor_engine import RefactorEngine
|
| 2 |
+
from .bug_detector import BugDetector
|
| 3 |
+
from .code_converter import CodeConverter
|
| 4 |
+
from .patch_generator import PatchGenerator
|
| 5 |
+
from .tokenizer import get_tokenizer
|
| 6 |
+
from .model import SmallCodeTransformer
|
| 7 |
+
class InferencePipeline:
|
| 8 |
+
def __init__(self):
|
| 9 |
+
self.refactor=RefactorEngine()
|
| 10 |
+
self.bugs=BugDetector()
|
| 11 |
+
self.convert_engine=CodeConverter()
|
| 12 |
+
self.patch=PatchGenerator()
|
| 13 |
+
self.tokenizer=get_tokenizer()
|
| 14 |
+
self.model=SmallCodeTransformer(vocab_size=self.tokenizer.vocab_size)
|
| 15 |
+
def analyze_and_refactor(self,code,lang):
|
| 16 |
+
issues=self.bugs.analyze(code,lang)
|
| 17 |
+
ref=self.refactor.refactor(code,lang)
|
| 18 |
+
diff=self.patch.unified_diff(code,ref,f"code.{lang}")
|
| 19 |
+
return {'issues':issues,'refactored':ref,'diff':diff}
|
| 20 |
+
def convert(self,code,src,tgt):
|
| 21 |
+
return {'converted':self.convert_engine.convert(code,src,tgt)}
|
src/universal_refactor/refactor_engine.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import ast, astor, javalang, re, yaml, black, isort
|
| 2 |
+
from .utils import run_cmd
|
| 3 |
+
class RefactorEngine:
|
| 4 |
+
def __init__(self, rules_path='config/refactor_rules.yaml'):
|
| 5 |
+
with open(rules_path) as f: self.rules=yaml.safe_load(f)
|
| 6 |
+
def format_python(self,code):
|
| 7 |
+
try: return isort.code(black.format_str(code,mode=black.FileMode()))
|
| 8 |
+
except: return code
|
| 9 |
+
def remove_unused_imports_python(self,code):
|
| 10 |
+
try:
|
| 11 |
+
t=ast.parse(code);im=[n for n in t.body if isinstance(n,(ast.Import,ast.ImportFrom))]
|
| 12 |
+
used={n.id for n in ast.walk(t) if isinstance(n,ast.Name)}
|
| 13 |
+
keep=[i for i in im if any((a.asname or a.name.split('.')[0]) in used for a in i.names)]
|
| 14 |
+
t.body=keep+[n for n in t.body if n not in im];return astor.to_source(t)
|
| 15 |
+
except: return code
|
| 16 |
+
def inline_simple_functions_python(self,code):
|
| 17 |
+
try:
|
| 18 |
+
t=ast.parse(code);funcs={}
|
| 19 |
+
for n in t.body:
|
| 20 |
+
if isinstance(n,ast.FunctionDef) and len(n.body)==1 and isinstance(n.body[0],ast.Return):
|
| 21 |
+
funcs[n.name]=astor.to_source(n.body[0].value).strip()
|
| 22 |
+
out=code
|
| 23 |
+
for f,b in funcs.items(): out=re.sub(rf'\b{f}\(\)',b,out)
|
| 24 |
+
return out
|
| 25 |
+
except: return code
|
| 26 |
+
def refactor_python(self,code):
|
| 27 |
+
r=self.rules.get('python',{});
|
| 28 |
+
if r.get('remove_unused_imports'): code=self.remove_unused_imports_python(code)
|
| 29 |
+
if r.get('inline_simple_functions'): code=self.inline_simple_functions_python(code)
|
| 30 |
+
if r.get('format'): code=self.format_python(code)
|
| 31 |
+
return code
|
| 32 |
+
def convert_java_for_each(self,code):
|
| 33 |
+
return re.sub(r'for \(int (\w+)=0; \1 < (\w+).size\(\); \1\+\+\)', r'for (var x : \2)', code)
|
| 34 |
+
def refactor_java(self,code):
|
| 35 |
+
r=self.rules.get('java',{})
|
| 36 |
+
if r.get('convert_for_each'): code=self.convert_java_for_each(code)
|
| 37 |
+
return code
|
| 38 |
+
def refactor_javascript(self,code):
|
| 39 |
+
r=self.rules.get('javascript',{})
|
| 40 |
+
if r.get('convert_var_to_let'): code=code.replace('var ','let ')
|
| 41 |
+
return code
|
| 42 |
+
def refactor(self,code,lang):
|
| 43 |
+
lang=lang.lower()
|
| 44 |
+
if lang=='python': return self.refactor_python(code)
|
| 45 |
+
if lang=='java': return self.refactor_java(code)
|
| 46 |
+
if lang in ('js','javascript'): return self.refactor_javascript(code)
|
| 47 |
+
return code
|
src/universal_refactor/tokenizer.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import AutoTokenizer
|
| 2 |
+
|
| 3 |
+
def get_tokenizer(name='Salesforce/codegen-350M-multi'):
|
| 4 |
+
try:
|
| 5 |
+
return AutoTokenizer.from_pretrained(name)
|
| 6 |
+
except Exception:
|
| 7 |
+
return AutoTokenizer.from_pretrained('gpt2')
|
src/universal_refactor/utils.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, subprocess
|
| 2 |
+
|
| 3 |
+
def read_text(path):
|
| 4 |
+
with open(path,'r',encoding='utf-8',errors='ignore') as f: return f.read()
|
| 5 |
+
|
| 6 |
+
def write_text(path,text):
|
| 7 |
+
os.makedirs(os.path.dirname(path) or '.',exist_ok=True)
|
| 8 |
+
with open(path,'w',encoding='utf-8') as f: f.write(text)
|
| 9 |
+
|
| 10 |
+
def run_cmd(cmd,cwd=None):
|
| 11 |
+
p=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.PIPE,cwd=cwd)
|
| 12 |
+
o,e=p.communicate();return p.returncode,o.decode(),e.decode()
|
tests/test_python_refactor.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.universal_refactor.refactor_engine import RefactorEngine
|
| 2 |
+
...
|
training/distributed/run_deepspeed.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
echo run ds
|
training/distributed/slurm_job.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
echo slurm
|
training/distributed/zero3_config.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{}
|
training/finetune_bugfix.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# bugfix placeholder
|
training/finetune_convert.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# convert placeholder
|
training/finetune_refactor.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# finetune placeholder
|
training/long_context_training.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# long context placeholder
|
training/pretrain.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# pretrain placeholder
|
training/tokenizer_training.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# tokenizer placeholder
|