# MiniCoderX Project - Full Pipeline Notebook

# Step 0: Environment Setup

In [None]:
pip install -q tokenizers transformers datasets sentencepiece langchain_community ollama networkx evaluate rouge_score matplotlib seaborn lark fastapi uvicorn

# Step 1: Train Tokenizer, Import and Load Model

In [None]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
from tokenizers.normalizers import Sequence, Lowercase, NFD, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast
import os

# Train a BPE tokenizer on the code corpus and save it as tokenizer.json.

# Special tokens shared by the trainer and the post-processor.
# NOTE(review): in the exported notebook these literals were empty strings
# (most likely angle-bracket tokens stripped during export). The spellings
# below are a reconstruction — confirm against the original notebook.
PAD_TOKEN = "<pad>"
UNK_TOKEN = "<unk>"
CLS_TOKEN = "<cls>"
SEP_TOKEN = "<sep>"
MASK_TOKEN = "<mask>"
SPECIAL_TOKENS = [PAD_TOKEN, UNK_TOKEN, CLS_TOKEN, SEP_TOKEN, MASK_TOKEN]

# Give the model an explicit unknown token so unseen symbols are not dropped.
tokenizer = Tokenizer(models.BPE(unk_token=UNK_TOKEN))
# NOTE(review): lowercasing and accent stripping are lossy for source code
# (identifiers are case-sensitive) — confirm this normalisation is intended.
tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
tokenizer.pre_tokenizer = Whitespace()

trainer = trainers.BpeTrainer(
    vocab_size=32000,
    special_tokens=SPECIAL_TOKENS,
)

data_path = "data/code_corpus.txt"

# Fail early with a clear message instead of an opaque trainer error.
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Dataset not found at: {data_path}")
print("Dataset found:", data_path)

tokenizer.train([data_path], trainer)

# Wrap every encoded sequence in the classifier/separator tokens. Their ids
# are only known after training, hence the lookup here.
tokenizer.post_processor = TemplateProcessing(
    single=f"{CLS_TOKEN} $A {SEP_TOKEN}",
    pair=f"{CLS_TOKEN} $A {SEP_TOKEN} $B {SEP_TOKEN}",
    special_tokens=[
        (CLS_TOKEN, tokenizer.token_to_id(CLS_TOKEN)),
        (SEP_TOKEN, tokenizer.token_to_id(SEP_TOKEN)),
    ],
)

tokenizer_path = "minicoderx-tokenizer"
os.makedirs(tokenizer_path, exist_ok=True)
tokenizer.save(f"{tokenizer_path}/tokenizer.json")
print("Tokenizer saved to:", tokenizer_path)

In [None]:
from transformers import PreTrainedTokenizerFast

# Wrap the trained tokenizer file in a Hugging Face fast tokenizer and declare
# which vocabulary entries play the special-token roles.
# NOTE(review): these literals were empty strings in the exported notebook
# (probably angle-bracket tokens stripped on export). The spellings here are a
# reconstruction and must match the special tokens that tokenizer.json was
# trained with — confirm against the original notebook.
hf_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="minicoderx-tokenizer/tokenizer.json",
    unk_token="<unk>",
    pad_token="<pad>",
    cls_token="<cls>",
    sep_token="<sep>",
    mask_token="<mask>",
)

# Writes the tokenizer files into the same directory as tokenizer.json.
hf_tokenizer.save_pretrained("minicoderx-tokenizer")
print("HuggingFace tokenizer saved and ready.")

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the trained seq2seq checkpoint together with its tokenizer.
# Note: this rebinds `tokenizer`, replacing the raw BPE tokenizer object that
# was assigned to the same name earlier in the notebook.
checkpoint = "minicoderx-model"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

print("Model and tokenizer loaded.")

# Step 2: Inference - Code Generation

In [None]:
# Encode one instruction, generate with max_length=128, and print the decoded
# text with special tokens removed.
input_text = "Write a Python function to compute factorial"
inputs = tokenizer(input_text, return_tensors="pt")
outputs = model.generate(max_length=128, **inputs)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("\nGenerated Code:\n")
print(generated_text)

# Step 3: Structure-Aware Encoding with AST

In [None]:
import ast, networkx as nx, matplotlib.pyplot as plt, seaborn as sns

def build_ast_graph_with_metadata(node, graph, parent=None):
    """Recursively mirror an AST into ``graph``.

    Each AST node becomes a graph node keyed by ``str(id(node))`` whose
    ``label`` attribute is the node's class name; an edge is added from every
    parent to each of its children.

    Args:
        node: The ``ast.AST`` node to add (the tree root on the first call).
        graph: Any object exposing ``add_node(key, **attrs)`` and
            ``add_edge(src, dst)``, e.g. a ``networkx.DiGraph``.
        parent: Graph key of the parent node, or ``None`` for the root.

    Returns:
        None. ``graph`` is modified in place.
    """
    # NOTE(review): keys come from id(node), so AST objects that are shared
    # between positions in the tree (if any) collapse into one graph node.
    node_id = str(id(node))
    graph.add_node(node_id, label=type(node).__name__)
    if parent is not None:
        graph.add_edge(parent, node_id)
    for child in ast.iter_child_nodes(node):
        build_ast_graph_with_metadata(child, graph, node_id)

# Demo: parse a tiny function, convert its AST into a directed graph, and
# draw the graph with each node labelled by its AST class name.
code_sample = (
    "\n"
    "def add(a, b):\n"
    " return a + b\n"
)
tree = ast.parse(code_sample)

G = nx.DiGraph()
build_ast_graph_with_metadata(tree, G)

pos = nx.spring_layout(G)
labels = nx.get_node_attributes(G, 'label')
draw_options = {
    "labels": labels,
    "with_labels": True,
    "node_size": 1200,
    "node_color": 'lightblue',
}
nx.draw(G, pos, **draw_options)
plt.title("AST Visualization")
plt.show()

# Step 4: LangChain + Ollama Integration

In [None]:
from langchain_community.llms import Ollama
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Build a one-prompt chain around the Ollama model named "minicoderx" and run
# it once on a sample instruction.
llm = Ollama(model="minicoderx")
prompt = PromptTemplate(
    template="Generate Python code for the task: {instruction}",
    input_variables=["instruction"],
)
chain = LLMChain(prompt=prompt, llm=llm)

print("\nLangChain-Ollama Output:")
chain_output = chain.run("Create a function to reverse a string")
print(chain_output)

# Step 5: Evaluation (MBPP)

In [None]:
from datasets import load_dataset
import evaluate

# Score a single MBPP test problem: generate code from its description and
# compare the output with the reference solution using BLEU and ROUGE.
dataset = load_dataset("mbpp")
eval_bleu = evaluate.load("bleu")
eval_rouge = evaluate.load("rouge")

sample = dataset['test'][0]
input_text = f"Write a Python function: {sample['text']}"
inputs = tokenizer(input_text, return_tensors="pt")
output = model.generate(max_length=128, **inputs)
generated_code = tokenizer.decode(output[0], skip_special_tokens=True)

print("\nEvaluation Sample Output:\n", generated_code)
predictions = [generated_code]
references = [sample['code']]
for metric_label, metric in (("BLEU:", eval_bleu), ("ROUGE:", eval_rouge)):
    print(metric_label, metric.compute(predictions=predictions, references=references))

# Step 6: Testing, Verification, and Unit Test Generation

In [None]:
import tempfile, subprocess

def run_code(code, test_case, timeout=None):
    """Execute ``code`` followed by ``test_case`` in a fresh Python process.

    Both snippets are written to a temporary ``.py`` file, which is run with
    the current interpreter. Captured stdout is printed, followed by stderr
    when it is non-empty. The temporary file is always removed afterwards.

    Args:
        code: Source code to run (e.g. model-generated code).
        test_case: Extra source appended on a new line after ``code``.
        timeout: Optional limit in seconds handed to ``subprocess.run``;
            ``None`` (the default) waits indefinitely.

    Raises:
        subprocess.TimeoutExpired: If ``timeout`` is set and exceeded.

    Note:
        The script runs unsandboxed with the caller's privileges.
    """
    import os
    import sys

    # delete=False so the file survives the `with`; it is removed in `finally`.
    # UTF-8 matches the default source encoding Python assumes for scripts.
    with tempfile.NamedTemporaryFile(
        mode='w', suffix='.py', delete=False, encoding='utf-8'
    ) as tmp:
        tmp.write(code + '\n' + test_case)
        script_path = tmp.name

    # The file is closed (and therefore flushed) before the child opens it.
    try:
        result = subprocess.run(
            [sys.executable, script_path],  # same interpreter as the caller
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    finally:
        os.remove(script_path)

    print("Output:\n", result.stdout)
    if result.stderr:
        print("Errors:\n", result.stderr)

# Smoke-test the generated code, then ask the LLM to draft a unittest for it.
# NOTE(review): `generated_code` was produced from the MBPP sample in Step 5,
# so it may not define `factorial` — confirm this test case is the intended one.
test_case = "print(factorial(5)) # Expected: 120"
run_code(generated_code, test_case)

unit_prompt = PromptTemplate(
    template="Write a unittest in Python for the following function:\n\n{code}",
    input_variables=["code"],
)
unit_chain = LLMChain(prompt=unit_prompt, llm=llm)
generated_unit_test = unit_chain.run(code=generated_code)
print("\nGenerated Unit Test:\n", generated_unit_test)

# Step 7: Safety and Grammar Constraints

In [None]:
from lark import Lark, UnexpectedInput

# INDENT/DEDENT are not terminals provided by lark's common grammar; they are
# produced by an Indenter postlexer from the whitespace that follows newlines.
from lark.indenter import DedentError, Indenter


class PythonSubsetIndenter(Indenter):
    # Token names the Indenter reads and emits; they match python_grammar.
    NL_type = "NEWLINE"
    OPEN_PAREN_types = []
    CLOSE_PAREN_types = []
    INDENT_type = "INDENT"
    DEDENT_type = "DEDENT"
    tab_len = 8


# Toy grammar: a program is one or more `def` statements whose body is either
# an indented block of further `def`s or a single inline assignment.
# Raw string so the regex escapes reach lark unchanged. NEWLINE swallows the
# indentation after the line break, which the Indenter needs to measure depth.
python_grammar = r"""
start: stmt+
stmt: "def" NAME "(" [params] ")" ":" suite
params: NAME ("," NAME)*
suite: NEWLINE INDENT stmt+ DEDENT | simple_stmt
simple_stmt: NAME "=" expr NEWLINE
expr: atom | atom operator atom
atom: NAME | NUMBER
operator: "+" | "-" | "*" | "/"
NEWLINE: /(\r?\n[\t ]*)+/
%import common.CNAME -> NAME
%import common.NUMBER
%import common.WS_INLINE
%declare INDENT DEDENT
%ignore WS_INLINE
"""

parser = Lark(python_grammar, parser="lalr", postlex=PythonSubsetIndenter())

# Heuristic substring scan only: it can miss obfuscated calls and can flag
# harmless text that merely contains one of these fragments.
unsafe_keywords = ["os.system", "subprocess", "eval", "exec", "open(", "import socket"]
print("\nSafety Check:")
print("Unsafe pattern found" if any(k in generated_code for k in unsafe_keywords) else "Code is safe")

print("\nGrammar Check:")
# Every simple_stmt must end in NEWLINE, so normalise surrounding blank space
# and guarantee exactly one trailing line break before parsing.
grammar_input = generated_code.strip() + "\n"
try:
    parser.parse(grammar_input)
except (UnexpectedInput, DedentError) as e:
    # DedentError is raised by the Indenter for inconsistent dedents.
    print("Grammar error:", e)
else:
    print("Code grammar is valid.")

# Step 8: Multi-Task Preprocessing (gen, sum, trans)

In [None]:
def preprocess_multitask(example):
    """Turn one dataset row into tokenized model inputs with labels.

    The row's optional ``task`` field selects the prompt/target pair:

    * ``'gen'``   - prompt built from ``text``, target is ``code``
    * ``'sum'``   - prompt built from ``code``, target is ``text``
    * ``'trans'`` - prompt built from ``java``, target is ``python``
    * any other value, or no ``task`` key - raw ``text`` in, ``code`` out

    Args:
        example: Mapping for a single row, as passed by ``Dataset.map``.

    Returns:
        The tokenizer output for the prompt, with ``labels`` set to the
        target's ``input_ids``. Both sides use ``max_length=128`` with
        truncation.
    """
    # Rows without a 'task' column (presumably plain text/code datasets such
    # as the one mapped in the fine-tuning step) take the default branch
    # instead of raising KeyError.
    task = example.get('task')
    if task == 'gen':
        input_text = f"Write code: {example['text']}"
        output_text = example['code']
    elif task == 'sum':
        input_text = f"Summarize this code: {example['code']}"
        output_text = example['text']
    elif task == 'trans':
        input_text = f"Translate Java to Python: {example['java']}"
        output_text = example['python']
    else:
        input_text, output_text = example['text'], example['code']

    model_input = tokenizer(input_text, max_length=128, truncation=True)
    # NOTE(review): as_target_tokenizer() is reported as deprecated in newer
    # transformers releases in favour of tokenizer(text_target=...) — confirm
    # against the installed version before switching.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(output_text, max_length=128, truncation=True)
    model_input['labels'] = labels['input_ids']
    return model_input

# Step 9: Fine-Tuning Setup

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

# Tokenize the train and validation splits, dropping the raw columns so only
# the features returned by preprocess_multitask remain.
train_dataset, val_dataset = (
    dataset[split_name].map(
        preprocess_multitask,
        remove_columns=dataset[split_name].column_names,
    )
    for split_name in ("train", "validation")
)

# Hyper-parameters and bookkeeping for the seq2seq trainer.
training_args = Seq2SeqTrainingArguments(
    output_dir="./minicoderx-finetuned",
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    predict_with_generate=True,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
)

# The collator is handed both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Uncomment to run training
# trainer.train()
# trainer.save_model("./minicoderx-finetuned")

# Step 10: Deploy with FastAPI

In [None]:
from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn

# FastAPI application; the /generate route below is registered on it.
app = FastAPI()

# Request body schema for POST /generate.
class CodeRequest(BaseModel):
 # Instruction text that is tokenized and fed to the model.
 instruction: str

@app.post("/generate")
def generate_code(req: CodeRequest):
    # Tokenize the instruction, generate with max_length=128, and return the
    # decoded text (special tokens removed) under the "code" key.
    encoded = tokenizer(req.instruction, return_tensors="pt")
    generated_ids = model.generate(max_length=128, **encoded)
    decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return {"code": decoded}

In [None]:
# Uncomment to run API
# uvicorn.run(app, host="0.0.0.0", port=8000)