FIM training over deepseek-coder-1.3B using a mojo-code dataset. This is an alpha version. It is trained only for FIM co-pilot style usage. later versions should have Q&A added as well as better performance. please leave your comments to help improve it. the recipe for this was based on this template from https://huggingface.co/blog/personal-copilot
tokenizer = AutoTokenizer.from_pretrained(merged_model_path,trust_remote_code=True,use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
merged_model_path,
device_map={"": 0},
use_cache=True,
trust_remote_code=True,
attn_implementation="flash_attention_2",
torch_dtype=torch.bfloat16
)
input_text = """<|fim▁begin|>
from algorithm import parallelize, vectorize
from benchmark import Benchmark
from complex import ComplexSIMD, ComplexFloat64
from math import iota
from os import env
from python import Python
from python.object import PythonObject
from runtime.llcl import num_cores, Runtime
from tensor import Tensor
from utils.index import Index
alias float_type = DType.float64
alias simd_width = simdwidthof[float_type]()
alias width = 960
alias height = 960
alias MAX_ITERS = 200
alias min_x = -2.0
alias max_x = 0.6
alias min_y = -1.5
alias max_y = 1.5
fn mandelbrot_kernel_SIMD[
simd_width: Int
](c: ComplexSIMD[float_type, simd_width]) -> SIMD[float_type, simd_width]:
let cx = c.re
let cy = c.im
var x = SIMD[float_type, simd_width](0)
var y = SIMD[float_type, simd_width](0)
var y2 = SIMD[float_type, simd_width](0)
var iters = SIMD[float_type, simd_width](0)
var t: SIMD[DType.bool, simd_width] = True
for i in range(MAX_ITERS):
if not t.reduce_or():
break
y2 = y*y
y = x.fma(y + y, cy)
t = x.fma(x, y2) <= 4
x = x.fma(x, cx - y2)
iters = t.select(iters + 1, iters)
return iters
fn compare():
let t = Tensor[float_type](height, width)
@parameter
fn worker(row: Int):
let scale_x = (max_x - min_x) / width
let scale_y = (max_y - min_y) / height
<|fim▁hole|>
fn main():
compare()
<|fim▁end|>"""
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_length=547+200)
print(tokenizer.decode(outputs[0], skip_special_tokens=True)[len(input_text):])
def stream(user_prompt):
runtimeFlag = "cuda:0"
inputs = tokenizer([user_prompt], return_tensors="pt").to(runtimeFlag)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
_ = model.generate(**inputs, streamer=streamer, max_new_tokens=200)
stream(input_text)
also try to use an inference endpoint and use a VS-Code extension
- Downloads last month
- 45
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social
visibility and check back later, or deploy to Inference Endpoints (dedicated)
instead.