cyberfly committed
Commit: c1e1bf3 · Parent: fe65777
Add LFM2-350M MNN model files and inference script
Files changed:
- config.json +22 -0
- export_args.json +42 -0
- inference.py +188 -0
- llm.mnn +3 -0
- llm.mnn.json +0 -0
- llm.mnn.weight +3 -0
- llm_config.json +20 -0
- tokenizer.mtok +3 -0
config.json
ADDED
@@ -0,0 +1,22 @@
+{
+    "llm_model": "llm.mnn",
+    "llm_weight": "llm.mnn.weight",
+    "backend_type": "cpu",
+    "thread_num": 4,
+    "precision": "low",
+    "memory": "low",
+    "sampler_type": "mixed",
+    "temperature": 0.8,
+    "top_k": 40,
+    "top_p": 0.9,
+    "min_p": 0.05,
+    "tfs_z": 1.0,
+    "typical": 0.95,
+    "repetition_penalty": 1.0,
+    "presence_penalty": 0.0,
+    "frequency_penalty": 0.0,
+    "penalty_window": 0,
+    "n_gram": 8,
+    "ngram_factor": 1.0,
+    "tokenizer_file": "tokenizer.mtok"
+}
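config.json is the runtime configuration MNN's LLM engine reads at load time: paths to the graph, weights, and tokenizer, the backend and thread settings, and the sampler parameters. A minimal sketch of how it is consumed, mirroring the calls used in inference.py below (the overridden keys here are illustrative, not required):

```python
import MNN.llm as llm

# Load the engine through config.json; its llm_model / llm_weight /
# tokenizer_file entries point at the other files in this commit.
model = llm.create("config.json")
# Any key from config.json can be overridden at runtime (sketch).
model.set_config({"thread_num": 4, "max_new_tokens": 64})
model.load()
print(model.response("What is the capital of France?", False))
```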
export_args.json
ADDED
@@ -0,0 +1,42 @@
+{
+    "path": "../../../../",
+    "type": null,
+    "tokenizer_path": "../../../../",
+    "eagle_path": null,
+    "lora_path": null,
+    "gptq_path": null,
+    "dst_path": "./model",
+    "verbose": false,
+    "test": null,
+    "export": "mnn",
+    "onnx_slim": false,
+    "quant_bit": 4,
+    "quant_block": 64,
+    "visual_quant_bit": null,
+    "visual_quant_block": null,
+    "lm_quant_bit": 4,
+    "lm_quant_block": 64,
+    "mnnconvert": "../../../build/MNNConvert",
+    "ppl": false,
+    "awq": false,
+    "hqq": true,
+    "omni": false,
+    "transformer_fuse": false,
+    "group_conv_native": false,
+    "smooth": false,
+    "sym": false,
+    "visual_sym": false,
+    "seperate_embed": false,
+    "lora_split": false,
+    "calib_data": null,
+    "act_bit": 16,
+    "embed_bit": 16,
+    "act_sym": false,
+    "quant_config": null,
+    "generate_for_npu": false,
+    "skip_weight": false,
+    "omni_epochs": 20,
+    "omni_lr": 0.005,
+    "omni_wd": 0.0001,
+    "tie_word_embeddings": true
+}
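export_args.json records the arguments used to convert the Hugging Face checkpoint with MNN's llmexport tool: 4-bit HQQ weight quantization in blocks of 64 (quant_bit / quant_block / hqq), 16-bit activations and embeddings (act_bit / embed_bit), a 4-bit LM head, and tied word embeddings. The `path` and `tokenizer_path` values are relative to wherever the export was run. Assuming the CLI flag names match the argparse keys above, the invocation would have looked roughly like `python llmexport.py --path <hf-model-dir> --export mnn --quant_bit 4 --quant_block 64 --hqq --tie_word_embeddings` (a reconstruction inferred from this file, not taken from the commit itself).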
inference.py
ADDED
@@ -0,0 +1,188 @@
+#!/usr/bin/env python3
+"""MNN LLM Inference & Benchmark script for LFM2-350M model."""
+
+import sys
+import os
+import time
+import argparse
+
+import MNN.llm as llm
+
+
+def run_inference(model, prompt, stream=False):
+    """Run a single inference and return the response + timing context."""
+    model.reset()
+    response = model.response(prompt, stream)
+    if stream:
+        output = ""
+        for chunk in response:
+            print(chunk, end="", flush=True)
+            output += chunk
+        print()
+        return output
+    return response
+
+
+def benchmark(model, prompts, warmup=1, runs=3):
+    """Benchmark prefill and decode performance across multiple prompts."""
+    print("=" * 60)
+    print("BENCHMARK")
+    print("=" * 60)
+
+    # Warmup
+    print(f"\nWarmup ({warmup} run(s))...")
+    for i in range(warmup):
+        model.reset()
+        model.response(prompts[0], False)
+
+    results = []
+    for idx, prompt in enumerate(prompts):
+        prompt_results = []
+        for run in range(runs):
+            model.reset()
+            t0 = time.perf_counter()
+            response = model.response(prompt, False)
+            t1 = time.perf_counter()
+            wall_time = t1 - t0
+
+            ctx = model.context
+            ctx.refresh()
+
+            prompt_tokens = ctx.prompt_len
+            gen_tokens = ctx.gen_seq_len
+            prefill_us = ctx.prefill_us
+            decode_us = ctx.decode_us
+
+            prefill_s = prefill_us / 1e6 if prefill_us else 0
+            decode_s = decode_us / 1e6 if decode_us else 0
+
+            prefill_tps = prompt_tokens / prefill_s if prefill_s > 0 else 0
+            decode_tps = gen_tokens / decode_s if decode_s > 0 else 0
+
+            prompt_results.append({
+                "prompt_tokens": prompt_tokens,
+                "gen_tokens": gen_tokens,
+                "wall_time": wall_time,
+                "prefill_s": prefill_s,
+                "decode_s": decode_s,
+                "prefill_tps": prefill_tps,
+                "decode_tps": decode_tps,
+                "response": response,
+            })
+
+        results.append(prompt_results)
+
+        # Print per-prompt summary
+        avg_prefill_tps = sum(r["prefill_tps"] for r in prompt_results) / runs
+        avg_decode_tps = sum(r["decode_tps"] for r in prompt_results) / runs
+        avg_wall = sum(r["wall_time"] for r in prompt_results) / runs
+        prompt_tokens = prompt_results[0]["prompt_tokens"]
+        avg_gen = sum(r["gen_tokens"] for r in prompt_results) / runs
+
+        print(f"\nPrompt {idx + 1}: \"{prompt[:60]}{'...' if len(prompt) > 60 else ''}\"")
+        print(f"  Prompt tokens  : {prompt_tokens}")
+        print(f"  Avg gen tokens : {avg_gen:.1f}")
+        print(f"  Avg wall time  : {avg_wall:.3f} s")
+        print(f"  Avg prefill    : {avg_prefill_tps:.1f} tok/s")
+        print(f"  Avg decode     : {avg_decode_tps:.1f} tok/s")
+
+    # Overall summary
+    all_runs = [r for pr in results for r in pr]
+    overall_prefill = sum(r["prefill_tps"] for r in all_runs) / len(all_runs)
+    overall_decode = sum(r["decode_tps"] for r in all_runs) / len(all_runs)
+    print("\n" + "=" * 60)
+    print(f"Overall avg prefill : {overall_prefill:.1f} tok/s")
+    print(f"Overall avg decode  : {overall_decode:.1f} tok/s")
+    print("=" * 60)
+
+    return results
+
+
+def main():
+    parser = argparse.ArgumentParser(description="MNN LLM Inference & Benchmark")
+    parser.add_argument("--config", default="config.json",
+                        help="Path to MNN config.json (default: config.json)")
+    parser.add_argument("--prompt", default=None,
+                        help="Single prompt for inference")
+    parser.add_argument("--stream", action="store_true",
+                        help="Stream output tokens")
+    parser.add_argument("--benchmark", action="store_true",
+                        help="Run benchmark suite")
+    parser.add_argument("--warmup", type=int, default=1,
+                        help="Warmup runs for benchmark (default: 1)")
+    parser.add_argument("--runs", type=int, default=3,
+                        help="Benchmark runs per prompt (default: 3)")
+    parser.add_argument("--backend", default=None,
+                        choices=["cpu", "metal"],
+                        help="Override backend type")
+    parser.add_argument("--threads", type=int, default=None,
+                        help="Override thread count")
+    parser.add_argument("--max-tokens", type=int, default=128,
+                        help="Max tokens to generate (default: 128)")
+    args = parser.parse_args()
+
+    model_dir = os.path.dirname(os.path.abspath(args.config))
+    config_path = os.path.abspath(args.config)
+
+    print(f"Loading model from: {config_path}")
+    model = llm.create(config_path)
+
+    if args.backend:
+        model.set_config({"backend_type": args.backend})
+    if args.threads:
+        model.set_config({"thread_num": args.threads})
+    model.set_config({"max_new_tokens": args.max_tokens})
+
+    model.load()
+    print("Model loaded.\n")
+
+    if args.benchmark:
+        bench_prompts = [
+            "Hello!",
+            "What is the capital of France?",
+            "Explain quantum computing in simple terms.",
+            "Write a short poem about the ocean.",
+            "List 5 programming languages and their main use cases.",
+        ]
+        benchmark(model, bench_prompts, warmup=args.warmup, runs=args.runs)
+    elif args.prompt:
+        print(f"Prompt: {args.prompt}\n")
+        response = run_inference(model, args.prompt, stream=args.stream)
+        if not args.stream:
+            print(f"Response:\n{response}")
+
+        ctx = model.context
+        ctx.refresh()
+        print("\n--- Stats ---")
+        print(f"Prompt tokens : {ctx.prompt_len}")
+        print(f"Gen tokens    : {ctx.gen_seq_len}")
+        prefill_s = ctx.prefill_us / 1e6 if ctx.prefill_us else 0
+        decode_s = ctx.decode_us / 1e6 if ctx.decode_us else 0
+        if prefill_s > 0:
+            print(f"Prefill : {ctx.prompt_len / prefill_s:.1f} tok/s ({prefill_s:.3f}s)")
+        if decode_s > 0:
+            print(f"Decode  : {ctx.gen_seq_len / decode_s:.1f} tok/s ({decode_s:.3f}s)")
+    else:
+        # Interactive mode
+        print("Interactive mode (type 'quit' to exit)\n")
+        while True:
+            try:
+                user_input = input("You: ").strip()
+            except (EOFError, KeyboardInterrupt):
+                print("\nBye!")
+                break
+            if user_input.lower() in ("quit", "exit"):
+                break
+            if not user_input:
+                continue
+            response = run_inference(model, user_input, stream=True)
+            ctx = model.context
+            ctx.refresh()
+            prefill_s = ctx.prefill_us / 1e6 if ctx.prefill_us else 0
+            decode_s = ctx.decode_us / 1e6 if ctx.decode_us else 0
+            if decode_s > 0:
+                print(f"  [{ctx.gen_seq_len} tokens, {ctx.gen_seq_len / decode_s:.1f} tok/s]")
+
+
+if __name__ == "__main__":
+    main()
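Typical invocations, as defined by the script's own argparse options: `python inference.py` drops into interactive chat, `python inference.py --prompt "Hello" --stream` runs a single streamed response with prefill/decode stats, and `python inference.py --benchmark --runs 3 --threads 4` measures throughput over the built-in prompt suite.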
llm.mnn
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:783e5af16be0b4a7bb58e626ff36cb8d496d209d1f484c4ef414619dc1bff749
+size 329296
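llm.mnn is a Git LFS pointer, as are llm.mnn.weight and tokenizer.mtok below: the repository stores only the hash and size, so the ~329 KB compute graph ships separately from the ~222 MB of quantized weights and the ~1.7 MB tokenizer.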
llm.mnn.json
ADDED
The diff for this file is too large to render.
llm.mnn.weight
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efaf91b109d51dab379ebc37be735f5b7173babb37cb9af65268805eb825cce9
+size 221790210
llm_config.json
ADDED
@@ -0,0 +1,20 @@
+{
+    "model_type": "lfm2",
+    "hidden_size": 1024,
+    "layer_nums": 16,
+    "attention_mask": "float",
+    "attention_type": "full",
+    "is_mrope": false,
+    "jinja": {
+        "chat_template": "{{- bos_token -}}{%- set system_prompt = \"\" -%}{%- set ns = namespace(system_prompt=\"\") -%}{%- if messages[0][\"role\"] == \"system\" -%} {%- set ns.system_prompt = messages[0][\"content\"] -%} {%- set messages = messages[1:] -%}{%- endif -%}{%- if tools -%} {%- set ns.system_prompt = ns.system_prompt + (\"\n\" if ns.system_prompt else \"\") + \"List of tools: <|tool_list_start|>[\" -%} {%- for tool in tools -%} {%- if tool is not string -%} {%- set tool = tool | tojson -%} {%- endif -%} {%- set ns.system_prompt = ns.system_prompt + tool -%} {%- if not loop.last -%} {%- set ns.system_prompt = ns.system_prompt + \", \" -%} {%- endif -%} {%- endfor -%} {%- set ns.system_prompt = ns.system_prompt + \"]<|tool_list_end|>\" -%}{%- endif -%}{%- if ns.system_prompt -%} {{- \"<|im_start|>system\n\" + ns.system_prompt + \"<|im_end|>\n\" -}}{%- endif -%}{%- for message in messages -%} {{- \"<|im_start|>\" + message[\"role\"] + \"\n\" -}} {%- set content = message[\"content\"] -%} {%- if content is not string -%} {%- set content = content | tojson -%} {%- endif -%} {%- if message[\"role\"] == \"tool\" -%} {%- set content = \"<|tool_response_start|>\" + content + \"<|tool_response_end|>\" -%} {%- endif -%} {{- content + \"<|im_end|>\n\" -}}{%- endfor -%}{%- if add_generation_prompt -%} {{- \"<|im_start|>assistant\n\" -}}{%- endif -%}",
+        "bos": "<|startoftext|>",
+        "eos": "<|im_end|>"
+    },
+    "tie_embeddings": [
+        179847170,
+        213401602,
+        8388608,
+        4,
+        64
+    ]
+}
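llm_config.json carries the graph metadata (16 layers, hidden size 1024, full attention) plus the ChatML-style Jinja template the runtime uses to build prompts. A small sketch, assuming a plain jinja2 install can render the shipped template, to show the exact prompt format:

```python
import json
from jinja2 import Template  # assumption: stock jinja2 suffices here

with open("llm_config.json") as f:
    cfg = json.load(f)

# Render a one-turn conversation through the shipped chat template.
prompt = Template(cfg["jinja"]["chat_template"]).render(
    bos_token=cfg["jinja"]["bos"],
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"},
    ],
    add_generation_prompt=True,
)
print(prompt)
# <|startoftext|><|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# Hello!<|im_end|>
# <|im_start|>assistant
```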
tokenizer.mtok
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:780f1e05d303486716c28bb13c7f2897383f59a3d4aa6bd02f15dda4301a0389
+size 1670119