File size: 13,398 Bytes

0ad01db

"""
dispatchAI Phone Proxy v2 — Clean output extraction
Instead of parsing the messy llama-cli output, we use --log-disable 
and capture only the response text between known markers.
"""
import json
import os
import re
import shlex
import subprocess
import sys
import time
from http.server import HTTPServer, BaseHTTPRequestHandler

# Command line: <serial> [port]
# adb serial of the target phone; empty string when no argument was given.
PHONE_SERIAL = sys.argv[1] if sys.argv[1:] else ""
# TCP port the proxy listens on; defaults to 5000 when not supplied.
PORT = int(sys.argv[2]) if sys.argv[2:] else 5000
# On-device locations of the llama.cpp CLI binary and the GGUF model file.
LLAMA_CLI = "/data/local/tmp/llama-cli"
MODEL_PATH = "/data/local/tmp/model.gguf"
# Environment handed to every adb subprocess: a copy of ours plus
# MSYS_NO_PATHCONV=1 (presumably to stop MSYS/Git-Bash from rewriting the
# /data/... paths on Windows hosts — confirm against the deployment setup).
ENV = dict(os.environ, MSYS_NO_PATHCONV="1")

class PhoneHandler(BaseHTTPRequestHandler):
    """Serve an OpenAI-style chat-completions endpoint backed by llama-cli over adb.

    ``POST /v1/chat/completions`` turns the request's ``messages`` into a
    plain-text prompt, runs ``llama-cli`` on the phone identified by
    ``PHONE_SERIAL`` through ``adb shell``, scrubs the captured console output
    and returns it in a ``chat.completion`` JSON envelope.  ``GET /health``
    reports the proxy configuration.

    The request's ``temperature`` and ``chat_format`` fields are not forwarded
    to llama-cli; only ``messages``, ``max_tokens`` and ``model`` are read.
    """

    _DEFAULT_MAX_TOKENS = 100

    # Patterns used while scrubbing llama-cli console output.
    # NOTE(review): the spinner/noise patterns also delete '-', '/', '|' and
    # '\' from genuine model text (e.g. "well-known" -> "wellknown"); kept
    # as-is because the exact spinner format of llama-cli is not visible here.
    _SPINNER_RE = re.compile(r'[|/\\\-]')
    _BLOCKS_RE = re.compile(r'[▄█▀▌▐▒░│║╔╗╚╝═]')
    _RAW_NOISE_RE = re.compile(r'[|/\\\-▄█▀▌▐▒░│║╔╗╚╝═]')
    _PIPE_MARKER_RE = re.compile(r'<\|[^>]+\|>')
    _GEMMA_MARKER_RE = re.compile(r'<start_of_turn>|<end_of_turn>')
    _LLAMA_MARKER_RE = re.compile(r'eot_id|begin_of_text|start_header_id|end_header_id')
    _ANY_TAG_RE = re.compile(r'<[^>]*>')
    _BARE_MARKER_RE = re.compile(
        r'\beot_id\b|\bbegin_of_text\b|\bstart_header_id\b|\bend_header_id\b|\bim_start\b|\bim_end\b'
    )
    _ROLE_ECHO_RE = re.compile(r'^(User:|Assistant:|System:)\s*')
    _WHITESPACE_RE = re.compile(r'\s+')
    _GEN_TPS_RE = re.compile(r'Generation:\s*([\d.]+)\s*t/s')
    _PROMPT_TPS_RE = re.compile(r'Prompt:\s*([\d.]+)\s*t/s')

    def do_POST(self):
        """Handle ``POST /v1/chat/completions``.

        Responds 404 for any other path, 400 for a malformed request (bad
        ``Content-Length``, invalid JSON, non-object body, ``messages`` that is
        not a list of objects, non-integer ``max_tokens``), 504 when the adb
        call times out, 500 for any other inference failure, and 200 with a
        ``chat.completion`` JSON body otherwise.
        """
        if self.path != "/v1/chat/completions":
            self.send_error(404)
            return

        try:
            content_length = int(self.headers.get("Content-Length", 0))
        except ValueError:
            self.send_error(400, "Invalid Content-Length")
            return
        # A negative length would make read() block until EOF.
        body = self.rfile.read(content_length) if content_length > 0 else b""

        try:
            req = json.loads(body)
        except ValueError:  # JSONDecodeError and UnicodeDecodeError both derive from it
            self.send_error(400, "Invalid JSON")
            return
        if not isinstance(req, dict):
            self.send_error(400, "Request body must be a JSON object")
            return

        messages = req.get("messages", [])
        if not isinstance(messages, list) or not all(isinstance(m, dict) for m in messages):
            self.send_error(400, "messages must be a list of objects")
            return

        # max_tokens ends up inside a shell command line, so it must be a real integer.
        max_tokens = req.get("max_tokens")
        if max_tokens is None:
            max_tokens = self._DEFAULT_MAX_TOKENS
        try:
            max_tokens = int(max_tokens)
        except (TypeError, ValueError, OverflowError):
            self.send_error(400, "max_tokens must be an integer")
            return

        prompt = self._plain_prompt(messages)

        try:
            generated_text, gen_tps, prompt_tps = self._generate(prompt, messages, max_tokens)
        except subprocess.TimeoutExpired:
            self.send_error(504, "Inference timed out")
            return
        except Exception as e:  # request boundary: report instead of dropping the connection
            self.send_error(500, self._status_text(e))
            return

        # Rough token estimate: about four characters per token.
        prompt_tokens = len(prompt) // 4
        completion_tokens = len(generated_text) // 4

        self._send_json(200, {
            "id": f"chatcmpl-{int(time.time())}",
            "object": "chat.completion",
            "created": int(time.time()),
            "model": req.get("model", "dispatchAI"),
            "choices": [{
                "index": 0,
                "message": {"role": "assistant", "content": generated_text},
                "finish_reason": "stop",
            }],
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
            "phone_info": {
                "serial": PHONE_SERIAL,
                "generation_tps": gen_tps,
                "prompt_tps": prompt_tps,
            },
        })

    def do_GET(self):
        """Handle ``GET /health`` with a small JSON status document; 404 otherwise."""
        if self.path == "/health":
            self._send_json(200, {"status": "ok", "phone": PHONE_SERIAL, "port": PORT})
        else:
            self.send_error(404)

    def build_prompt(self, messages, fmt):
        """Render *messages* with a chat template and a trailing assistant header.

        ``fmt`` selects the template: ``"llama-3"``, ``"gemma"``, or anything
        else for ChatML.  Roles a template does not know are skipped for
        llama-3 and gemma; ChatML emits every role verbatim.  The header that
        cues the model to answer is appended exactly once, after the last
        message.  Returns ``""`` when *messages* is empty.
        """
        if not messages:
            return ""

        parts = []
        for msg in messages:
            role = msg.get("role", "user")
            content = msg.get("content", "")
            if fmt == "llama-3":
                if role == "system":
                    parts.append(
                        f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{content}<|eot_id|>"
                    )
                elif role == "user":
                    parts.append(f"<|start_header_id|>user<|end_header_id|>\n{content}<|eot_id|>")
                elif role == "assistant":
                    parts.append(f"<|start_header_id|>assistant<|end_header_id|>\n{content}<|eot_id|>")
            elif fmt == "gemma":
                if role == "user":
                    parts.append(f"<start_of_turn>user\n{content}<end_of_turn>\n")
                elif role == "assistant":
                    parts.append(f"<start_of_turn>model\n{content}<end_of_turn>\n")
            else:  # chatml
                parts.append(f"<|im_start|>{role}\n{content}<|im_end|>\n")

        if fmt == "llama-3":
            parts.append("<|start_header_id|>assistant<|end_header_id|>\n")
        elif fmt == "gemma":
            parts.append("<start_of_turn>model\n")
        else:
            parts.append("<|im_start|>assistant\n")
        return "".join(parts)

    def log_message(self, format, *args):
        """Suppress the default per-request logging to stderr."""
        pass  # Suppress logs

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _send_json(self, status, payload):
        """Serialize *payload* as JSON and send it with the given HTTP status."""
        data = json.dumps(payload).encode()
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(data)))
        self.end_headers()
        self.wfile.write(data)

    @staticmethod
    def _status_text(exc):
        """Return a short single-line ASCII rendering of *exc* for an HTTP status line."""
        flat = " ".join(str(exc).split())[:200]
        return flat.encode("ascii", "replace").decode("ascii")

    @staticmethod
    def _message_text(msg):
        """Return the message's ``content`` when it is a string, else ``""``."""
        content = msg.get("content", "")
        return content if isinstance(content, str) else ""

    @classmethod
    def _plain_prompt(cls, messages):
        """Build the template-free prompt sent to llama-cli.

        Uses the most recent user message (``"Hello"`` when there is none or
        it is empty).  A single-message conversation is passed through as a
        raw continuation prompt; longer ones become ``User: ... Assistant:``,
        preceded by the first system message when present.
        """
        user_message = next(
            (cls._message_text(m) for m in reversed(messages) if m.get("role") == "user"),
            "",
        ) or "Hello"

        if len(messages) <= 1:
            return user_message

        system_msg = next(
            (cls._message_text(m) for m in messages if m.get("role") == "system"),
            "",
        )
        if system_msg:
            return f"{system_msg}\n\nUser: {user_message}\nAssistant:"
        return f"User: {user_message}\nAssistant:"

    @staticmethod
    def _llama_command(model_path, prompt, max_tokens, log_disable=True):
        """Return the device-side shell command line that runs llama-cli once.

        The prompt and model path are quoted with ``shlex.quote`` so request
        text can never break out of the ``-p`` argument.
        """
        flags = "--no-display-prompt --log-disable" if log_disable else "--no-display-prompt"
        return (
            'cd /data/local/tmp && LD_LIBRARY_PATH=/data/local/tmp '
            'timeout 60 ./llama-cli '
            f'-m {shlex.quote(model_path)} '
            f'-p {shlex.quote(prompt)} '
            f'-n {int(max_tokens)} '
            '-t 4 '
            '-st '
            f'{flags} 2>&1'
        )

    def _run_on_phone(self, shell_command):
        """Run *shell_command* via ``adb shell`` and return stdout + stderr as text.

        Raises ``subprocess.TimeoutExpired`` after 90 seconds.
        """
        result = subprocess.run(
            ["adb", "-s", PHONE_SERIAL, "shell", shell_command],
            env=ENV, capture_output=True, timeout=90,
            # Decode explicitly: the host locale codec may not be UTF-8.
            text=True, encoding="utf-8", errors="replace",
        )
        return result.stdout + result.stderr

    def _generate(self, prompt, messages, max_tokens):
        """Run inference and return ``(text, generation_tps, prompt_tps)``.

        When the scrubbed output of the first run is shorter than five
        characters, llama-cli is run a second time with the last message's
        raw content as the prompt (and without ``--log-disable``).
        """
        output = self._run_on_phone(self._llama_command(MODEL_PATH, prompt, max_tokens))
        gen_tps = self._parse_tps(self._GEN_TPS_RE, output)
        prompt_tps = self._parse_tps(self._PROMPT_TPS_RE, output)

        text = self._clean_primary_output(output)
        if len(text) < 5:
            raw_prompt = self._message_text(messages[-1]) if messages else "Hello"
            raw_output = self._run_on_phone(
                self._llama_command(MODEL_PATH, raw_prompt, max_tokens, log_disable=False)
            )
            text = self._clean_raw_output(raw_output)

        return self._strip_prompt_echo(text, messages), gen_tps, prompt_tps

    @staticmethod
    def _parse_tps(pattern, output):
        """Return the tokens-per-second figure matched by *pattern*, or 0."""
        match = pattern.search(output)
        if not match:
            return 0
        try:
            return float(match.group(1))
        except ValueError:  # e.g. "1.2.3" satisfies [\d.]+ but is not a number
            return 0

    @staticmethod
    def _is_noise_line(line):
        """Tell whether *line* is loader/banner/prompt chatter rather than model text."""
        stripped = line.strip()
        return (
            not stripped
            or stripped == '>'
            or stripped.startswith('/')
            or 'Loading model' in line
            or 'llama_context' in line
            or 'llama_kv_cache' in line
            or 'modalities' in line
            or 'available commands' in line
            or ('build' in line and ':' in line)
            or ('model' in line and ':' in line and 'dispatchAI' not in line)
        )

    @classmethod
    def _clean_primary_output(cls, output):
        """Extract the model's reply from the first llama-cli run.

        Reads lines up to the first stats/exit line, drops noise lines, strips
        spinner, block-drawing and template-marker characters, then removes
        any remaining ``<...>`` tags, bare marker names and a leading role
        label, and collapses whitespace.
        """
        kept = []
        for line in output.split('\n'):
            # Everything from the stats line onward is not model output.
            if 't/s' in line or 'Exiting' in line:
                break
            if cls._is_noise_line(line):
                continue

            clean = cls._SPINNER_RE.sub('', line)
            clean = clean.replace('\b', '')
            clean = cls._BLOCKS_RE.sub('', clean)
            clean = cls._PIPE_MARKER_RE.sub('', clean)
            clean = cls._GEMMA_MARKER_RE.sub('', clean)
            clean = cls._LLAMA_MARKER_RE.sub('', clean)
            clean = clean.strip(' <>|')
            clean = clean.lstrip('> ').strip()
            if clean:
                kept.append(clean)

        text = ' '.join(kept).strip()
        text = cls._ANY_TAG_RE.sub('', text)
        text = cls._BARE_MARKER_RE.sub('', text)
        text = cls._ROLE_ECHO_RE.sub('', text)
        return cls._WHITESPACE_RE.sub(' ', text).strip()

    @classmethod
    def _clean_raw_output(cls, output):
        """Extract the model's reply from the fallback (raw completion) run."""
        kept = []
        for line in output.split('\n'):
            if 't/s' in line or 'Exiting' in line:
                break
            if 'Loading' in line or 'llama_' in line or 'build' in line:
                continue
            if 'available commands' in line or line.strip().startswith('/'):
                continue
            clean = cls._RAW_NOISE_RE.sub('', line).replace('\b', '').strip()
            # Single leftover characters (including a lone '>') are treated as noise.
            if len(clean) > 1:
                kept.append(clean)
        return ' '.join(kept).strip()

    @classmethod
    def _strip_prompt_echo(cls, text, messages):
        """Drop everything up to the last echo of a request message in *text*.

        Only applied to texts longer than ten characters.  The newest message
        whose content occurs in *text* decides the cut; the text is left
        alone when nothing would remain after the cut.
        """
        if len(text) <= 10:
            return text
        for msg in reversed(messages):
            content = cls._message_text(msg)
            if content and content in text:
                after = text[text.rfind(content) + len(content):].strip()
                return after or text
        return text

if __name__ == "__main__":
    # Script entry point: validate arguments, confirm the phone is reachable
    # over adb, then serve HTTP requests until interrupted.
    if not PHONE_SERIAL:
        print("Usage: python phone_proxy_v2.py <serial> [port]")
        sys.exit(1)

    # Check phone
    try:
        result = subprocess.run(["adb", "devices"], capture_output=True, text=True, timeout=10, env=ENV)
    except (OSError, subprocess.TimeoutExpired) as exc:
        # adb missing from PATH, not executable, or hung.
        print(f"Could not run 'adb devices': {exc}")
        sys.exit(1)

    # `adb devices` lists one "<serial> <state>" pair per line. Require an
    # exact serial match in state "device"; a substring test would also accept
    # offline/unauthorized phones and serials that merely contain ours.
    attached = {
        fields[0]
        for fields in (line.split() for line in result.stdout.splitlines())
        if len(fields) >= 2 and fields[1] == "device"
    }
    if PHONE_SERIAL not in attached:
        print(f"Phone {PHONE_SERIAL} not found")
        sys.exit(1)

    print(f"Phone proxy v2 on port {PORT} for {PHONE_SERIAL}")
    server = HTTPServer(("0.0.0.0", PORT), PhoneHandler)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        pass  # Ctrl-C is the normal way to stop the proxy
    finally:
        # serve_forever() has already returned here; release the listening socket.
        server.server_close()