File size: 13,398 Bytes
0ad01db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
"""
dispatchAI Phone Proxy v2 — Clean output extraction
Instead of parsing the messy llama-cli output, we use --log-disable 
and capture only the response text between known markers.
"""
import json
import os
import re
import shlex
import subprocess
import sys
import time
from http.server import HTTPServer, BaseHTTPRequestHandler

PHONE_SERIAL = sys.argv[1] if len(sys.argv) > 1 else ""
PORT = int(sys.argv[2]) if len(sys.argv) > 2 else 5000
LLAMA_CLI = "/data/local/tmp/llama-cli"
MODEL_PATH = "/data/local/tmp/model.gguf"
ENV = os.environ.copy()
ENV["MSYS_NO_PATHCONV"] = "1"

class PhoneHandler(BaseHTTPRequestHandler):
    """HTTP handler that relays chat-completion requests to llama-cli on a phone.

    ``POST /v1/chat/completions`` builds a plain-text prompt from the request
    messages, runs llama-cli on the device through ``adb shell``, strips
    loader/spinner/template noise from the captured output and answers with
    an OpenAI-style ``chat.completion`` JSON document.
    ``GET /health`` reports the configured serial and port.

    Reads the module-level ``PHONE_SERIAL``, ``PORT``, ``MODEL_PATH`` and
    ``ENV`` values.
    """

    # Block / box-drawing glyphs of the loading animation. Written as
    # escapes so the character class survives a re-encoding of this file
    # (a mis-encoded class would strip ordinary letters and punctuation).
    _BLOCK_CHARS = (
        "\u2584\u2588\u2580\u258c\u2590\u2592\u2591"
        "\u2502\u2551\u2554\u2557\u255a\u255d\u2550"
    )
    # NOTE(review): this also deletes every '|', '/', '\\' and '-' that the
    # model itself generated (e.g. "well-known" -> "wellknown"). Kept as-is
    # because the right fix depends on llama-cli's real spinner output.
    _SPINNER_RE = re.compile(r"[|/\\\-]")
    _BLOCK_RE = re.compile("[" + _BLOCK_CHARS + "]")
    _RAW_NOISE_RE = re.compile(r"[|/\\\-" + _BLOCK_CHARS + "]")
    _BARE_MARKER_RE = re.compile(
        r"\beot_id\b|\bbegin_of_text\b|\bstart_header_id\b"
        r"|\bend_header_id\b|\bim_start\b|\bim_end\b"
    )

    def do_POST(self):
        """Handle ``POST /v1/chat/completions``.

        Request JSON fields used: ``messages`` (list of ``{"role", "content"}``
        objects), ``max_tokens`` (default 100) and ``model`` (echoed back).
        ``temperature`` and ``chat_format`` are accepted but not used.

        Responds 404 for other paths, 400 for a malformed request, 504 when
        the adb call times out, 500 for any other failure, and 200 with the
        completion JSON otherwise.
        """
        if self.path != "/v1/chat/completions":
            self.send_error(404)
            return

        try:
            content_length = int(self.headers.get("Content-Length", 0))
        except ValueError:
            content_length = -1
        if content_length < 0:
            # A negative length would make rfile.read() wait for EOF.
            self.send_error(400, "Invalid Content-Length")
            return
        body = self.rfile.read(content_length)

        try:
            # JSONDecodeError and UnicodeDecodeError both derive from ValueError.
            req = json.loads(body)
        except ValueError:
            self.send_error(400, "Invalid JSON")
            return
        if not isinstance(req, dict):
            self.send_error(400, "Request body must be a JSON object")
            return

        messages = req.get("messages") or []
        if not isinstance(messages, list):
            self.send_error(400, "messages must be a list")
            return
        # Ignore entries that are not objects instead of crashing on .get().
        messages = [m for m in messages if isinstance(m, dict)]

        try:
            # max_tokens ends up inside a shell command line, so it must be
            # a plain integer and nothing else.
            max_tokens = int(req.get("max_tokens", 100))
        except (TypeError, ValueError, OverflowError):
            self.send_error(400, "max_tokens must be an integer")
            return

        # Build a simple raw-completion prompt (no chat template), so that
        # template markers are not echoed back in the output. The prompt is
        # based on the most recent user message.
        user_message = ""
        for msg in reversed(messages):
            if msg.get("role") == "user":
                user_message = self._content(msg)
                break
        if not user_message:
            user_message = "Hello"

        if len(messages) > 1:
            # Multi-message request: first system message (if any) + user turn.
            system_msg = next(
                (self._content(m) for m in messages if m.get("role") == "system"),
                "",
            )
            if system_msg:
                prompt = f"{system_msg}\n\nUser: {user_message}\nAssistant:"
            else:
                prompt = f"User: {user_message}\nAssistant:"
        else:
            # Single message: hand the text to the model unchanged.
            prompt = user_message

        try:
            # Backslashes are doubled for this first attempt, as before.
            # NOTE(review): presumably this protects them from llama-cli's
            # own escape processing; the fallback below does not do it.
            output = self._run_llama(
                prompt.replace("\\", "\\\\"), max_tokens, log_disable=True
            )

            gen_tps = self._parse_speed("Generation", output)
            prompt_tps = self._parse_speed("Prompt", output)

            generated_text = self._extract_chat_text(output)

            if len(generated_text) < 5:
                # Nothing usable survived the cleanup: retry with the last
                # message as a raw prompt and a lighter cleanup pass.
                raw_prompt = self._content(messages[-1]) if messages else "Hello"
                output2 = self._run_llama(raw_prompt, max_tokens, log_disable=False)
                generated_text = self._extract_raw_text(output2)

            generated_text = self._strip_prompt_echo(generated_text, messages)

            # Rough token estimate: about four characters per token.
            prompt_tokens = len(prompt) // 4
            completion_tokens = len(generated_text) // 4
            total_tokens = prompt_tokens + completion_tokens

            response = {
                "id": f"chatcmpl-{int(time.time())}",
                "object": "chat.completion",
                "created": int(time.time()),
                "model": req.get("model", "dispatchAI"),
                "choices": [{
                    "index": 0,
                    "message": {"role": "assistant", "content": generated_text},
                    "finish_reason": "stop",
                }],
                "usage": {
                    "prompt_tokens": prompt_tokens,
                    "completion_tokens": completion_tokens,
                    "total_tokens": total_tokens,
                },
                "phone_info": {
                    "serial": PHONE_SERIAL,
                    "generation_tps": gen_tps,
                    "prompt_tps": prompt_tps,
                },
            }
            self._send_json(response)

        except subprocess.TimeoutExpired:
            self.send_error(504, "Inference timed out")
        except Exception as e:
            self.send_error(500, self._status_text(e))

    def do_GET(self):
        """Handle ``GET /health`` with a small JSON status; 404 otherwise."""
        if self.path == "/health":
            self._send_json({
                "status": "ok", "phone": PHONE_SERIAL, "port": PORT
            })
        else:
            self.send_error(404)

    def build_prompt(self, messages, fmt):
        """Render *messages* with a chat template and return the prompt string.

        *fmt* selects the template: ``"llama-3"``, ``"gemma"``, or anything
        else for ChatML. llama-3 renders system/user/assistant roles, gemma
        renders user/assistant only, ChatML renders every role verbatim.
        The assistant generation header is appended exactly once, after the
        last message. An empty *messages* list yields ``""``.
        """
        if not messages:
            return ""

        parts = []
        for msg in messages:
            role = msg.get("role", "user")
            content = msg.get("content", "")
            if fmt == "llama-3":
                if role == "system":
                    parts.append(
                        f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{content}<|eot_id|>"
                    )
                elif role in ("user", "assistant"):
                    parts.append(
                        f"<|start_header_id|>{role}<|end_header_id|>\n{content}<|eot_id|>"
                    )
            elif fmt == "gemma":
                if role == "user":
                    parts.append(f"<start_of_turn>user\n{content}<end_of_turn>\n")
                elif role == "assistant":
                    parts.append(f"<start_of_turn>model\n{content}<end_of_turn>\n")
            else:  # chatml
                parts.append(f"<|im_start|>{role}\n{content}<|im_end|>\n")

        # Open the assistant turn once, so the model continues from there.
        if fmt == "llama-3":
            parts.append("<|start_header_id|>assistant<|end_header_id|>\n")
        elif fmt == "gemma":
            parts.append("<start_of_turn>model\n")
        else:
            parts.append("<|im_start|>assistant\n")
        return "".join(parts)

    def log_message(self, format, *args):
        """Suppress the base class's per-request logging."""
        pass

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _content(msg):
        """Return a message's ``content`` as text ("" when missing or None)."""
        value = msg.get("content", "")
        if value is None:
            return ""
        return value if isinstance(value, str) else str(value)

    @staticmethod
    def _status_text(exc):
        """Turn an exception into a single-line ASCII reason phrase (<= 200 chars)."""
        flat = " ".join(str(exc).split())[:200]
        return flat.encode("ascii", "replace").decode("ascii")

    @staticmethod
    def _parse_speed(label, output):
        """Return the ``<label>: N t/s`` figure found in *output*, or 0."""
        match = re.search(label + r":\s*([\d.]+)\s*t/s", output)
        return float(match.group(1)) if match else 0

    @staticmethod
    def _build_shell_command(prompt_text, max_tokens, log_disable, model_path):
        """Return the device-side shell command line that runs llama-cli.

        The prompt and model path are quoted with ``shlex.quote`` so request
        text cannot break out of the ``-p`` argument; *max_tokens* is forced
        to an integer. *log_disable* adds ``--log-disable``.
        """
        log_flag = "--log-disable " if log_disable else ""
        return (
            "cd /data/local/tmp && LD_LIBRARY_PATH=/data/local/tmp "
            "timeout 60 ./llama-cli "
            f"-m {shlex.quote(model_path)} "
            f"-p {shlex.quote(prompt_text)} "
            f"-n {int(max_tokens)} "
            "-t 4 "
            "-st "
            "--no-display-prompt "
            f"{log_flag}2>&1"
        )

    def _run_llama(self, prompt_text, max_tokens, *, log_disable):
        """Run llama-cli on the phone via adb and return stdout + stderr.

        Raises ``subprocess.TimeoutExpired`` when adb does not finish within
        90 seconds.
        """
        cmd = self._build_shell_command(
            prompt_text, max_tokens, log_disable, MODEL_PATH
        )
        result = subprocess.run(
            ["adb", "-s", PHONE_SERIAL, "shell", cmd],
            env=ENV, capture_output=True, text=True, timeout=90,
            # Decode as UTF-8 regardless of the host locale; a sequence cut
            # off mid-character becomes U+FFFD instead of an exception.
            encoding="utf-8", errors="replace",
        )
        return result.stdout + result.stderr

    @classmethod
    def _extract_chat_text(cls, output):
        """Pull the model's reply out of the first (``--log-disable``) run."""
        kept = []
        for line in output.split('\n'):
            # The stats / exit line marks the end of the reply.
            if 't/s' in line or 'Exiting' in line:
                break

            stripped = line.strip()
            if not stripped or stripped == '>' or stripped.startswith('/'):
                continue
            # Loader banner and context/kv-cache chatter.
            if any(tag in line for tag in (
                'Loading model', 'llama_context', 'llama_kv_cache',
                'modalities', 'available commands',
            )):
                continue
            if ':' in line and (
                'build' in line
                or ('model' in line and 'dispatchAI' not in line)
            ):
                continue

            clean = cls._SPINNER_RE.sub('', line)
            clean = clean.replace('\b', '')
            clean = cls._BLOCK_RE.sub('', clean)
            # Prompt template markers.
            clean = re.sub(r'<\|[^>]+\|>', '', clean)
            clean = re.sub(r'<start_of_turn>|<end_of_turn>', '', clean)
            clean = re.sub(
                r'eot_id|begin_of_text|start_header_id|end_header_id', '', clean
            )
            clean = clean.strip(' <>|')
            clean = clean.lstrip('> ').strip()

            if clean:
                kept.append(clean)

        return cls._strip_template_markers(' '.join(kept).strip())

    @classmethod
    def _strip_template_markers(cls, text):
        """Final cleanup: drop remaining markers, a leading role label, extra spaces."""
        # Anything in angle brackets (<eot_id>, <|eot_id|>, ...).
        text = re.sub(r'<[^>]*>', '', text)
        # Bare marker names left without brackets.
        text = cls._BARE_MARKER_RE.sub('', text)
        # A role label echoed at the very start of the text.
        text = re.sub(r'^(User:|Assistant:|System:)\s*', '', text)
        return re.sub(r'\s+', ' ', text).strip()

    @classmethod
    def _extract_raw_text(cls, output):
        """Lighter cleanup used for the raw-completion fallback run."""
        kept = []
        for line in output.split('\n'):
            if 't/s' in line or 'Exiting' in line:
                break
            if 'Loading' in line or 'llama_' in line or 'build' in line:
                continue
            if 'available commands' in line or line.strip().startswith('/'):
                continue
            clean = cls._RAW_NOISE_RE.sub('', line).replace('\b', '').strip()
            if clean and len(clean) > 1 and clean != '>':
                kept.append(clean)
        return ' '.join(kept).strip()

    @classmethod
    def _strip_prompt_echo(cls, generated_text, messages):
        """Drop an echoed request message from the front of the reply.

        Only texts longer than 10 characters are examined. The newest message
        whose content occurs in the text decides: everything up to and
        including its last occurrence is removed, unless nothing would remain.
        """
        if len(generated_text) > 10:
            for msg in reversed(messages):
                content = cls._content(msg)
                if content and content in generated_text:
                    idx = generated_text.rfind(content)
                    after = generated_text[idx + len(content):].strip()
                    if after:
                        generated_text = after
                    break
        return generated_text

    def _send_json(self, payload, status=200):
        """Send *payload* as a JSON response with an explicit Content-Length."""
        body = json.dumps(payload).encode()
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

if __name__ == "__main__":
    # Script entry point: validate the arguments, make sure adb lists the
    # phone, then serve HTTP until interrupted.
    if not PHONE_SERIAL:
        print("Usage: python phone_proxy_v2.py <serial> [port]")
        sys.exit(1)

    # Check phone
    try:
        result = subprocess.run(["adb", "devices"], capture_output=True, text=True, timeout=10, env=ENV)
    except (OSError, subprocess.TimeoutExpired) as exc:
        # adb missing from PATH, not executable, or hung.
        print(f"Could not run adb: {exc}")
        sys.exit(1)

    # Device rows of `adb devices` are "<serial>\t<state>". Compare the serial
    # column exactly, so a serial that is merely a fragment of another
    # device's serial is not accepted.
    device_states = {}
    for row in result.stdout.splitlines():
        serial, tab, state = row.partition("\t")
        if tab:
            device_states[serial.strip()] = state.strip()

    if PHONE_SERIAL not in device_states:
        print(f"Phone {PHONE_SERIAL} not found")
        sys.exit(1)
    if device_states[PHONE_SERIAL] != "device":
        # Listed but not ready (e.g. offline / unauthorized): warn and carry
        # on, as before; requests will fail until the phone becomes usable.
        print(f"Warning: phone {PHONE_SERIAL} is in state '{device_states[PHONE_SERIAL]}'")

    print(f"Phone proxy v2 on port {PORT} for {PHONE_SERIAL}")
    server = HTTPServer(("0.0.0.0", PORT), PhoneHandler)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        pass
    finally:
        # serve_forever() has already returned at this point; release the
        # listening socket.
        server.server_close()