prashantmatlani committed
Commit b83d944 · 1 Parent(s): 1e8c4d5

groq inference

.gitignore CHANGED
@@ -16,5 +16,3 @@ __pycache__/
 # OS
 .DS_Store
 Thumbs.db
-
-./CoderG01.docx
 
core_logic.py CHANGED
@@ -1,53 +1,20 @@

-# ./core_logic_hybrid.py -> Token-safe
-
-"""
-
-Hybrid: Local LLM with HF UI
-
-"Master Stroke" for sharing app while keeping compute costs at zero; with UI on Hugging Face, the app "calls home" - the local PC - for answers.
-
-We expose local Ollama, via the secret "LOCAL_LLM_URL" as "The Tunnel", a secure bridge between the Hugging Face-hosted UI and the local LLM. By default, Ollama only listens to localhost, so we tell it to accept external traffic from the tunnel:
-. The UI sends user messages to the Tunnel, which forwards them to the local Ollama instance
-. Ollama processes the request and sends the response back through the Tunnel to the UI."
-"""
+# ./core_logic.py -> Token-safe

 import os
-from openai import OpenAI
+from groq import Groq
 from tools import web_search, parse_file

-# Hybrid bridge - Sanitized URL to prevent double slashes
-tunnel_url = os.getenv("LOCAL_LLM_URL", "").rstrip("/")
+client = Groq(api_key=os.getenv("GROQ_API_KEY"))
+model = "llama-3.1-8b-instant"

-client = OpenAI(
-    base_url=f"{tunnel_url}/v1",
-    api_key="ollama"
+# Compressed for token efficiency
+SYSTEM_PROMPT = (
+    "You're a Full-stack AI Engineering Genius. "
+    "Expert in Python (latest production version), Agentic Loops, and FastAPI, NodeJS, HTML, CSS. "
+    "Provide production-ready code with needed comments. Analyze files when provided. Be concise."
 )

-model = "gemma4:latest"
-
-SYSTEM_PROMPT = """
-You are the 'Silicon Architect' — a full-stack, master-stroke creative genius in AI Engineering and Technical Architecture.
-Your goal is to provide production-grade, highly optimized solutions for web and mobile AI applications.
-
-Expertise: Python (latest production version), Agentic Loops, FastAPI, and Scalable Architecture.
-Provide production-ready code and rigorous technical research with appropriate comments. Analyze files when provided. Be concise.
-
-CORE DIRECTIVES:
-1. ARCHITECTURAL RIGOR: Always consider scalability, async patterns, and state management.
-2. AGENTIC EXPERTISE: You understand recurrent-depth simulations, tool-calling, and autonomous loops.
-3. CODE QUALITY: Write clean, PEP 8 compliant, and secure Python/JS code.
-4. INNOVATION: Suggest the latest libraries and frameworks (FastAPI, LangGraph, Pydantic AI; but not limited to these).
-5. RESEARCH: If the user asks about new tech, use your Web Search capability to provide factual, up-to-date documentation.
-
-PERSONALITY:
-1. FRANK/POLITE: Disagree with the user, if needed; never resort to sycophancy, and suggest better alternatives.
-2. HUMBLE: Apologize when mistaken.
-3. FIRST PRINCIPLES: Base your responses and reasoning in Richard Feynman’s first principles thinking. Break down complex problems into fundamental truths and reason up from there.
-
-When a user provides files, analyze the code structure and logic before proposing changes.
-"""
-
 def chat_function(message, history):
     user_text = message.get("text", "")
     files = message.get("files", [])
@@ -61,7 +28,7 @@ def chat_function(message, history):

     # TRUNCATE FILE CONTEXT: Max ~3000 tokens (approx 12,000 chars)
     if len(context_from_files) > 12000:
-        context_from_files = context_from_files[:12000] + "\n...[File Content Truncated]..."
+        context_from_files = context_from_files[:12000] + "\n...[File Content Truncated for TPM Limits]..."

     # 2. Research Trigger
     if any(keyword in user_text.lower() for keyword in ["search", "docs", "latest"]):
@@ -73,7 +40,7 @@ def chat_function(message, history):
     # 3. Build Messages with History Slicing
     messages = [{"role": "system", "content": SYSTEM_PROMPT}]

-    # Keep last 3 turns for context stability
+    # ONLY KEEP LAST 3 TURNS: This is the 'Master Stroke' for staying under 6k TPM
     for turn in history[-3:]:
         messages.append({"role": turn["role"], "content": turn["content"]})

@@ -84,17 +51,15 @@ def chat_function(message, history):
             model=model,
             messages=messages,
             stream=True,
-            temperature=0.2, # Zero for architectural precision; incremented for creative architecture
-            max_tokens=1024
+            temperature=0.0,
+            max_tokens=1024  # Limit response size to prevent mid-stream cuts
         )

         response_text = ""
         for chunk in completion:
-            # Check for valid delta content to avoid metadata crashes
-            if chunk.choices and hasattr(chunk.choices[0].delta, 'content'):
+            if chunk.choices and chunk.choices[0].delta.content:
                 token = chunk.choices[0].delta.content
-                if token:
-                    response_text += token
-                    yield response_text
+                response_text += token
+                yield response_text
     except Exception as e:
-        yield f"Silicon Error: {str(e)}"
+        yield f"Error: {str(e)}"
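A minimal, untested sketch of how the new Groq-backed chat_function could be driven outside the Gradio UI, e.g. as a quick smoke test. It assumes GROQ_API_KEY is exported and that core_logic.py and tools.py are importable; the message dict and history format simply mirror what the function already expects.

# Hypothetical smoke test for the Groq path (not part of this commit).
import os
from core_logic import chat_function

assert os.getenv("GROQ_API_KEY"), "export GROQ_API_KEY before running"

history = [
    {"role": "user", "content": "What stack does this Space use?"},
    {"role": "assistant", "content": "Gradio UI with a Groq inference backend."},
]
message = {"text": "Show a minimal FastAPI health-check endpoint.", "files": []}

# chat_function is a generator: each yield is the cumulative response so far,
# which is what a streaming Gradio ChatInterface consumes.
final = ""
for partial in chat_function(message, history):
    final = partial
print(final)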
core_logic_lw.py → core_logic_hybrid.py RENAMED
@@ -1,20 +1,53 @@

-# ./core_logic.py -> Token-safe
+# ./core_logic_hybrid.py -> Token-safe
+
+"""
+
+Hybrid: Local LLM with HF UI
+
+"Master Stroke" for sharing app while keeping compute costs at zero; with UI on Hugging Face, the app "calls home" - the local PC - for answers.
+
+We expose local Ollama, via the secret "LOCAL_LLM_URL" as "The Tunnel", a secure bridge between the Hugging Face-hosted UI and the local LLM. By default, Ollama only listens to localhost, so we tell it to accept external traffic from the tunnel:
+. The UI sends user messages to the Tunnel, which forwards them to the local Ollama instance
+. Ollama processes the request and sends the response back through the Tunnel to the UI."
+"""

 import os
-from groq import Groq
+from openai import OpenAI
 from tools import web_search, parse_file

-client = Groq(api_key=os.getenv("GROQ_API_KEY"))
-model = "llama-3.1-8b-instant"
+# Hybrid bridge - Sanitized URL to prevent double slashes
+tunnel_url = os.getenv("LOCAL_LLM_URL", "").rstrip("/")

-# Compressed for token efficiency
-SYSTEM_PROMPT = (
-    "You're a Full-stack AI Engineering Genius. "
-    "Expert in Python (latest production version), Agentic Loops, and FastAPI, NodeJS, HTML, CSS. "
-    "Provide production-ready code with needed comments. Analyze files when provided. Be concise."
+client = OpenAI(
+    base_url=f"{tunnel_url}/v1",
+    api_key="ollama"
 )

+model = "gemma4:latest"
+
+SYSTEM_PROMPT = """
+You are the 'Silicon Architect' — a full-stack, master-stroke creative genius in AI Engineering and Technical Architecture.
+Your goal is to provide production-grade, highly optimized solutions for web and mobile AI applications.
+
+Expertise: Python (latest production version), Agentic Loops, FastAPI, and Scalable Architecture.
+Provide production-ready code and rigorous technical research with appropriate comments. Analyze files when provided. Be concise.
+
+CORE DIRECTIVES:
+1. ARCHITECTURAL RIGOR: Always consider scalability, async patterns, and state management.
+2. AGENTIC EXPERTISE: You understand recurrent-depth simulations, tool-calling, and autonomous loops.
+3. CODE QUALITY: Write clean, PEP 8 compliant, and secure Python/JS code.
+4. INNOVATION: Suggest the latest libraries and frameworks (FastAPI, LangGraph, Pydantic AI; but not limited to these).
+5. RESEARCH: If the user asks about new tech, use your Web Search capability to provide factual, up-to-date documentation.
+
+PERSONALITY:
+1. FRANK/POLITE: Disagree with the user, if needed; never resort to sycophancy, and suggest better alternatives.
+2. HUMBLE: Apologize when mistaken.
+3. FIRST PRINCIPLES: Base your responses and reasoning in Richard Feynman’s first principles thinking. Break down complex problems into fundamental truths and reason up from there.
+
+When a user provides files, analyze the code structure and logic before proposing changes.
+"""
+
 def chat_function(message, history):
     user_text = message.get("text", "")
     files = message.get("files", [])
@@ -28,7 +61,7 @@ def chat_function(message, history):

     # TRUNCATE FILE CONTEXT: Max ~3000 tokens (approx 12,000 chars)
     if len(context_from_files) > 12000:
-        context_from_files = context_from_files[:12000] + "\n...[File Content Truncated for TPM Limits]..."
+        context_from_files = context_from_files[:12000] + "\n...[File Content Truncated]..."

     # 2. Research Trigger
     if any(keyword in user_text.lower() for keyword in ["search", "docs", "latest"]):
@@ -40,7 +73,7 @@ def chat_function(message, history):
     # 3. Build Messages with History Slicing
     messages = [{"role": "system", "content": SYSTEM_PROMPT}]

-    # ONLY KEEP LAST 3 TURNS: This is the 'Master Stroke' for staying under 6k TPM
+    # Keep last 3 turns for context stability
     for turn in history[-3:]:
         messages.append({"role": turn["role"], "content": turn["content"]})

@@ -51,15 +84,17 @@ def chat_function(message, history):
             model=model,
             messages=messages,
             stream=True,
-            temperature=0.0,
-            max_tokens=1024  # Limit response size to prevent mid-stream cuts
+            temperature=0.2, # Zero for architectural precision; incremented for creative architecture
+            max_tokens=1024
         )

         response_text = ""
         for chunk in completion:
-            if chunk.choices and chunk.choices[0].delta.content:
+            # Check for valid delta content to avoid metadata crashes
+            if chunk.choices and hasattr(chunk.choices[0].delta, 'content'):
                 token = chunk.choices[0].delta.content
-                response_text += token
-                yield response_text
+                if token:
+                    response_text += token
+                    yield response_text
     except Exception as e:
-        yield f"Error: {str(e)}"
+        yield f"Silicon Error: {str(e)}"
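For the hybrid file, a rough sketch of the client side of the tunnel its docstring describes, assuming the public tunnel URL (ngrok, Cloudflare Tunnel, or similar) is stored in the LOCAL_LLM_URL secret and the local Ollama instance was started to accept external traffic (e.g. OLLAMA_HOST=0.0.0.0 ollama serve) with the referenced model pulled.

# Hypothetical standalone check of the hybrid bridge (not part of this commit).
# Ollama exposes an OpenAI-compatible API under /v1, so the stock OpenAI client works.
import os
from openai import OpenAI

tunnel_url = os.getenv("LOCAL_LLM_URL", "").rstrip("/")  # same sanitization as core_logic_hybrid.py

client = OpenAI(
    base_url=f"{tunnel_url}/v1",
    api_key="ollama",  # Ollama ignores the key, but the client requires a non-empty value
)

completion = client.chat.completions.create(
    model="gemma4:latest",  # model tag taken from core_logic_hybrid.py; it must exist in the local Ollama
    messages=[{"role": "user", "content": "ping"}],
    stream=True,
)
for chunk in completion:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)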