prashantmatlani committed
Commit b83d944 · 1 Parent(s): 1e8c4d5

groq inference

.gitignore CHANGED
@@ -16,5 +16,3 @@ __pycache__/
 # OS
 .DS_Store
 Thumbs.db
-
-./CoderG01.docx
 
core_logic.py CHANGED
@@ -1,53 +1,20 @@

-# ./core_logic_hybrid.py -> Token-safe
-
-"""
-
-Hybrid: Local LLM with HF UI
-
-"Master Stroke" for sharing app while keeping compute costs at zero; with UI on Hugging Face, the app "calls home" - the local PC - for answers.
-
-We expose local Ollama, via the secret "LOCAL_LLM_URL" as "The Tunnel", a secure bridge between the Hugging Face-hosted UI and the local LLM. By default, Ollama only listens to localhost, so we tell it to accept external traffic from the tunnel:
-. The UI sends user messages to the Tunnel, which forwards them to the local Ollama instance
-. Ollama processes the request and sends the response back through the Tunnel to the UI."
-"""
+# ./core_logic.py -> Token-safe

 import os
-from openai import OpenAI
+from groq import Groq
 from tools import web_search, parse_file

-# Hybrid bridge - Sanitized URL to prevent double slashes
-tunnel_url = os.getenv("LOCAL_LLM_URL", "").rstrip("/")
+client = Groq(api_key=os.getenv("GROQ_API_KEY"))
+model = "llama-3.1-8b-instant"

-client = OpenAI(
-    base_url=f"{tunnel_url}/v1",
-    api_key="ollama"
+# Compressed for token efficiency
+SYSTEM_PROMPT = (
+    "You're a Full-stack AI Engineering Genius. "
+    "Expert in Python (latest production version), Agentic Loops, and FastAPI, NodeJS, HTML, CSS. "
+    "Provide production-ready code with needed comments. Analyze files when provided. Be concise."
 )

-model = "gemma4:latest"
-
-SYSTEM_PROMPT = """
-You are the 'Silicon Architect' — a full-stack, master-stroke creative genius in AI Engineering and Technical Architecture.
-Your goal is to provide production-grade, highly optimized solutions for web and mobile AI applications.
-
-Expertise: Python (latest production version), Agentic Loops, FastAPI, and Scalable Architecture.
-Provide production-ready code and rigorous technical research with appropriate comments. Analyze files when provided. Be concise.
-
-CORE DIRECTIVES:
-1. ARCHITECTURAL RIGOR: Always consider scalability, async patterns, and state management.
-2. AGENTIC EXPERTISE: You understand recurrent-depth simulations, tool-calling, and autonomous loops.
-3. CODE QUALITY: Write clean, PEP 8 compliant, and secure Python/JS code.
-4. INNOVATION: Suggest the latest libraries and frameworks (FastAPI, LangGraph, Pydantic AI; but not limited to these).
-5. RESEARCH: If the user asks about new tech, use your Web Search capability to provide factual, up-to-date documentation.
-
-PERSONALITY:
-1. FRANK/POLITE: Disagree with the user, if needed; never resort to sycophancy, and suggest better alternatives.
-2. HUMBLE: Apologize when mistaken.
-3. FIRST PRINCIPLES: Base your responses and reasoning in Richard Feynman’s first principles thinking. Break down complex problems into fundamental truths and reason up from there.
-
-When a user provides files, analyze the code structure and logic before proposing changes.
-"""
-
 def chat_function(message, history):
     user_text = message.get("text", "")
     files = message.get("files", [])
@@ -61,7 +28,7 @@ def chat_function(message, history):

     # TRUNCATE FILE CONTEXT: Max ~3000 tokens (approx 12,000 chars)
     if len(context_from_files) > 12000:
-        context_from_files = context_from_files[:12000] + "\n...[File Content Truncated]..."
+        context_from_files = context_from_files[:12000] + "\n...[File Content Truncated for TPM Limits]..."

     # 2. Research Trigger
     if any(keyword in user_text.lower() for keyword in ["search", "docs", "latest"]):
@@ -73,7 +40,7 @@ def chat_function(message, history):
     # 3. Build Messages with History Slicing
     messages = [{"role": "system", "content": SYSTEM_PROMPT}]

-    # Keep last 3 turns for context stability
+    # ONLY KEEP LAST 3 TURNS: This is the 'Master Stroke' for staying under 6k TPM
     for turn in history[-3:]:
         messages.append({"role": turn["role"], "content": turn["content"]})

@@ -84,17 +51,15 @@ def chat_function(message, history):
             model=model,
             messages=messages,
             stream=True,
-            temperature=0.2, # Zero for architectural precision; incremented for creative architecture
-            max_tokens=1024
+            temperature=0.0,
+            max_tokens=1024  # Limit response size to prevent mid-stream cuts
         )

         response_text = ""
         for chunk in completion:
-            # Check for valid delta content to avoid metadata crashes
-            if chunk.choices and hasattr(chunk.choices[0].delta, 'content'):
+            if chunk.choices and chunk.choices[0].delta.content:
                 token = chunk.choices[0].delta.content
-                if token:
-                    response_text += token
-                    yield response_text
+                response_text += token
+                yield response_text
     except Exception as e:
-        yield f"Silicon Error: {str(e)}"
+        yield f"Error: {str(e)}"
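A minimal, untested sketch of how the new Groq-backed chat_function could be driven outside the Gradio UI, e.g. as a quick smoke test. It assumes GROQ_API_KEY is exported and that core_logic.py and tools.py are importable; the message dict and history format simply mirror what the function already expects.

# Hypothetical smoke test for the Groq path (not part of this commit).
import os
from core_logic import chat_function

assert os.getenv("GROQ_API_KEY"), "export GROQ_API_KEY before running"

history = [
    {"role": "user", "content": "What stack does this Space use?"},
    {"role": "assistant", "content": "Gradio UI with a Groq inference backend."},
]
message = {"text": "Show a minimal FastAPI health-check endpoint.", "files": []}

# chat_function is a generator: each yield is the cumulative response so far,
# which is what a streaming Gradio ChatInterface consumes.
final = ""
for partial in chat_function(message, history):
    final = partial
print(final)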
core_logic_lw.py → core_logic_hybrid.py RENAMED
@@ -1,20 +1,53 @@

-# ./core_logic.py -> Token-safe
+# ./core_logic_hybrid.py -> Token-safe
+
+"""
+
+Hybrid: Local LLM with HF UI
+
+"Master Stroke" for sharing app while keeping compute costs at zero; with UI on Hugging Face, the app "calls home" - the local PC - for answers.
+
+We expose local Ollama, via the secret "LOCAL_LLM_URL" as "The Tunnel", a secure bridge between the Hugging Face-hosted UI and the local LLM. By default, Ollama only listens to localhost, so we tell it to accept external traffic from the tunnel:
+. The UI sends user messages to the Tunnel, which forwards them to the local Ollama instance
+. Ollama processes the request and sends the response back through the Tunnel to the UI."
+"""

 import os
-from groq import Groq
+from openai import OpenAI
 from tools import web_search, parse_file

-client = Groq(api_key=os.getenv("GROQ_API_KEY"))
-model = "llama-3.1-8b-instant"
+# Hybrid bridge - Sanitized URL to prevent double slashes
+tunnel_url = os.getenv("LOCAL_LLM_URL", "").rstrip("/")

-# Compressed for token efficiency
-SYSTEM_PROMPT = (
-    "You're a Full-stack AI Engineering Genius. "
-    "Expert in Python (latest production version), Agentic Loops, and FastAPI, NodeJS, HTML, CSS. "
-    "Provide production-ready code with needed comments. Analyze files when provided. Be concise."
+client = OpenAI(
+    base_url=f"{tunnel_url}/v1",
+    api_key="ollama"
 )

+model = "gemma4:latest"
+
+SYSTEM_PROMPT = """
+You are the 'Silicon Architect' — a full-stack, master-stroke creative genius in AI Engineering and Technical Architecture.
+Your goal is to provide production-grade, highly optimized solutions for web and mobile AI applications.
+
+Expertise: Python (latest production version), Agentic Loops, FastAPI, and Scalable Architecture.
+Provide production-ready code and rigorous technical research with appropriate comments. Analyze files when provided. Be concise.
+
+CORE DIRECTIVES:
+1. ARCHITECTURAL RIGOR: Always consider scalability, async patterns, and state management.
+2. AGENTIC EXPERTISE: You understand recurrent-depth simulations, tool-calling, and autonomous loops.
+3. CODE QUALITY: Write clean, PEP 8 compliant, and secure Python/JS code.
+4. INNOVATION: Suggest the latest libraries and frameworks (FastAPI, LangGraph, Pydantic AI; but not limited to these).
+5. RESEARCH: If the user asks about new tech, use your Web Search capability to provide factual, up-to-date documentation.
+
+PERSONALITY:
+1. FRANK/POLITE: Disagree with the user, if needed; never resort to sycophancy, and suggest better alternatives.
+2. HUMBLE: Apologize when mistaken.
+3. FIRST PRINCIPLES: Base your responses and reasoning in Richard Feynman’s first principles thinking. Break down complex problems into fundamental truths and reason up from there.
+
+When a user provides files, analyze the code structure and logic before proposing changes.
+"""
+
 def chat_function(message, history):
     user_text = message.get("text", "")
     files = message.get("files", [])
@@ -28,7 +61,7 @@ def chat_function(message, history):

     # TRUNCATE FILE CONTEXT: Max ~3000 tokens (approx 12,000 chars)
     if len(context_from_files) > 12000:
-        context_from_files = context_from_files[:12000] + "\n...[File Content Truncated for TPM Limits]..."
+        context_from_files = context_from_files[:12000] + "\n...[File Content Truncated]..."

     # 2. Research Trigger
     if any(keyword in user_text.lower() for keyword in ["search", "docs", "latest"]):
@@ -40,7 +73,7 @@ def chat_function(message, history):
     # 3. Build Messages with History Slicing
     messages = [{"role": "system", "content": SYSTEM_PROMPT}]

-    # ONLY KEEP LAST 3 TURNS: This is the 'Master Stroke' for staying under 6k TPM
+    # Keep last 3 turns for context stability
     for turn in history[-3:]:
         messages.append({"role": turn["role"], "content": turn["content"]})

@@ -51,15 +84,17 @@ def chat_function(message, history):
             model=model,
             messages=messages,
             stream=True,
-            temperature=0.0,
-            max_tokens=1024  # Limit response size to prevent mid-stream cuts
+            temperature=0.2, # Zero for architectural precision; incremented for creative architecture
+            max_tokens=1024
         )

         response_text = ""
         for chunk in completion:
-            if chunk.choices and chunk.choices[0].delta.content:
+            # Check for valid delta content to avoid metadata crashes
+            if chunk.choices and hasattr(chunk.choices[0].delta, 'content'):
                 token = chunk.choices[0].delta.content
-                response_text += token
-                yield response_text
+                if token:
+                    response_text += token
+                    yield response_text
     except Exception as e:
-        yield f"Error: {str(e)}"
+        yield f"Silicon Error: {str(e)}"
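For the hybrid file, a rough sketch of the client side of the tunnel its docstring describes, assuming the public tunnel URL (ngrok, Cloudflare Tunnel, or similar) is stored in the LOCAL_LLM_URL secret and the local Ollama instance was started to accept external traffic (e.g. OLLAMA_HOST=0.0.0.0 ollama serve) with the referenced model pulled.

# Hypothetical standalone check of the hybrid bridge (not part of this commit).
# Ollama exposes an OpenAI-compatible API under /v1, so the stock OpenAI client works.
import os
from openai import OpenAI

tunnel_url = os.getenv("LOCAL_LLM_URL", "").rstrip("/")  # same sanitization as core_logic_hybrid.py

client = OpenAI(
    base_url=f"{tunnel_url}/v1",
    api_key="ollama",  # Ollama ignores the key, but the client requires a non-empty value
)

completion = client.chat.completions.create(
    model="gemma4:latest",  # model tag taken from core_logic_hybrid.py; it must exist in the local Ollama
    messages=[{"role": "user", "content": "ping"}],
    stream=True,
)
for chunk in completion:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)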