kshitijthakkar committed
Commit 1a4f599 · 1 Parent(s): c6f336e

initial working code

Files changed (6)
  1. .gitignore +187 -0
  2. Dockerfile +33 -0
  3. app.py +611 -0
  4. enhanced_app.py +745 -0
  5. model_handler.py +434 -0
  6. requirements.txt +0 -0
.gitignore ADDED
@@ -0,0 +1,187 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+ .sh
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # Ruff stuff:
171
+ .ruff_cache/
172
+
173
+ # PyPI configuration file
174
+ .pypirc
175
+
176
+ .xml
177
+ .png
178
+ .pdf
179
+ .pptx
180
+ .zip
181
+ .log
182
+ .gradio
183
+ .idea
184
+ *conversation_logs/
185
+ push_to_hub.sh
186
+ init_repos.sh
187
+ generated_images/
Dockerfile ADDED
@@ -0,0 +1,33 @@
1
+ # Dockerfile for a Python application with user permissions
2
+ FROM python:3.11-slim
3
+
4
+ # Install system dependencies as root
5
+ RUN apt-get update && apt-get install -y build-essential && apt-get install -y curl && rm -rf /var/lib/apt/lists/*
6
+
7
+ # Create user and set up directory structure as root
8
+ RUN useradd -m -u 1000 user && \
9
+ mkdir -p /app && \
10
+ chown -R user:user /app
11
+
12
+ # Set working directory
13
+ WORKDIR /app
14
+
15
+ # Switch to user AFTER setting up permissions
16
+ USER user
17
+ ENV PATH="/home/user/.local/bin:$PATH"
18
+
19
+ # Copy files with proper ownership
20
+ COPY --chown=user:user . /app
21
+
22
+ # Install Python dependencies
23
+ COPY --chown=user:user ./requirements.txt requirements.txt
24
+ RUN pip install --no-cache-dir --upgrade pip && \
25
+ pip install --no-cache-dir --user -r requirements.txt
26
+
27
+ # Make run.sh executable
28
+ RUN chmod +x run.sh
29
+ EXPOSE 8000 7860
30
+ # Run the startup script
31
+ #CMD ["sh", "-c", "bash run.sh"]
32
+ #CMD bash -c "python /app/mcp_server.py & sleep 60 && python /app/app.py"
33
+ CMD bash -c "python /app/enhanced_app.py"
app.py ADDED
@@ -0,0 +1,611 @@
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from datasets import load_dataset
4
+ import plotly.graph_objects as go
5
+ import datetime
6
+ import json
7
+ import random
8
+ import os
9
+ from model_handler import generate_response, get_inference_configs
10
+ import torch
11
+
12
+ # Configuration for datasets
13
+ DATASET_CONFIGS = {
14
+ 'Loggenix Synthetic AI Tasks Eval (with outputs)': {
15
+ 'repo_id': 'kshitijthakkar/loggenix-synthetic-ai-tasks-eval-with-outputs',
16
+ 'split': 'train'
17
+ },
18
+ 'Loggenix Synthetic AI Tasks Eval (with outputs) v5': {
19
+ 'repo_id': 'kshitijthakkar/loggenix-synthetic-ai-tasks-eval_v5-with-outputs',
20
+ 'split': 'train'
21
+ }
22
+ }
23
+
24
+
25
+ # Load main dataset for inference tab
26
+ def load_inference_dataset():
27
+ """Load the main dataset for inference use case"""
28
+ try:
29
+ print("Loading synthetic-ai-tasks-eval-v5 dataset...")
30
+ dataset = load_dataset(
31
+ 'kshitijthakkar/synthetic-ai-tasks-eval-v5',
32
+ split='train',
33
+ trust_remote_code=True
34
+ )
35
+ df = dataset.to_pandas()
36
+ print(f"✓ Successfully loaded: {len(df)} rows, {len(df.columns)} columns")
37
+ return df
38
+ except Exception as e:
39
+ print(f"✗ Error loading dataset: {str(e)}")
40
+ return pd.DataFrame({'Error': [f'Failed to load: {str(e)}']})
41
+
42
+
43
+ # Load dataset for eval samples tab
44
+ def load_eval_datasets():
45
+ """Load all datasets for evaluation samples"""
46
+ datasets = {}
47
+ for display_name, config in DATASET_CONFIGS.items():
48
+ try:
49
+ print(f"Loading {display_name}...")
50
+ dataset = load_dataset(
51
+ config['repo_id'],
52
+ split=config['split'],
53
+ trust_remote_code=True
54
+ )
55
+ df = dataset.to_pandas()
56
+ datasets[display_name] = df
57
+ print(f"✓ Successfully loaded {display_name}: {len(df)} rows")
58
+ except Exception as e:
59
+ print(f"✗ Error loading {display_name}: {str(e)}")
60
+ datasets[display_name] = pd.DataFrame({
61
+ 'Error': [f'Failed to load: {str(e)}'],
62
+ 'Dataset': [config['repo_id']]
63
+ })
64
+ return datasets
65
+
66
+
67
+ # Load datasets
68
+ INFERENCE_DATASET = load_inference_dataset()
69
+ EVAL_DATASETS = load_eval_datasets()
70
+
71
+
72
+ # ===== TAB 1: INFERENCE USE CASE =====
73
+
74
+ def get_task_types():
75
+ """Get unique task types from inference dataset"""
76
+ if 'task_type' in INFERENCE_DATASET.columns:
77
+ task_types = INFERENCE_DATASET['task_type'].unique().tolist()
78
+ return [str(t) for t in task_types if pd.notna(t)]
79
+ return ["No task types available"]
80
+
81
+
82
+ def get_task_by_type(task_type):
83
+ """Get task content by task type"""
84
+ if 'task_type' in INFERENCE_DATASET.columns and 'task' in INFERENCE_DATASET.columns:
85
+ filtered = INFERENCE_DATASET[INFERENCE_DATASET['task_type'] == task_type]
86
+ if len(filtered) > 0:
87
+ return str(filtered.iloc[0]['task'])
88
+ return "No task found for this type"
89
+
90
+
91
+ def run_inference(task_type, system_prompt, user_input, inference_config):
92
+ """Run model inference"""
93
+ if not user_input.strip():
94
+ return "Please enter a user input"
95
+
96
+ if not system_prompt.strip():
97
+ return "Please select a task type to load system prompt"
98
+
99
+ try:
100
+ # Get inference configuration
101
+ configs = get_inference_configs()
102
+ config = configs.get(inference_config, configs["Optimized for Speed"])
103
+
104
+ # Run inference
105
+ response = generate_response(
106
+ system_prompt=system_prompt,
107
+ user_input=user_input,
108
+ config_name=inference_config
109
+ )
110
+ return response
111
+ except Exception as e:
112
+ return f"Error during inference: {str(e)}"
113
+
114
+
115
+ # ===== TAB 2: EVAL SAMPLES =====
116
+
117
+ def update_eval_table(dataset_name):
118
+ """Update eval table based on selected dataset"""
119
+ if dataset_name in EVAL_DATASETS:
120
+ return EVAL_DATASETS[dataset_name].head(100)
121
+ return pd.DataFrame()
122
+
123
+
124
+ def get_eval_dataset_info(dataset_name):
125
+ """Get info about selected eval dataset"""
126
+ if dataset_name in EVAL_DATASETS:
127
+ df = EVAL_DATASETS[dataset_name]
128
+ return f"""
129
+ **Dataset**: {dataset_name}
130
+ - **Rows**: {len(df):,}
131
+ - **Columns**: {len(df.columns)}
132
+ - **Column Names**: {', '.join(df.columns.tolist())}
133
+ """
134
+ return "No dataset selected"
135
+
136
+
137
+ # ===== TAB 3 & 4: FLAGGING FUNCTIONALITY =====
138
+
139
+ def generate_chart():
140
+ """Generate a sample Plotly chart"""
141
+ x = list(range(10))
142
+ y = [random.randint(1, 100) for _ in x]
143
+ fig = go.Figure()
144
+ fig.add_trace(go.Scatter(x=x, y=y, mode="lines+markers", name="Random Data"))
145
+ fig.update_layout(title="Sample Chart", xaxis_title="X-axis", yaxis_title="Y-axis")
146
+ return fig.to_html(full_html=False)
147
+
148
+
149
+ def chat_interface(prompt, history):
150
+ """Handle chat interface with history"""
151
+ if not prompt.strip():
152
+ return history, ""
153
+
154
+ history.append(("You", prompt))
155
+
156
+ try:
157
+ if "chart" in prompt.lower() or "graph" in prompt.lower():
158
+ response = generate_chart()
159
+ else:
160
+ response = f"This is a demo response to: {prompt}"
161
+
162
+ if isinstance(response, str):
163
+ formatted_response = f"**AI Assistant:**\n{response}"
164
+ history.append(("AI Assistant", formatted_response))
165
+ else:
166
+ history.append(("AI Assistant", response))
167
+ except Exception as e:
168
+ error_msg = f"**AI Assistant:**\nSorry, an error occurred: {str(e)}"
169
+ history.append(("AI Assistant", error_msg))
170
+
171
+ return history, ""
172
+
173
+
174
+ def flag_response(history, flagged_message, flag_reason):
175
+ """Flag a response"""
176
+ if not flagged_message or flagged_message == "No responses available":
177
+ return "Invalid message selection."
178
+
179
+ try:
180
+ flagged_index = int(flagged_message.split()[1][:-1])
181
+ if flagged_index >= len(history) or history[flagged_index][0] != "AI Assistant":
182
+ return "You can only flag assistant responses."
183
+
184
+ flagged_message_content = history[flagged_index][1]
185
+
186
+ log_entry = {
187
+ "timestamp": datetime.datetime.now().isoformat(),
188
+ "flag_reason": str(flag_reason),
189
+ "flagged_message": str(flagged_message_content),
190
+ "conversation_context": history,
191
+ }
192
+
193
+ os.makedirs("logs", exist_ok=True)
194
+ with open("logs/flagged_responses.log", "a") as f:
195
+ f.write(json.dumps(log_entry) + "\n")
196
+
197
+ return f"Response flagged successfully"
198
+ except Exception as e:
199
+ return f"Error flagging response: {str(e)}"
200
+
201
+
202
+ def get_assistant_responses(history):
203
+ """Get dropdown options for assistant responses"""
204
+ responses = [
205
+ f"Response {i}: {str(msg[1])[:50]}..."
206
+ for i, msg in enumerate(history)
207
+ if msg[0] == "AI Assistant"
208
+ ]
209
+
210
+ if not responses:
211
+ responses = ["No responses available"]
212
+
213
+ return gr.update(choices=responses, value=responses[0])
214
+
215
+
216
+ def display_selected_message(selected_index, history):
217
+ """Display the selected flagged message"""
218
+ if selected_index == "No responses available":
219
+ return "No responses available"
220
+
221
+ try:
222
+ flagged_index = int(selected_index.split()[1][:-1])
223
+ if flagged_index < len(history) and history[flagged_index][0] == "AI Assistant":
224
+ return history[flagged_index][1]
225
+ else:
226
+ return "Invalid selection."
227
+ except Exception as e:
228
+ return f"Error: {str(e)}"
229
+
230
+
231
+ def read_flagged_messages():
232
+ """Read flagged messages from log file"""
233
+ try:
234
+ if not os.path.exists("logs/flagged_responses.log"):
235
+ return pd.DataFrame()
236
+
237
+ with open("logs/flagged_responses.log", "r") as f:
238
+ flagged_messages = f.readlines()
239
+
240
+ if not flagged_messages:
241
+ return pd.DataFrame()
242
+
243
+ table_data = []
244
+ for entry in flagged_messages:
245
+ data = json.loads(entry)
246
+ table_data.append({
247
+ "Timestamp": data.get("timestamp", "N/A"),
248
+ "Flag Reason": data.get("flag_reason", "N/A"),
249
+ "Flagged Message": data.get("flagged_message", "N/A")[:100] + "...",
250
+ "Conversation Context": str(len(data.get("conversation_context", []))) + " messages"
251
+ })
252
+ return pd.DataFrame(table_data)
253
+ except Exception as e:
254
+ return pd.DataFrame({"Error": [f"Error reading flagged messages: {str(e)}"]})
255
+
256
+
257
+ def handle_row_select(evt: gr.SelectData):
258
+ """Handle row selection in flagged messages table"""
259
+ try:
260
+ if not os.path.exists("logs/flagged_responses.log"):
261
+ return []
262
+
263
+ with open("logs/flagged_responses.log", "r") as f:
264
+ flagged_messages_log = f.readlines()
265
+
266
+ if evt.index[0] < len(flagged_messages_log):
267
+ selected_entry = json.loads(flagged_messages_log[evt.index[0]])
268
+ conversation_context = selected_entry.get("conversation_context", [])
269
+ return conversation_context
270
+ return []
271
+ except Exception as e:
272
+ return [("System", f"Error loading conversation: {str(e)}")]
273
+
274
+
275
+ def clear_history():
276
+ """Clear chat history"""
277
+ return [], gr.update(choices=["No responses available"], value="No responses available")
278
+
279
+
280
+ # ===== MAIN INTERFACE =====
281
+
282
+ def create_interface():
283
+ with gr.Blocks(title="AI Tasks Evaluation Suite", theme=gr.themes.Soft()) as demo:
284
+ gr.Markdown("# 🤖 AI Tasks Evaluation Suite")
285
+ gr.Markdown("Comprehensive platform for AI model evaluation and testing")
286
+
287
+ with gr.Tabs():
288
+ # TAB 1: INFERENCE USE CASE
289
+ with gr.Tab("🚀 Inference Use Case"):
290
+ gr.Markdown("## Model Inference Testing")
291
+
292
+ with gr.Row():
293
+ with gr.Column(scale=1):
294
+ # Task type dropdown
295
+ task_type_dropdown = gr.Dropdown(
296
+ choices=get_task_types(),
297
+ value=get_task_types()[0] if get_task_types() else None,
298
+ label="Task Type",
299
+ info="Select task type to load system prompt"
300
+ )
301
+
302
+ # Inference configuration
303
+ inference_config = gr.Dropdown(
304
+ choices=list(get_inference_configs().keys()),
305
+ value="Optimized for Speed",
306
+ label="Inference Configuration",
307
+ info="Select inference optimization level"
308
+ )
309
+
310
+ with gr.Column(scale=2):
311
+ # System prompt (editable)
312
+ system_prompt = gr.Textbox(
313
+ label="System Prompt (Editable)",
314
+ lines=6,
315
+ max_lines=10,
316
+ placeholder="Select a task type to load system prompt...",
317
+ interactive=True
318
+ )
319
+
320
+ with gr.Row():
321
+ with gr.Column():
322
+ # User input
323
+ user_input = gr.Textbox(
324
+ label="User Input",
325
+ lines=4,
326
+ placeholder="Enter your input here...",
327
+ interactive=True
328
+ )
329
+
330
+ with gr.Column():
331
+ # Model response
332
+ model_response = gr.Textbox(
333
+ label="Model Response",
334
+ lines=8,
335
+ interactive=False
336
+ )
337
+
338
+ with gr.Row():
339
+ submit_btn = gr.Button("🔥 Run Inference", variant="primary", size="lg")
340
+ clear_btn = gr.Button("🗑️ Clear", variant="secondary")
341
+
342
+ # Event handlers for Tab 1
343
+ task_type_dropdown.change(
344
+ fn=get_task_by_type,
345
+ inputs=[task_type_dropdown],
346
+ outputs=[system_prompt]
347
+ )
348
+
349
+ submit_btn.click(
350
+ fn=run_inference,
351
+ inputs=[task_type_dropdown, system_prompt, user_input, inference_config],
352
+ outputs=[model_response]
353
+ )
354
+
355
+ clear_btn.click(
356
+ fn=lambda: ("", "", ""),
357
+ outputs=[system_prompt, user_input, model_response]
358
+ )
359
+
360
+ # TAB 2: EVAL SAMPLES
361
+ with gr.Tab("📊 Eval Samples"):
362
+ gr.Markdown("## Dataset Evaluation Samples")
363
+
364
+ with gr.Row():
365
+ with gr.Column(scale=1):
366
+ eval_dataset_dropdown = gr.Dropdown(
367
+ choices=list(EVAL_DATASETS.keys()),
368
+ value=list(EVAL_DATASETS.keys())[0] if EVAL_DATASETS else None,
369
+ label="Select Dataset",
370
+ info="Choose evaluation dataset to view"
371
+ )
372
+
373
+ eval_dataset_info = gr.Markdown(
374
+ get_eval_dataset_info(list(EVAL_DATASETS.keys())[0] if EVAL_DATASETS else "")
375
+ )
376
+
377
+ with gr.Row():
378
+ eval_table = gr.Dataframe(
379
+ value=update_eval_table(list(EVAL_DATASETS.keys())[0]) if EVAL_DATASETS else pd.DataFrame(),
380
+ label="Dataset Table",
381
+ max_height=800,
382
+ min_width=800,
383
+ interactive=False,
384
+ wrap=True,
385
+ show_fullscreen_button=True,
386
+ show_copy_button=True,
387
+ show_row_numbers=True,
388
+ show_search="filter",
389
+ )
390
+
391
+ # Event handlers for Tab 2
392
+ eval_dataset_dropdown.change(
393
+ fn=lambda x: (update_eval_table(x), get_eval_dataset_info(x)),
394
+ inputs=[eval_dataset_dropdown],
395
+ outputs=[eval_table, eval_dataset_info]
396
+ )
397
+
398
+ # TAB 3: FLAG RESPONSES
399
+ with gr.Tab("🚩 Flag Responses"):
400
+ gr.Markdown("## Chat Interface with Response Flagging")
401
+
402
+ with gr.Row():
403
+ with gr.Column():
404
+ chat_input = gr.Textbox(placeholder="Ask something...", label="Your Message")
405
+
406
+ with gr.Row():
407
+ chat_submit_btn = gr.Button("Send", variant="primary")
408
+ chat_clear_btn = gr.Button("Clear History", variant="secondary")
409
+
410
+ with gr.Column():
411
+ chat_display = gr.Chatbot(label="Chat History", height=400)
412
+ chat_history_state = gr.State([])
413
+
414
+ gr.Markdown("### Flag Response")
415
+ with gr.Row():
416
+ with gr.Column():
417
+ flagged_message_index = gr.Dropdown(
418
+ label="Select a response to flag",
419
+ choices=["No responses available"],
420
+ value="No responses available",
421
+ interactive=True
422
+ )
423
+
424
+ selected_message_display = gr.Textbox(
425
+ label="Selected Response",
426
+ interactive=False,
427
+ lines=4
428
+ )
429
+
430
+ with gr.Column():
431
+ flag_reason = gr.Textbox(
432
+ placeholder="Enter reason for flagging...",
433
+ label="Reason for Flagging"
434
+ )
435
+
436
+ flag_btn = gr.Button("Flag Response", variant="stop")
437
+ flag_output = gr.Textbox(label="Flagging Feedback", visible=True)
438
+
439
+ # Event handlers for Tab 3
440
+ chat_submit_btn.click(
441
+ chat_interface,
442
+ inputs=[chat_input, chat_history_state],
443
+ outputs=[chat_display, chat_input]
444
+ ).then(
445
+ get_assistant_responses,
446
+ inputs=[chat_history_state],
447
+ outputs=[flagged_message_index]
448
+ )
449
+
450
+ chat_clear_btn.click(
451
+ clear_history,
452
+ outputs=[chat_display, flagged_message_index]
453
+ )
454
+
455
+ flagged_message_index.change(
456
+ display_selected_message,
457
+ inputs=[flagged_message_index, chat_history_state],
458
+ outputs=[selected_message_display]
459
+ )
460
+
461
+ flag_btn.click(
462
+ flag_response,
463
+ inputs=[chat_history_state, flagged_message_index, flag_reason],
464
+ outputs=[flag_output]
465
+ )
466
+
467
+ # TAB 4: VIEW FLAGGED RESPONSES
468
+ with gr.Tab("👀 View Flagged Responses"):
469
+ gr.Markdown("## Review Flagged Responses")
470
+
471
+ with gr.Row():
472
+ with gr.Column():
473
+ flagged_messages_display = gr.Dataframe(
474
+ headers=["Timestamp", "Flag Reason", "Flagged Message", "Conversation Context"],
475
+ interactive=False,
476
+ max_height=400
477
+ )
478
+ refresh_btn = gr.Button("🔄 Refresh", variant="primary")
479
+
480
+ with gr.Column():
481
+ conversation_context_display = gr.Chatbot(
482
+ label="Conversation Context",
483
+ height=400
484
+ )
485
+
486
+ # Event handlers for Tab 4
487
+ flagged_messages_display.select(
488
+ handle_row_select,
489
+ outputs=[conversation_context_display]
490
+ )
491
+
492
+ refresh_btn.click(
493
+ read_flagged_messages,
494
+ outputs=[flagged_messages_display]
495
+ )
496
+
497
+ # TAB 5: MODEL EVAL RESULTS
498
+ with gr.Tab("📈 Model Eval Results"):
499
+ gr.Markdown("## Model Evaluation Results")
500
+ gr.Markdown("### 🚧 Coming Soon")
501
+ gr.Markdown(
502
+ "This section will display comprehensive model evaluation metrics, charts, and performance analysis.")
503
+
504
+ # Placeholder content
505
+ with gr.Row():
506
+ with gr.Column():
507
+ gr.Markdown("#### Evaluation Metrics")
508
+ gr.Markdown("- Accuracy scores")
509
+ gr.Markdown("- Performance benchmarks")
510
+ gr.Markdown("- Comparative analysis")
511
+
512
+ with gr.Column():
513
+ gr.Markdown("#### Visualization")
514
+ gr.Markdown("- Performance charts")
515
+ gr.Markdown("- Score distributions")
516
+ gr.Markdown("- Trend analysis")
517
+
518
+ # TAB 6: ABOUT
519
+ with gr.Tab("ℹ️ About"):
520
+ gr.Markdown("## About Loggenix MOE Model")
521
+
522
+ gr.Markdown("""
523
+ ### Model: `kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.2-finetuned-tool`
524
+
525
+ This is a fine-tuned Mixture of Experts (MOE) model designed for specialized AI tasks with tool calling capabilities.
526
+
527
+ #### Key Features:
528
+ - **Architecture**: MOE with 0.3B total parameters, 0.1B active parameters
529
+ - **Training**: Fine-tuned with learning rate 7e-5, batch size 16
530
+ - **Hardware**: Optimized for RTX 4090 GPU
531
+ - **Capabilities**: Tool calling, instruction following, task-specific responses
532
+
533
+ #### Model Specifications:
534
+ - **Total Parameters**: 0.3B
535
+ - **Active Parameters**: 0.1B
536
+ - **Context Length**: 4096 tokens
537
+ - **Precision**: FP16 for optimal performance
538
+ - **Flash Attention**: Supported for faster inference
539
+
540
+ #### Sample Inference Code:
541
+ ```python
542
+ from transformers import AutoModelForCausalLM, AutoTokenizer
543
+ import torch
544
+
545
+ # Load model and tokenizer
546
+ model_id = "kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.2-finetuned-tool"
547
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
548
+ model = AutoModelForCausalLM.from_pretrained(
549
+ model_id,
550
+ device_map="auto",
551
+ torch_dtype=torch.float16,
552
+ attn_implementation="flash_attention_2"
553
+ ).eval()
554
+
555
+ # Prepare messages
556
+ messages = [
557
+ {"role": "system", "content": "You are a helpful AI assistant."},
558
+ {"role": "user", "content": "Calculate 25 + 37"}
559
+ ]
560
+
561
+ # Format and generate
562
+ prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
563
+ inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
564
+
565
+ with torch.no_grad():
566
+ outputs = model.generate(
567
+ **inputs,
568
+ max_new_tokens=512,
569
+ do_sample=True,
570
+ temperature=0.7,
571
+ pad_token_id=tokenizer.pad_token_id
572
+ )
573
+
574
+ response = tokenizer.decode(outputs[0], skip_special_tokens=True)
575
+ print(response)
576
+ ```
577
+
578
+ #### Tool Calling Support:
579
+ The model supports structured tool calling for mathematical operations, data analysis, and other specialized tasks.
580
+
581
+ #### Performance Optimizations:
582
+ - **Speed Mode**: Max 512 new tokens for fast responses
583
+ - **Balanced Mode**: Max 2048 new tokens for comprehensive answers
584
+ - **Full Capacity**: Dynamic token allocation up to context limit
585
+
586
+ ---
587
+
588
+ **Developed by**: Kshitij Thakkar
589
+ **Version**: v6.2
590
+ **License**: Please check model repository for licensing details
591
+ """)
592
+
593
+ # Load initial data
594
+ demo.load(
595
+ fn=read_flagged_messages,
596
+ outputs=[flagged_messages_display]
597
+ )
598
+
599
+ return demo
600
+
601
+
602
+ # Launch the application
603
+ if __name__ == "__main__":
604
+ print("Starting AI Tasks Evaluation Suite...")
605
+ demo = create_interface()
606
+ demo.launch(
607
+ server_name="0.0.0.0",
608
+ server_port=7860,
609
+ share=False,
610
+ debug=True
611
+ )
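
app.py drives inference entirely through `model_handler.generate_response` and `get_inference_configs`. A minimal sketch of calling that path directly, outside Gradio, using the same keyword arguments `run_inference()` passes above (this snippet is not part of the commit; it assumes model_handler.py is on the import path and that the environment can actually load the quantized model, i.e. a CUDA GPU with bitsandbytes installed):

```python
# Sketch only (not part of this commit): exercise the same inference path app.py uses.
from model_handler import generate_response, get_inference_configs

if __name__ == "__main__":
    # Preset names defined in model_handler.INFERENCE_CONFIGS
    print("Available configs:", list(get_inference_configs().keys()))

    # Same keyword arguments run_inference() passes in app.py
    reply = generate_response(
        system_prompt="You are a helpful AI assistant.",
        user_input="Calculate 25 + 37",
        config_name="Optimized for Speed",
    )
    print(reply)
```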
enhanced_app.py ADDED
@@ -0,0 +1,745 @@
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from datasets import load_dataset
4
+ import plotly.graph_objects as go
5
+ import datetime
6
+ import json
7
+ import random
8
+ import os
9
+ from model_handler import generate_response, get_inference_configs
10
+ import torch
11
+
12
+ # Configuration for datasets
13
+ DATASET_CONFIGS = {
14
+ 'Loggenix Synthetic AI Tasks Eval (with outputs)-small': {
15
+ 'repo_id': 'kshitijthakkar/loggenix-synthetic-ai-tasks-eval-with-outputs',
16
+ 'split': 'train'
17
+ },
18
+ 'Loggenix Synthetic AI Tasks Eval (with outputs) v5-large': {
19
+ 'repo_id': 'kshitijthakkar/loggenix-synthetic-ai-tasks-eval_v5-with-outputs',
20
+ 'split': 'train'
21
+ },
22
+ 'Loggenix Synthetic AI Tasks Eval (with outputs) v6-large': {
23
+ 'repo_id': 'kshitijthakkar/loggenix-synthetic-ai-tasks-eval_v6-with-outputs',
24
+ 'split': 'train'
25
+ }
26
+ }
27
+
28
+
29
+ # Load main dataset for inference tab
30
+ def load_inference_dataset():
31
+ """Load the main dataset for inference use case"""
32
+ try:
33
+ print("Loading synthetic-ai-tasks-eval-v5 dataset...")
34
+ dataset = load_dataset(
35
+ 'kshitijthakkar/synthetic-ai-tasks-eval-v5',
36
+ split='train',
37
+ trust_remote_code=True
38
+ )
39
+ df = dataset.to_pandas()
40
+ print(f"✓ Successfully loaded: {len(df)} rows, {len(df.columns)} columns")
41
+ return df
42
+ except Exception as e:
43
+ print(f"✗ Error loading dataset: {str(e)}")
44
+ return pd.DataFrame({'Error': [f'Failed to load: {str(e)}']})
45
+
46
+
47
+ # Load dataset for eval samples tab
48
+ def load_eval_datasets():
49
+ """Load all datasets for evaluation samples"""
50
+ datasets = {}
51
+ for display_name, config in DATASET_CONFIGS.items():
52
+ try:
53
+ print(f"Loading {display_name}...")
54
+ dataset = load_dataset(
55
+ config['repo_id'],
56
+ split=config['split'],
57
+ trust_remote_code=True
58
+ )
59
+ df = dataset.to_pandas()
60
+ datasets[display_name] = df
61
+ print(f"✓ Successfully loaded {display_name}: {len(df)} rows")
62
+ except Exception as e:
63
+ print(f"✗ Error loading {display_name}: {str(e)}")
64
+ datasets[display_name] = pd.DataFrame({
65
+ 'Error': [f'Failed to load: {str(e)}'],
66
+ 'Dataset': [config['repo_id']]
67
+ })
68
+ return datasets
69
+
70
+
71
+ # Load datasets
72
+ INFERENCE_DATASET = load_inference_dataset()
73
+ EVAL_DATASETS = load_eval_datasets()
74
+
75
+
76
+ # ===== TAB 1: INFERENCE USE CASE WITH INTEGRATED FLAGGING =====
77
+
78
+ def get_task_types():
79
+ """Get unique task types from inference dataset"""
80
+ if 'task_type' in INFERENCE_DATASET.columns:
81
+ task_types = INFERENCE_DATASET['task_type'].unique().tolist()
82
+ return [str(t) for t in task_types if pd.notna(t)]
83
+ return ["No task types available"]
84
+
85
+
86
+ def get_task_by_type(task_type):
87
+ """Get task content by task type"""
88
+ if 'task_type' in INFERENCE_DATASET.columns and 'task' in INFERENCE_DATASET.columns:
89
+ filtered = INFERENCE_DATASET[INFERENCE_DATASET['task_type'] == task_type]
90
+ if len(filtered) > 0:
91
+ return str(filtered.iloc[0]['task'])
92
+ return "No task found for this type"
93
+
94
+
95
+ def chat_interface_with_inference(prompt, history, system_prompt, inference_config):
96
+ """Enhanced chat interface with model inference and history"""
97
+ if not prompt.strip():
98
+ return history, ""
99
+
100
+ # Add user message to history
101
+ history.append(("You", prompt))
102
+
103
+ try:
104
+ if not system_prompt.strip():
105
+ response = "Please select a task type to load system prompt first."
106
+ else:
107
+ # Get inference configuration
108
+ configs = get_inference_configs()
109
+ config = configs.get(inference_config, configs["Optimized for Speed"])
110
+
111
+ # Run inference using the model
112
+ response = generate_response(
113
+ system_prompt=system_prompt,
114
+ user_input=prompt,
115
+ config_name=inference_config
116
+ )
117
+
118
+ # Format and add AI response to history
119
+ formatted_response = f"**AI Assistant:**\n{response}"
120
+ history.append(("AI Assistant", formatted_response))
121
+
122
+ except Exception as e:
123
+ error_msg = f"**AI Assistant:**\nError during inference: {str(e)}"
124
+ history.append(("AI Assistant", error_msg))
125
+
126
+ return history, ""
127
+
128
+
129
+ def flag_response(history, flagged_message, flag_reason):
130
+ """Flag a response"""
131
+ if not flagged_message or flagged_message == "No responses available":
132
+ return "Invalid message selection."
133
+
134
+ try:
135
+ flagged_index = int(flagged_message.split()[1][:-1])
136
+ if flagged_index >= len(history) or history[flagged_index][0] != "AI Assistant":
137
+ return "You can only flag assistant responses."
138
+
139
+ flagged_message_content = history[flagged_index][1]
140
+
141
+ log_entry = {
142
+ "timestamp": datetime.datetime.now().isoformat(),
143
+ "flag_reason": str(flag_reason),
144
+ "flagged_message": str(flagged_message_content),
145
+ "conversation_context": history,
146
+ }
147
+
148
+ os.makedirs("logs", exist_ok=True)
149
+ with open("logs/flagged_responses.log", "a") as f:
150
+ f.write(json.dumps(log_entry) + "\n")
151
+
152
+ return f"Response flagged successfully: {flag_reason}"
153
+ except Exception as e:
154
+ return f"Error flagging response: {str(e)}"
155
+
156
+
157
+ def get_assistant_responses(history):
158
+ """Get dropdown options for assistant responses"""
159
+ responses = [
160
+ f"Response {i}: {str(msg[1])[:50]}..."
161
+ for i, msg in enumerate(history)
162
+ if msg[0] == "AI Assistant"
163
+ ]
164
+
165
+ if not responses:
166
+ responses = ["No responses available"]
167
+
168
+ return gr.update(choices=responses, value=responses[0] if responses else "No responses available")
169
+
170
+
171
+ def display_selected_message(selected_index, history):
172
+ """Display the selected flagged message"""
173
+ if selected_index == "No responses available":
174
+ return "No responses available"
175
+
176
+ try:
177
+ flagged_index = int(selected_index.split()[1][:-1])
178
+ if flagged_index < len(history) and history[flagged_index][0] == "AI Assistant":
179
+ return history[flagged_index][1]
180
+ else:
181
+ return "Invalid selection."
182
+ except Exception as e:
183
+ return f"Error: {str(e)}"
184
+
185
+
186
+ def clear_inference_history():
187
+ """Clear chat history for inference tab"""
188
+ return [], gr.update(choices=["No responses available"], value="No responses available")
189
+
190
+
191
+ # ===== TAB 2: EVAL SAMPLES =====
192
+
193
+ def update_eval_table(dataset_name):
194
+ """Update eval table based on selected dataset"""
195
+ if dataset_name in EVAL_DATASETS:
196
+ return EVAL_DATASETS[dataset_name].head(100)
197
+ return pd.DataFrame()
198
+
199
+
200
+ def get_eval_dataset_info(dataset_name):
201
+ """Get info about selected eval dataset"""
202
+ if dataset_name in EVAL_DATASETS:
203
+ df = EVAL_DATASETS[dataset_name]
204
+ return f"""
205
+ **Dataset**: {dataset_name}
206
+ - **Rows**: {len(df):,}
207
+ - **Columns**: {len(df.columns)}
208
+ - **Column Names**: {', '.join(df.columns.tolist())}
209
+ """
210
+ return "No dataset selected"
211
+
212
+
213
+ def get_task_types_for_eval(dataset_name):
214
+ """Get unique task types from selected eval dataset"""
215
+ if dataset_name in EVAL_DATASETS and 'task_type' in EVAL_DATASETS[dataset_name].columns:
216
+ task_types = EVAL_DATASETS[dataset_name]['task_type'].unique().tolist()
217
+ return [str(t) for t in task_types if pd.notna(t)]
218
+ return ["No task types available"]
219
+
220
+
221
+ def get_tasks_by_type_eval(dataset_name, task_type):
222
+ """Get tasks filtered by dataset and task type"""
223
+ if (dataset_name in EVAL_DATASETS and
224
+ 'task_type' in EVAL_DATASETS[dataset_name].columns and
225
+ 'task' in EVAL_DATASETS[dataset_name].columns):
226
+
227
+ filtered = EVAL_DATASETS[dataset_name][EVAL_DATASETS[dataset_name]['task_type'] == task_type]
228
+ if len(filtered) > 0:
229
+ # Create display options with index and truncated task content
230
+ tasks = []
231
+ for idx, row in filtered.iterrows():
232
+ task_preview = str(row['task'])[:100] + "..." if len(str(row['task'])) > 100 else str(row['task'])
233
+ tasks.append(f"Row {idx}: {task_preview}")
234
+ return tasks
235
+ return ["No tasks found"]
236
+
237
+
238
+ def get_selected_row_data(dataset_name, task_type, selected_task):
239
+ """Get all data for the selected row"""
240
+ if not selected_task or selected_task == "No tasks found":
241
+ return "", "", "", "", "", "", ""
242
+
243
+ try:
244
+ # Extract row index from selected_task
245
+ row_idx = int(selected_task.split("Row ")[1].split(":")[0])
246
+
247
+ if dataset_name in EVAL_DATASETS:
248
+ df = EVAL_DATASETS[dataset_name]
249
+ if row_idx in df.index:
250
+ row = df.loc[row_idx]
251
+
252
+ # Extract all fields with safe handling for missing columns
253
+ task = str(row.get('task', 'N/A'))
254
+ task_type_val = str(row.get('task_type', 'N/A'))
255
+ input_model = str(row.get('input_model', 'N/A'))
256
+ expected_response = str(row.get('expected_response', 'N/A'))
257
+ loggenix_output = str(row.get('loggenix_output', 'N/A'))
258
+ output_model = str(row.get('output_model', 'N/A'))
259
+ input_text = str(row.get('input', 'N/A'))
260
+
261
+
262
+ return task_type_val, input_model, output_model, task, input_text, expected_response, loggenix_output
263
+
264
+ except Exception as e:
265
+ return f"Error: {str(e)}", "", "", "", "", "", ""
266
+
267
+ return "", "", "", "", "", "", ""
268
+
269
+ # ===== TAB 3: VIEW FLAGGED RESPONSES =====
270
+
271
+ def read_flagged_messages():
272
+ """Read flagged messages from log file"""
273
+ try:
274
+ if not os.path.exists("logs/flagged_responses.log"):
275
+ return pd.DataFrame()
276
+
277
+ with open("logs/flagged_responses.log", "r") as f:
278
+ flagged_messages = f.readlines()
279
+
280
+ if not flagged_messages:
281
+ return pd.DataFrame()
282
+
283
+ table_data = []
284
+ for entry in flagged_messages:
285
+ data = json.loads(entry)
286
+ table_data.append({
287
+ "Timestamp": data.get("timestamp", "N/A"),
288
+ "Flag Reason": data.get("flag_reason", "N/A"),
289
+ "Flagged Message": data.get("flagged_message", "N/A")[:100] + "...",
290
+ "Conversation Context": str(len(data.get("conversation_context", []))) + " messages"
291
+ })
292
+ return pd.DataFrame(table_data)
293
+ except Exception as e:
294
+ return pd.DataFrame({"Error": [f"Error reading flagged messages: {str(e)}"]})
295
+
296
+
297
+ def handle_row_select(evt: gr.SelectData):
298
+ """Handle row selection in flagged messages table"""
299
+ try:
300
+ if not os.path.exists("logs/flagged_responses.log"):
301
+ return []
302
+
303
+ with open("logs/flagged_responses.log", "r") as f:
304
+ flagged_messages_log = f.readlines()
305
+
306
+ if evt.index[0] < len(flagged_messages_log):
307
+ selected_entry = json.loads(flagged_messages_log[evt.index[0]])
308
+ conversation_context = selected_entry.get("conversation_context", [])
309
+ return conversation_context
310
+ return []
311
+ except Exception as e:
312
+ return [("System", f"Error loading conversation: {str(e)}")]
313
+
314
+
315
+ # ===== MAIN INTERFACE =====
316
+
317
+ def create_interface():
318
+ with gr.Blocks(title="AI Tasks Evaluation Suite", theme=gr.themes.Soft()) as demo:
319
+ gr.Markdown("# 🤖 AI Tasks Evaluation Suite")
320
+ gr.Markdown("Comprehensive platform for AI model evaluation and testing")
321
+
322
+ with gr.Tabs():
323
+ # TAB 1: INFERENCE USE CASE WITH INTEGRATED FLAGGING
324
+ with gr.Tab("🚀 Inference Use Case"):
325
+ gr.Markdown("## Model Inference Testing with Response Flagging")
326
+
327
+ with gr.Row():
328
+ with gr.Column(scale=1):
329
+ # Task type dropdown
330
+ task_type_dropdown = gr.Dropdown(
331
+ choices=get_task_types(),
332
+ value=get_task_types()[0] if get_task_types() else None,
333
+ label="Task Type",
334
+ info="Select task type to load system prompt"
335
+ )
336
+
337
+ # Inference configuration
338
+ inference_config = gr.Dropdown(
339
+ choices=list(get_inference_configs().keys()),
340
+ value="Optimized for Speed",
341
+ label="Inference Configuration",
342
+ info="Select inference optimization level"
343
+ )
344
+
345
+ with gr.Column(scale=2):
346
+ # System prompt (editable)
347
+ system_prompt = gr.Textbox(
348
+ label="System Prompt (Editable)",
349
+ lines=6,
350
+ max_lines=10,
351
+ placeholder="Select a task type to load system prompt...",
352
+ interactive=True
353
+ )
354
+
355
+ # Chat interface section
356
+ gr.Markdown("### 💬 Chat Interface")
357
+ with gr.Row():
358
+ with gr.Column(scale=2):
359
+ # Chat display (replacing the old textbox)
360
+ chat_display = gr.Chatbot(label="Conversation History", height=400)
361
+ chat_history_state = gr.State([])
362
+
363
+ # Chat input
364
+ with gr.Row():
365
+ chat_input = gr.Textbox(
366
+ placeholder="Enter your message here...",
367
+ label="Your Message",
368
+ scale=4
369
+ )
370
+ send_btn = gr.Button("Send", variant="primary", scale=1)
371
+
372
+ with gr.Row():
373
+ clear_chat_btn = gr.Button("🗑️ Clear History", variant="secondary")
374
+
375
+ # Flagging section
376
+ with gr.Column(scale=1):
377
+ gr.Markdown("### 🚩 Flag Response")
378
+
379
+ flagged_message_index = gr.Dropdown(
380
+ label="Select a response to flag",
381
+ choices=["No responses available"],
382
+ value="No responses available",
383
+ interactive=True
384
+ )
385
+
386
+ selected_message_display = gr.Textbox(
387
+ label="Selected Response",
388
+ interactive=False,
389
+ lines=4,
390
+ max_lines=6
391
+ )
392
+
393
+ flag_reason = gr.Textbox(
394
+ placeholder="Enter reason for flagging...",
395
+ label="Reason for Flagging"
396
+ )
397
+
398
+ flag_btn = gr.Button("🚩 Flag Response", variant="stop")
399
+ flag_output = gr.Textbox(label="Flagging Status", visible=True, lines=2)
400
+
401
+ # Event handlers for Tab 1
402
+ task_type_dropdown.change(
403
+ fn=get_task_by_type,
404
+ inputs=[task_type_dropdown],
405
+ outputs=[system_prompt]
406
+ )
407
+
408
+ # Chat functionality
409
+ send_btn.click(
410
+ chat_interface_with_inference,
411
+ inputs=[chat_input, chat_history_state, system_prompt, inference_config],
412
+ outputs=[chat_display, chat_input]
413
+ ).then(
414
+ lambda x: x, # Update state
415
+ inputs=[chat_display],
416
+ outputs=[chat_history_state]
417
+ ).then(
418
+ get_assistant_responses,
419
+ inputs=[chat_history_state],
420
+ outputs=[flagged_message_index]
421
+ )
422
+
423
+ # Enter key support for chat input
424
+ chat_input.submit(
425
+ chat_interface_with_inference,
426
+ inputs=[chat_input, chat_history_state, system_prompt, inference_config],
427
+ outputs=[chat_display, chat_input]
428
+ ).then(
429
+ lambda x: x, # Update state
430
+ inputs=[chat_display],
431
+ outputs=[chat_history_state]
432
+ ).then(
433
+ get_assistant_responses,
434
+ inputs=[chat_history_state],
435
+ outputs=[flagged_message_index]
436
+ )
437
+
438
+ clear_chat_btn.click(
439
+ clear_inference_history,
440
+ outputs=[chat_display, flagged_message_index]
441
+ ).then(
442
+ lambda: [],
443
+ outputs=[chat_history_state]
444
+ )
445
+
446
+ # Flagging functionality
447
+ flagged_message_index.change(
448
+ display_selected_message,
449
+ inputs=[flagged_message_index, chat_history_state],
450
+ outputs=[selected_message_display]
451
+ )
452
+
453
+ flag_btn.click(
454
+ flag_response,
455
+ inputs=[chat_history_state, flagged_message_index, flag_reason],
456
+ outputs=[flag_output]
457
+ )
458
+
459
+ # TAB 2: EVAL SAMPLES
460
+ with gr.Tab("📊 Eval Samples"):
461
+ gr.Markdown("## Dataset Evaluation Samples")
462
+
463
+ with gr.Row():
464
+ with gr.Column(scale=1):
465
+ eval_dataset_dropdown = gr.Dropdown(
466
+ choices=list(EVAL_DATASETS.keys()),
467
+ value=list(EVAL_DATASETS.keys())[0] if EVAL_DATASETS else None,
468
+ label="Select Dataset",
469
+ info="Choose evaluation dataset to view"
470
+ )
471
+
472
+ eval_dataset_info = gr.Markdown(
473
+ get_eval_dataset_info(list(EVAL_DATASETS.keys())[0] if EVAL_DATASETS else "")
474
+ )
475
+
476
+ with gr.Row():
477
+ eval_table = gr.Dataframe(
478
+ value=update_eval_table(list(EVAL_DATASETS.keys())[0]) if EVAL_DATASETS else pd.DataFrame(),
479
+ label="Dataset Table",
480
+ max_height=800,
481
+ min_width=800,
482
+ interactive=True,
483
+ wrap=True,
484
+ show_fullscreen_button=True,
485
+ show_copy_button=True,
486
+ show_row_numbers=True,
487
+ show_search="search",
488
+ column_widths=["80px","80px","80px","150px","250px","250px","250px"]
489
+ )
490
+
491
+ # Event handlers for Tab 2
492
+ eval_dataset_dropdown.change(
493
+ fn=lambda x: (update_eval_table(x), get_eval_dataset_info(x)),
494
+ inputs=[eval_dataset_dropdown],
495
+ outputs=[eval_table, eval_dataset_info]
496
+ )
497
+ with gr.Tab("📊 Eval Samples 2"):
498
+ gr.Markdown("## Dataset Evaluation Samples")
499
+ gr.Markdown("Select dataset, task type, and specific task to view detailed information")
500
+
501
+ with gr.Row():
502
+ with gr.Column(scale=1):
503
+ eval_dataset_dropdown = gr.Dropdown(
504
+ choices=list(EVAL_DATASETS.keys()),
505
+ value=list(EVAL_DATASETS.keys())[0] if EVAL_DATASETS else None,
506
+ label="Select Dataset",
507
+ info="Choose evaluation dataset to view"
508
+ )
509
+
510
+ eval_task_type_dropdown = gr.Dropdown(
511
+ choices=[],
512
+ label="Select Task Type",
513
+ info="Choose task type from selected dataset"
514
+ )
515
+
516
+ eval_task_dropdown = gr.Dropdown(
517
+ choices=[],
518
+ label="Select Specific Task",
519
+ info="Choose specific task to view details"
520
+ )
521
+
522
+ with gr.Column(scale=1):
523
+ eval_dataset_info = gr.Markdown(
524
+ get_eval_dataset_info(list(EVAL_DATASETS.keys())[0] if EVAL_DATASETS else "")
525
+ )
526
+
527
+ # Task details section
528
+ gr.Markdown("### Task Details")
529
+
530
+ with gr.Row():
531
+ with gr.Column():
532
+ task_field = gr.Textbox(
533
+ label="Task",
534
+ lines=8,
535
+ max_lines=12,
536
+ interactive=False
537
+ )
538
+
539
+ task_type_field = gr.Textbox(
540
+ label="Task Type",
541
+ lines=1,
542
+ interactive=False
543
+ )
544
+
545
+ input_model_field = gr.Textbox(
546
+ label="input_model",
547
+ lines=1,
548
+ interactive=False
549
+ )
550
+
551
+ input_field = gr.Textbox(
552
+ label="input",
553
+ lines=8,
554
+ max_lines=12,
555
+ interactive=False
556
+ )
557
+ output_model_field = gr.Textbox(
558
+ label="output_model",
559
+ lines=1,
560
+ interactive=False
561
+ )
562
+
563
+ # Large text fields for outputs side by side
564
+ gr.Markdown("### Expected vs Actual Response Comparison")
565
+
566
+ with gr.Row():
567
+ expected_response_field = gr.Textbox(
568
+ label="Expected Response",
569
+ lines=30,
570
+ max_lines=40,
571
+ interactive=False
572
+ )
573
+
574
+ loggenix_output_field = gr.Textbox(
575
+ label="Loggenix Output",
576
+ lines=30,
577
+ max_lines=40,
578
+ interactive=False
579
+ )
580
+
581
+ # Event handlers for Tab 2
582
+ eval_dataset_dropdown.change(
583
+ fn=lambda x: (get_eval_dataset_info(x), get_task_types_for_eval(x), []),
584
+ inputs=[eval_dataset_dropdown],
585
+ outputs=[eval_dataset_info, eval_task_type_dropdown, eval_task_dropdown]
586
+ )
587
+
588
+ eval_task_type_dropdown.change(
589
+ fn=get_tasks_by_type_eval,
590
+ inputs=[eval_dataset_dropdown, eval_task_type_dropdown],
591
+ outputs=[eval_task_dropdown]
592
+ )
593
+
594
+ eval_task_dropdown.change(
595
+ fn=get_selected_row_data,
596
+ inputs=[eval_dataset_dropdown, eval_task_type_dropdown, eval_task_dropdown],
597
+ outputs=[task_type_field, input_model_field, output_model_field, task_field, input_field,
598
+ expected_response_field, loggenix_output_field]
599
+ )
600
+
601
+ # TAB 3: VIEW FLAGGED RESPONSES (RENAMED FROM TAB 4)
602
+ with gr.Tab("👀 View Flagged Responses"):
603
+ gr.Markdown("## Review Flagged Responses")
604
+
605
+ with gr.Row():
606
+ with gr.Column():
607
+ flagged_messages_display = gr.Dataframe(
608
+ headers=["Timestamp", "Flag Reason", "Flagged Message", "Conversation Context"],
609
+ interactive=False,
610
+ max_height=400
611
+ )
612
+ refresh_btn = gr.Button("🔄 Refresh", variant="primary")
613
+
614
+ with gr.Column():
615
+ conversation_context_display = gr.Chatbot(
616
+ label="Conversation Context",
617
+ height=400
618
+ )
619
+
620
+ # Event handlers for Tab 3
621
+ flagged_messages_display.select(
622
+ handle_row_select,
623
+ outputs=[conversation_context_display]
624
+ )
625
+
626
+ refresh_btn.click(
627
+ read_flagged_messages,
628
+ outputs=[flagged_messages_display]
629
+ )
630
+
631
+ # TAB 4: MODEL EVAL RESULTS (MOVED FROM TAB 5)
632
+ with gr.Tab("📈 Model Eval Results"):
633
+ gr.Markdown("## Model Evaluation Results")
634
+ gr.Markdown("### 🚧 Coming Soon")
635
+ gr.Markdown(
636
+ "This section will display comprehensive model evaluation metrics, charts, and performance analysis.")
637
+
638
+ # Placeholder content
639
+ with gr.Row():
640
+ with gr.Column():
641
+ gr.Markdown("#### Evaluation Metrics")
642
+ gr.Markdown("- Accuracy scores")
643
+ gr.Markdown("- Performance benchmarks")
644
+ gr.Markdown("- Comparative analysis")
645
+
646
+ with gr.Column():
647
+ gr.Markdown("#### Visualization")
648
+ gr.Markdown("- Performance charts")
649
+ gr.Markdown("- Score distributions")
650
+ gr.Markdown("- Trend analysis")
651
+
652
+ # TAB 5: ABOUT (MOVED FROM TAB 6)
653
+ with gr.Tab("ℹ️ About"):
654
+ gr.Markdown("## About Loggenix MOE Model")
655
+
656
+ gr.Markdown("""
657
+ ### Model: `kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.2-finetuned-tool`
658
+
659
+ This is a fine-tuned Mixture of Experts (MOE) model designed for specialized AI tasks with tool calling capabilities.
660
+
661
+ #### Key Features:
662
+ - **Architecture**: MOE with 0.3B total parameters, 0.1B active parameters
663
+ - **Training**: Fine-tuned with learning rate 7e-5, batch size 16
664
+ - **Hardware**: Optimized for RTX 4090 GPU
665
+ - **Capabilities**: Tool calling, instruction following, task-specific responses
666
+
667
+ #### Model Specifications:
668
+ - **Total Parameters**: 0.3B
669
+ - **Active Parameters**: 0.1B
670
+ - **Context Length**: 4096 tokens
671
+ - **Precision**: FP16 for optimal performance
672
+ - **Flash Attention**: Supported for faster inference
673
+
674
+ #### Sample Inference Code:
675
+ ```python
676
+ from transformers import AutoModelForCausalLM, AutoTokenizer
677
+ import torch
678
+
679
+ # Load model and tokenizer
680
+ model_id = "kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.2-finetuned-tool"
681
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
682
+ model = AutoModelForCausalLM.from_pretrained(
683
+ model_id,
684
+ device_map="auto",
685
+ torch_dtype=torch.float16,
686
+ attn_implementation="flash_attention_2"
687
+ ).eval()
688
+
689
+ # Prepare messages
690
+ messages = [
691
+ {"role": "system", "content": "You are a helpful AI assistant."},
692
+ {"role": "user", "content": "Calculate 25 + 37"}
693
+ ]
694
+
695
+ # Format and generate
696
+ prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
697
+ inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
698
+
699
+ with torch.no_grad():
700
+ outputs = model.generate(
701
+ **inputs,
702
+ max_new_tokens=512,
703
+ do_sample=True,
704
+ temperature=0.7,
705
+ pad_token_id=tokenizer.pad_token_id
706
+ )
707
+
708
+ response = tokenizer.decode(outputs[0], skip_special_tokens=True)
709
+ print(response)
710
+ ```
711
+
712
+ #### Tool Calling Support:
713
+ The model supports structured tool calling for mathematical operations, data analysis, and other specialized tasks.
714
+
715
+ #### Performance Optimizations:
716
+ - **Speed Mode**: Max 512 new tokens for fast responses
717
+ - **Balanced Mode**: Max 2048 new tokens for comprehensive answers
718
+ - **Full Capacity**: Dynamic token allocation up to context limit
719
+
720
+ ---
721
+
722
+ **Developed by**: Kshitij Thakkar
723
+ **Version**: v6.2
724
+ **License**: Please check model repository for licensing details
725
+ """)
726
+
727
+ # Load initial data
728
+ demo.load(
729
+ fn=read_flagged_messages,
730
+ outputs=[flagged_messages_display]
731
+ )
732
+
733
+ return demo
734
+
735
+
736
+ # Launch the application
737
+ if __name__ == "__main__":
738
+ print("Starting AI Tasks Evaluation Suite...")
739
+ demo = create_interface()
740
+ demo.launch(
741
+ server_name="0.0.0.0",
742
+ server_port=7860,
743
+ share=False,
744
+ debug=True
745
+ )
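
Both apps persist flags through `flag_response()`, which appends one JSON object per line to `logs/flagged_responses.log` with the keys `timestamp`, `flag_reason`, `flagged_message` and `conversation_context`. A small sketch for inspecting that log outside the UI (path and keys are taken from the code above; the script itself is illustrative and not part of the commit):

```python
# Sketch only (not part of this commit): read the JSONL log written by flag_response().
import json
from pathlib import Path

log_path = Path("logs/flagged_responses.log")
if log_path.exists():
    for line in log_path.read_text().splitlines():
        entry = json.loads(line)
        print(entry["timestamp"], "-", entry["flag_reason"])
        print("  flagged:", str(entry["flagged_message"])[:80])
else:
    print("No flagged responses recorded yet.")
```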
model_handler.py ADDED
@@ -0,0 +1,434 @@
1
+ import torch
2
+ import time
3
+ import gc
4
+ import json
5
+ import re
6
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
7
+ from typing import Dict, Any, Optional
8
+
9
+ # Performance optimizations
10
+ torch.backends.cudnn.benchmark = True
11
+ torch.backends.cuda.matmul.allow_tf32 = True
12
+ torch.backends.cudnn.allow_tf32 = True
13
+
14
+ # Global model and tokenizer variables
15
+ model = None
16
+ tokenizer = None
17
+ MODEL_ID = "kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.2-finetuned-tool"
18
+
19
+ # Inference configurations
20
+ INFERENCE_CONFIGS = {
21
+ "Optimized for Speed": {
22
+ "max_new_tokens_base": 512,
23
+ "max_new_tokens_cap": 512,
24
+ "min_tokens": 50,
25
+ "temperature": 0.7,
26
+ "top_p": 0.9,
27
+ "do_sample": True,
28
+ "use_cache": False,
29
+ "description": "Fast responses with limited output length"
30
+ },
31
+ "Middle-ground": {
32
+ "max_new_tokens_base": 2048,
33
+ "max_new_tokens_cap": 2048,
34
+ "min_tokens": 50,
35
+ "temperature": 0.7,
36
+ "top_p": 0.9,
37
+ "do_sample": True,
38
+ "use_cache": False,
39
+ "description": "Balanced performance and output quality"
40
+ },
41
+ "Full Capacity": {
42
+ "max_new_tokens_base": 4096,
43
+ "max_new_tokens_cap": 4096,
44
+ "min_tokens": 1,
45
+ "temperature": 0.7,
46
+ "top_p": 0.9,
47
+ "do_sample": True,
48
+ "use_cache": False,
49
+ "description": "Maximum output length with dynamic allocation"
50
+ }
51
+ }
52
+
53
+
54
+ def get_inference_configs():
55
+ """Get available inference configurations"""
56
+ return INFERENCE_CONFIGS
+
+
+ def load_model():
+     """Load model and tokenizer with optimizations"""
+     global model, tokenizer
+
+     if model is not None and tokenizer is not None:
+         return model, tokenizer
+
+     print("Loading tokenizer...")
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+     # Load 8-bit quantized weights
+     quantization_config = BitsAndBytesConfig(
+         load_in_8bit=True,
+         llm_int8_threshold=6.0,
+         llm_int8_has_fp16_weight=False,
+     )
+     # Or 4-bit for even more memory savings:
+     # quantization_config = BitsAndBytesConfig(
+     #     load_in_4bit=True,
+     #     bnb_4bit_compute_dtype=torch.float16,
+     #     bnb_4bit_quant_type="nf4",
+     #     bnb_4bit_use_double_quant=True,
+     # )
+
+     # FlashAttention-2 requires the flash-attn package; checking torch.nn for
+     # scaled_dot_product_attention (which actually lives in torch.nn.functional)
+     # says nothing about that. Fall back to PyTorch's built-in SDPA otherwise.
+     try:
+         import flash_attn  # noqa: F401
+         attn_implementation = "flash_attention_2"
+     except ImportError:
+         attn_implementation = "sdpa"
+
+     print("Loading model...")
+     model = AutoModelForCausalLM.from_pretrained(
+         MODEL_ID,
+         device_map="auto",
+         dtype=torch.float16,  # Half precision for speed (older transformers releases call this `torch_dtype`)
+         attn_implementation=attn_implementation,
+         use_cache=True,
+         quantization_config=quantization_config,
+     ).eval()
+
+     # Gradient checkpointing only saves memory during training; under eval-mode
+     # inference with torch.no_grad() it is effectively a no-op.
+     if hasattr(model, 'gradient_checkpointing_enable'):
+         model.gradient_checkpointing_enable()
+
+     # Ensure a pad token exists so generation does not fall back with a warning
+     if tokenizer.pad_token_id is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     # Set pad_token_id
+     if model.config.pad_token_id is None and tokenizer.pad_token_id is not None:
+         model.config.pad_token_id = tokenizer.pad_token_id
+
+     # Set padding side to left for better batching
+     tokenizer.padding_side = "left"
+
+     memory = model.get_memory_footprint() / 1e6
+     print(f"Memory footprint: {memory:,.1f} MB")
+
+     return model, tokenizer
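+ # Example (illustrative sketch): load_model() is idempotent, so callers can
+ # invoke it eagerly at startup and again before each request without paying
+ # the load cost twice:
+ #
+ #     model, tokenizer = load_model()   # first call loads the quantized weights
+ #     model, tokenizer = load_model()   # later calls return the cached objects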
+
+
+ # ===== TOOL DEFINITIONS =====
+
+ def calculate_numbers(operation: str, num1: float, num2: float) -> Dict[str, Any]:
+     """
+     Sample tool to perform basic mathematical operations on two numbers.
+
+     Args:
+         operation: The operation to perform ('add', 'subtract', 'multiply', 'divide')
+         num1: First number
+         num2: Second number
+
+     Returns:
+         Dictionary with result and operation details
+     """
+     try:
+         num1, num2 = float(num1), float(num2)
+
+         if operation.lower() == 'add':
+             result = num1 + num2
+         elif operation.lower() == 'subtract':
+             result = num1 - num2
+         elif operation.lower() == 'multiply':
+             result = num1 * num2
+         elif operation.lower() == 'divide':
+             if num2 == 0:
+                 return {"error": "Division by zero is not allowed"}
+             result = num1 / num2
+         else:
+             return {"error": f"Unknown operation: {operation}"}
+
+         return {
+             "result": result,
+             "operation": operation,
+             "operands": [num1, num2],
+             "formatted": f"{num1} {operation} {num2} = {result}"
+         }
+     except ValueError as e:
+         return {"error": f"Invalid number format: {str(e)}"}
+     except Exception as e:
+         return {"error": f"Calculation error: {str(e)}"}
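+ # Example (illustrative sketch) of the tool's return shape:
+ #
+ #     calculate_numbers("divide", 10, 4)
+ #     # -> {"result": 2.5, "operation": "divide", "operands": [10.0, 4.0],
+ #     #     "formatted": "10.0 divide 4.0 = 2.5"}
+ #     calculate_numbers("divide", 1, 0)
+ #     # -> {"error": "Division by zero is not allowed"}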
+
+
+ # Tool registry
+ AVAILABLE_TOOLS = {
+     "calculate_numbers": {
+         "function": calculate_numbers,
+         "description": "Perform basic mathematical operations (add, subtract, multiply, divide) on two numbers",
+         "parameters": {
+             "operation": "The mathematical operation to perform",
+             "num1": "First number",
+             "num2": "Second number"
+         }
+     }
+ }
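+ # A new tool can be registered by adding an entry with the same shape.
+ # Minimal sketch using a hypothetical echo_text tool (not part of this module):
+ #
+ #     def echo_text(text: str) -> Dict[str, Any]:
+ #         return {"result": text, "formatted": text}
+ #
+ #     AVAILABLE_TOOLS["echo_text"] = {
+ #         "function": echo_text,
+ #         "description": "Echo the given text back to the caller",
+ #         "parameters": {"text": "The text to echo"},
+ #     }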
+
+
+ def execute_tool_call(tool_name: str, **kwargs) -> Dict[str, Any]:
+     """Execute a tool call with given parameters"""
+     print(f"Executing tool: {tool_name} with parameters: {kwargs}")
+     if tool_name not in AVAILABLE_TOOLS:
+         return {"error": f"Unknown tool: {tool_name}"}
+
+     try:
+         tool_function = AVAILABLE_TOOLS[tool_name]["function"]
+         result = tool_function(**kwargs)
+         return {
+             "tool_name": tool_name,
+             "parameters": kwargs,
+             "result": result
+         }
+     except Exception as e:
+         print(f"Tool execution failed: {str(e)}")
+         return {
+             "tool_name": tool_name,
+             "parameters": kwargs,
+             "error": f"Tool execution error: {str(e)}"
+         }
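+ # Example (illustrative sketch) of the wrapper's return shape:
+ #
+ #     execute_tool_call("calculate_numbers", operation="add", num1="125", num2="675")
+ #     # -> {"tool_name": "calculate_numbers",
+ #     #     "parameters": {"operation": "add", "num1": "125", "num2": "675"},
+ #     #     "result": {"result": 800.0, ..., "formatted": "125.0 add 675.0 = 800.0"}}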
+
+
+ def parse_tool_calls(text: str) -> list:
+     """
+     Parse tool calls from model output.
+     Supports both formats:
+     - [TOOL_CALL:tool_name(param1=value1, param2=value2)]
+     - <tool_call>{"name": "tool_name", "parameters": {"param1": "value1", "param2": "value2"}}</tool_call>
+     """
+     tool_calls = []
+
+     # One pattern covering both formats. Parameters are assumed to be flat
+     # key/value pairs; nested objects or values containing commas are not supported.
+     pattern = r'(\[TOOL_CALL:(\w+)\((.*?)\)\]|<tool_call>\s*{"name":\s*"(\w+)",\s*"parameters":\s*{([^}]*)}\s*}\s*</tool_call>)'
+     matches = re.findall(pattern, text)
+     print("Raw matches:", matches)
+
+     for match in matches:
+         full_match, old_tool_name, old_params, json_tool_name, json_params = match
+
+         # Determine which format was matched
+         if old_tool_name:  # Old format: [TOOL_CALL:tool_name(params)]
+             tool_name = old_tool_name
+             params_str = old_params
+             original_call = f"[TOOL_CALL:{tool_name}({params_str})]"
+
+             try:
+                 params = {}
+                 if params_str.strip():
+                     param_pairs = params_str.split(',')
+                     for pair in param_pairs:
+                         if '=' in pair:
+                             key, value = pair.split('=', 1)
+                             key = key.strip()
+                             value = value.strip().strip('"\'')  # Remove quotes
+                             params[key] = value
+
+                 tool_calls.append({
+                     "tool_name": tool_name,
+                     "parameters": params,
+                     "original_call": original_call
+                 })
+
+             except Exception as e:
+                 print(f"Error parsing old format tool call '{tool_name}({params_str})': {e}")
+                 continue
+
+         elif json_tool_name:  # JSON format: <tool_call>...</tool_call>
+             tool_name = json_tool_name
+             params_str = json_params
+             original_call = full_match
+
+             try:
+                 params = {}
+                 if params_str.strip():
+                     # Parse JSON-like parameters of the form:
+                     # "operation": "add", "num1": "125", "num2": "675"
+                     param_pairs = params_str.split(',')
+                     for pair in param_pairs:
+                         if ':' in pair:
+                             key, value = pair.split(':', 1)
+                             key = key.strip().strip('"\'')  # Remove quotes and whitespace
+                             value = value.strip().strip('"\'')  # Remove quotes and whitespace
+                             params[key] = value
+
+                 tool_calls.append({
+                     "tool_name": tool_name,
+                     "parameters": params,
+                     "original_call": original_call
+                 })
+
+             except Exception as e:
+                 print(f"Error parsing JSON format tool call '{tool_name}': {e}")
+                 continue
+
+     return tool_calls
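+ # Example (illustrative sketch) of both supported call formats:
+ #
+ #     parse_tool_calls('[TOOL_CALL:calculate_numbers(operation=add, num1=2, num2=3)]')
+ #     # -> [{"tool_name": "calculate_numbers",
+ #     #      "parameters": {"operation": "add", "num1": "2", "num2": "3"},
+ #     #      "original_call": "[TOOL_CALL:calculate_numbers(operation=add, num1=2, num2=3)]"}]
+ #
+ #     parse_tool_calls('<tool_call>{"name": "calculate_numbers", "parameters": {"operation": "add", "num1": "2", "num2": "3"}}</tool_call>')
+ #     # -> one entry with the same tool name and string-valued parameters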
+
+ def process_tool_calls(text: str) -> str:
+     """Process tool calls in the generated text and replace with results"""
+     tool_calls = parse_tool_calls(text)
+
+     if not tool_calls:
+         return text
+
+     processed_text = text
+
+     for tool_call in tool_calls:
+         tool_name = tool_call["tool_name"]
+         parameters = tool_call["parameters"]
+         original_call = tool_call["original_call"]
+
+         try:
+             # Validate parameters before execution
+             if not isinstance(parameters, dict):
+                 raise ValueError(f"Invalid parameters for tool {tool_name}: {parameters}")
+
+             # Execute tool
+             result = execute_tool_call(tool_name, **parameters)
+
+             # Create replacement text; the tool's own output may also carry an
+             # "error" key (e.g. division by zero), so check both levels
+             tool_output = result.get("result", {})
+             if "error" in result:
+                 replacement = f"[TOOL_ERROR: {result['error']}]"
+             elif isinstance(tool_output, dict) and "error" in tool_output:
+                 replacement = f"[TOOL_ERROR: {tool_output['error']}]"
+             elif isinstance(tool_output, dict) and "formatted" in tool_output:
+                 replacement = f"[TOOL_RESULT: {tool_output['formatted']}]"
+             else:
+                 replacement = f"[TOOL_RESULT: {tool_output}]"
+
+             # Replace tool call with result
+             processed_text = processed_text.replace(original_call, replacement)
+
+         except Exception as e:
+             print(f"Error processing tool call '{tool_name}': {e}")
+             replacement = f"[TOOL_ERROR: Failed to process tool call: {str(e)}]"
+             processed_text = processed_text.replace(original_call, replacement)
+
+     return processed_text
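+ # Example (illustrative sketch): a tool call in the model's output is replaced
+ # in place by its result:
+ #
+ #     process_tool_calls("The answer is [TOOL_CALL:calculate_numbers(operation=add, num1=2, num2=3)].")
+ #     # -> "The answer is [TOOL_RESULT: 2.0 add 3.0 = 5.0]."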
+
+ def monitor_memory():
+     """Print currently allocated and reserved GPU memory (no-op on CPU-only hosts)"""
+     if torch.cuda.is_available():
+         allocated = torch.cuda.memory_allocated() / 1e9
+         cached = torch.cuda.memory_reserved() / 1e9
+         print(f"GPU Memory - Allocated: {allocated:.2f}GB, Cached: {cached:.2f}GB")
+
+ def generate_response(system_prompt: str, user_input: str, config_name: str = "Middle-ground") -> str:
+     """
+     Run inference with the given task (system prompt) and user input using the specified config.
+     """
+     load_model()
+
+     config = INFERENCE_CONFIGS[config_name]
+
+     input_messages = [
+         {"role": "system", "content": system_prompt},
+         {"role": "user", "content": user_input}
+     ]
+
+     prompt_text = tokenizer.apply_chat_template(
+         input_messages,
+         tokenize=False,
+         add_generation_prompt=True
+     )
+
+     input_length = len(tokenizer.encode(prompt_text))
+     context_length = min(input_length, 3584)  # Leave room for generation
+
+     inputs = tokenizer(
+         prompt_text,
+         return_tensors="pt",
+         truncation=True,
+         max_length=context_length,
+         padding=False
+     ).to(model.device)
+
+     actual_input_length = inputs['input_ids'].shape[1]
+     max_new_tokens = min(config["max_new_tokens_cap"], 4096 - actual_input_length - 10)
+     max_new_tokens = max(config["min_tokens"], max_new_tokens)
+
+     with torch.no_grad():
+         start_time = time.time()
+         outputs = model.generate(
+             **inputs,
+             do_sample=config["do_sample"],
+             temperature=config["temperature"],
+             top_p=config["top_p"],
+             use_cache=config["use_cache"],
+             max_new_tokens=max_new_tokens,
+             pad_token_id=tokenizer.pad_token_id,
+             eos_token_id=tokenizer.eos_token_id,
+             # Memory optimizations
+             output_attentions=False,
+             output_hidden_states=False,
+             return_dict_in_generate=False,
+         )
+         inference_time = time.time() - start_time
+         print(f"Inference time: {inference_time:.2f} seconds")
+
+     memory = model.get_memory_footprint() / 1e6
+     monitor_memory()
+     print(f"Memory footprint: {memory:,.1f} MB")
+
+     # Clean up
+     gc.collect()
+
+     # Decode only the newly generated tokens. Searching the decoded text for
+     # prompt_text is unreliable because skip_special_tokens strips the chat
+     # template markers that prompt_text still contains.
+     generated_tokens = outputs[0][actual_input_length:]
+     generated_response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
+
+     # Safety net: if the decoded text still echoes the conversation, keep only
+     # the part after the last assistant/response indicator
+     response_indicators = ["Assistant:", "<|assistant|>", "[/INST]", "Response:"]
+     for indicator in response_indicators:
+         if indicator in generated_response:
+             generated_response = generated_response.split(indicator)[-1].strip()
+             break
+
+     # Process any tool calls in the generated response
+     generated_response = process_tool_calls(generated_response)
+     return generated_response
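+ # End-to-end example (illustrative sketch; assumes a CUDA GPU, bitsandbytes,
+ # and access to the model weights):
+ #
+ #     if __name__ == "__main__":
+ #         system = "You are a helpful assistant. Use tools when calculations are needed."
+ #         print(generate_response(system, "What is 125 plus 675?", config_name="Optimized for Speed"))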
requirements.txt ADDED
Binary file (3.03 kB). View file