sammoftah commited on
Commit
c812306
·
verified ·
1 Parent(s): eec140e

Deploy Benchmark Builder

Browse files
Files changed (6) hide show
  1. README.md +54 -5
  2. app.py +308 -0
  3. requirements.txt +2 -0
  4. shared/components.py +375 -0
  5. shared/styles.css +425 -0
  6. shared/utils.py +366 -0
README.md CHANGED
@@ -1,12 +1,61 @@
1
  ---
2
  title: Benchmark Builder
3
- emoji: 🚀
4
- colorFrom: green
5
- colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 6.13.0
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: Benchmark Builder
3
+ emoji: 📊
4
+ colorFrom: yellow
5
+ colorTo: blue
6
  sdk: gradio
 
7
  app_file: app.py
8
  pinned: false
9
+ license: mit
10
  ---
11
 
12
+ # Benchmark Builder
13
+
14
+ ## Question
15
+
16
+ How do we create small evaluation datasets without filling them with weak distractors?
17
+
18
+ ## System Boundary
19
+
20
+ This Space is an evaluation-data workbench for multiple-choice questions. It helps author questions, generate distractors, audit answer quality, and export the result.
21
+
22
+ ## Method
23
+
24
+ The app accepts a question, correct answer, subject, difficulty, and rationale. It uses Hugging Face inference when `HF_TOKEN` is available and a deterministic fallback otherwise. It then audits duplicate choices, answer leakage, length balance, and question-stem quality.
25
+
26
+ ## Technique
27
+
28
+ This is evaluation-set construction. The system treats each question as a data object with a correct answer, distractors, rationale, and quality checks.
29
+
30
+ The distractor audit is important because weak distractors inflate model scores and make a benchmark look easier than it is.
31
+
32
+ ## Output
33
+
34
+ The app returns a benchmark item preview, quality-check table, JSON, JSONL, or a Hugging Face Dataset push script.
35
+
36
+ ## Why It Matters
37
+
38
+ Evaluation quality is a bottleneck in LLM work. Small, inspectable benchmarks are often more useful than large opaque ones.
39
+
40
+ ## What To Notice
41
+
42
+ Good distractors should be plausible but wrong. If one option is obviously different in length, style, or vocabulary, the benchmark is leaking hints.
43
+
44
+ ## Effect In Practice
45
+
46
+ This workflow can help teams build focused evals for retrieval, domain knowledge, safety, or product-specific behavior before running model comparisons.
47
+
48
+ ## Hugging Face Extension
49
+
50
+ The generated examples can become a Hub Dataset with splits, dataset card, baseline model scores, and a Space leaderboard.
51
+
52
+ ## Limitations
53
+
54
+ Generated distractors still need human review. Real benchmarks should include calibration, held-out validation, model baselines, and documentation of dataset scope.
55
+
56
+ ## Run Locally
57
+
58
+ ```bash
59
+ pip install -r requirements.txt
60
+ python app.py
61
+ ```
app.py ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Benchmark Builder
3
+ Create small, auditable multiple-choice evaluation datasets for LLMs.
4
+ """
5
+
6
+ import json
7
+ import os
8
+ import re
9
+ import sys
10
+ from typing import Dict, List, Tuple
11
+
12
+ import gradio as gr
13
+
14
+ sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
15
+ from shared.components import create_footer, create_method_panel, create_premium_hero
16
+
17
+
18
+ try:
19
+ from huggingface_hub import InferenceClient
20
+ except Exception: # pragma: no cover - optional on local machines
21
+ InferenceClient = None
22
+
23
+
24
# Seed example: prefills the authoring form, the preview pane, and serves as
# the fallback export payload when the session state is empty.
SEED_QUESTIONS = [
    {
        "question": "Which retrieval signal is strongest when a user query uses rare exact terms?",
        "correct_answer": "BM25 lexical matching",
        "distractors": [
            "Random negative sampling",
            "Temperature scaling",
            "Decoder-only attention masking",
        ],
        "subject": "Information Retrieval",
        "difficulty": "Medium",
        "rationale": "BM25 rewards rare exact terms through inverse document frequency.",
    }
]
38
+
39
+ DOMAIN_DISTRACTORS = {
40
+ "Machine Learning": [
41
+ "dropout regularization",
42
+ "batch normalization",
43
+ "cosine learning-rate decay",
44
+ "gradient clipping",
45
+ "early stopping",
46
+ "teacher forcing",
47
+ ],
48
+ "Information Retrieval": [
49
+ "dense vector search",
50
+ "query expansion",
51
+ "reciprocal rank fusion",
52
+ "cross-encoder reranking",
53
+ "metadata filtering",
54
+ "semantic chunking",
55
+ ],
56
+ "AI Safety": [
57
+ "output filtering",
58
+ "least-privilege tool access",
59
+ "prompt isolation",
60
+ "red-team evaluation",
61
+ "policy classification",
62
+ "adversarial testing",
63
+ ],
64
+ "Data Engineering": [
65
+ "schema validation",
66
+ "deduplication",
67
+ "entity resolution",
68
+ "partition pruning",
69
+ "incremental backfills",
70
+ "lineage tracking",
71
+ ],
72
+ }
73
+
74
+
75
+ def _hf_generate(prompt: str) -> List[str]:
76
+ """Use HF Inference when configured; otherwise return an empty list."""
77
+ token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
78
+ if not token or InferenceClient is None:
79
+ return []
80
+
81
+ client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=token)
82
+ response = client.text_generation(prompt, max_new_tokens=220, temperature=0.35)
83
+ match = re.search(r"\[[\s\S]*\]", response)
84
+ if not match:
85
+ return []
86
+ try:
87
+ parsed = json.loads(match.group(0))
88
+ except json.JSONDecodeError:
89
+ return []
90
+ return [str(item).strip() for item in parsed if str(item).strip()][:3]
91
+
92
+
93
+ def _fallback_distractors(question: str, correct_answer: str, subject: str) -> List[str]:
94
+ pool = DOMAIN_DISTRACTORS.get(subject, DOMAIN_DISTRACTORS["Machine Learning"])
95
+ answer_terms = {term.lower() for term in re.findall(r"[A-Za-z][A-Za-z\-]+", correct_answer)}
96
+ chosen = []
97
+ for candidate in pool:
98
+ candidate_terms = {term.lower() for term in re.findall(r"[A-Za-z][A-Za-z\-]+", candidate)}
99
+ if candidate.lower() != correct_answer.lower() and not answer_terms.intersection(candidate_terms):
100
+ chosen.append(candidate)
101
+ if len(chosen) == 3:
102
+ break
103
+
104
+ if len(chosen) < 3:
105
+ chosen.extend(["a plausible baseline method", "a purely random heuristic", "a manual review process"])
106
+
107
+ if re.search(r"\bwhich\b|\bwhat\b", question.lower()):
108
+ return [item[:1].upper() + item[1:] for item in chosen[:3]]
109
+ return chosen[:3]
110
+
111
+
112
def generate_distractors(question: str, correct_answer: str, subject: str, difficulty: str) -> Tuple[List[str], str]:
    """Return (three distractors, source label).

    Prefers the HF Inference model; when it does not return exactly three
    candidates, falls back to the deterministic local heuristic.
    """
    prompt = f"""
You create benchmark-quality multiple-choice distractors.
Return only a JSON array of exactly 3 short wrong answers.
Subject: {subject}
Difficulty: {difficulty}
Question: {question}
Correct answer: {correct_answer}
Distractors must be plausible, mutually distinct, and not reveal the answer.
"""
    candidates = _hf_generate(prompt)
    if len(candidates) == 3:
        return candidates, "HF Inference model"
    return _fallback_distractors(question, correct_answer, subject), "deterministic local heuristic"
126
+
127
+
128
def audit_question(question: str, correct_answer: str, distractors: List[str]) -> List[Dict[str, str]]:
    """Run four heuristic quality checks on one multiple-choice item.

    Returns a list of dicts with keys "check", "result" ("pass"/"review"),
    and "detail", covering: duplicate options, answer-term leakage into
    distractors, length balance across options, and question-stem quality.
    """
    choices = [correct_answer] + distractors
    normalized = [choice.strip().lower() for choice in choices]
    unique = len(set(normalized)) == len(normalized)

    def _terms(text: str) -> set:
        # Only words of 4+ letters count as "key" terms.
        return set(re.findall(r"[a-zA-Z]{4,}", text.lower()))

    key_terms = _terms(correct_answer)
    leaked = any(key_terms & _terms(distractor) for distractor in distractors)

    sizes = [len(choice) for choice in choices]
    # Allow a spread up to the answer's own length (minimum 18 chars).
    balanced = max(sizes) - min(sizes) <= max(18, len(correct_answer))

    stem = question.strip()
    stem_ok = len(stem) >= 24 and stem.endswith("?")

    verdicts = [
        ("Duplicate options", unique,
         "All answer choices are unique.", "At least two choices are identical."),
        ("Answer leakage", not leaked,
         "Distractors avoid obvious answer words.", "A distractor shares key answer terms."),
        ("Length balance", balanced,
         "Choices have comparable length.", "One option is much longer or shorter than the rest."),
        ("Question stem", stem_ok,
         "Stem is specific and phrased as a question.", "Make the stem more specific and end it with a question mark."),
    ]
    return [
        {"check": name, "result": "pass" if ok else "review", "detail": good if ok else bad}
        for name, ok, good, bad in verdicts
    ]
162
+
163
+
164
def render_question(question_data: Dict[str, object]) -> str:
    """Render one benchmark item as an HTML card.

    Fixes:
    - All user-supplied fields are HTML-escaped so a question or option
      containing markup cannot inject content into the preview pane.
    - The correct answer is always option index 0, so the "answer" highlight
      is keyed on position rather than string equality — a distractor that
      happens to equal the answer text is no longer double-highlighted.
    """
    import html  # local import: app.py does not import html at module level

    letters = "ABCD"
    options = [question_data["correct_answer"]] + question_data["distractors"]
    option_html = ""
    for idx, option in enumerate(options):
        is_answer = idx == 0
        accent = "#22c55e" if is_answer else "#e5e7eb"
        tag = '<span style="float:right; color:#15803d; font-weight:800;">answer</span>' if is_answer else ""
        option_html += f"""
    <div class="info-card" style="margin:0.55rem 0; border-left:4px solid {accent} !important;">
        <strong>{letters[idx]}.</strong> {html.escape(str(option))}
        {tag}
    </div>
    """

    return f"""
<div class="info-card">
    <p style="margin:0 0 0.4rem 0; color:#e8935c; font-weight:800;">{html.escape(str(question_data['subject']))} · {html.escape(str(question_data['difficulty']))}</p>
    <h3 style="margin-top:0;">{html.escape(str(question_data['question']))}</h3>
    {option_html}
    <p><strong>Rationale:</strong> {html.escape(str(question_data['rationale']))}</p>
    <p><strong>Distractor source:</strong> {html.escape(str(question_data['source']))}</p>
</div>
"""
186
+
187
+
188
def render_audit(checks: List[Dict[str, str]]) -> str:
    """Render audit check results as a markdown table."""
    header = ["| Check | Result | Detail |", "|---|---|---|"]
    body = [
        f"| {entry['check']} | {'Pass' if entry['result'] == 'pass' else 'Review'} | {entry['detail']} |"
        for entry in checks
    ]
    return "\n".join(header + body)
194
+
195
+
196
def add_question(
    question: str,
    correct_answer: str,
    subject: str,
    difficulty: str,
    rationale: str,
    state: List[Dict[str, object]],
):
    """Gradio handler: build one benchmark item, append it, render preview + audit.

    Returns (new state, status markdown, preview HTML, audit markdown). When
    the question or answer is missing, state is returned unchanged with a
    prompt to complete the form.
    """
    if not question or not correct_answer:
        return state, "Add a question and correct answer first.", "", ""

    distractors, source = generate_distractors(question, correct_answer, subject, difficulty)
    entry = {
        "question": question.strip(),
        "correct_answer": correct_answer.strip(),
        "distractors": distractors,
        "subject": subject,
        "difficulty": difficulty,
        "rationale": rationale.strip() or "Add a short rationale before publishing this benchmark.",
        "source": source,
    }
    updated = [*state, entry]
    audit = audit_question(entry["question"], entry["correct_answer"], entry["distractors"])
    message = f"Dataset now has {len(updated)} questions. Review flags before publishing."
    return updated, message, render_question(entry), render_audit(audit)
221
+
222
+
223
def export_benchmark(benchmark_name: str, state: List[Dict[str, object]], export_format: str) -> str:
    """Serialize the current question list as JSON, JSONL, or a Hub push script.

    Falls back to the seed questions when the session state is empty. The
    dataset name is sanitized to characters safe for a Hub repo id.
    """
    safe_name = re.sub(r"[^a-zA-Z0-9_\-/]", "-", benchmark_name.strip() or "my-eval-benchmark")
    rows = state or SEED_QUESTIONS

    if export_format == "JSONL":
        return "\n".join(json.dumps(row, ensure_ascii=False) for row in rows)

    if export_format == "HF Dataset Script":
        return (
            "from datasets import Dataset\n"
            "\n"
            f"rows = {json.dumps(rows, indent=2)}\n"
            "dataset = Dataset.from_list(rows)\n"
            f'dataset.push_to_hub("{safe_name}")\n'
        )

    # Default: a single JSON document with metadata.
    return json.dumps({"name": safe_name, "questions": rows, "total": len(rows)}, indent=2)
239
+
240
+
241
# ---------------------------------------------------------------------------
# UI wiring
# ---------------------------------------------------------------------------
with gr.Blocks(title="Benchmark Builder", theme=gr.themes.Soft()) as app:
    # Session-scoped question list, pre-seeded with one worked example.
    state = gr.State(SEED_QUESTIONS)

    create_premium_hero(
        "Benchmark Builder",
        "Design small, inspectable evaluation sets with plausible distractors, quality checks, and Hugging Face Dataset export.",
        "📊",
        badge="Evaluation Engineering",
        highlights=["Optional HF inference", "Distractor audit", "Dataset push script"],
    )
    create_method_panel({
        "Technique": "LLM-assisted benchmark authoring with deterministic guardrails.",
        "What it proves": "You understand evaluation data quality, not just prompt generation.",
        "HF capability": "Ready to publish as a Dataset and evaluate models on the Hub.",
    })

    with gr.Row():
        # Left column: authoring form.
        with gr.Column(scale=1):
            question_box = gr.Textbox(
                label="Question",
                value="Which retrieval signal is strongest when a user query uses rare exact terms?",
                lines=3,
            )
            answer_box = gr.Textbox(label="Correct answer", value="BM25 lexical matching")
            subject_dd = gr.Dropdown(
                choices=list(DOMAIN_DISTRACTORS.keys()),
                value="Information Retrieval",
                label="Subject",
            )
            difficulty_radio = gr.Radio(["Easy", "Medium", "Hard"], value="Medium", label="Difficulty")
            rationale_box = gr.Textbox(
                label="Rationale",
                value="BM25 rewards rare exact terms through inverse document frequency.",
                lines=2,
            )
            generate_btn = gr.Button("Generate Distractors + Audit", variant="primary")
        # Right column: live preview and audit of the latest item.
        with gr.Column(scale=1):
            status_md = gr.Markdown("Add a question to generate a benchmark-ready item.")
            preview_html = gr.HTML(render_question({**SEED_QUESTIONS[0], "source": "seed example"}))
            audit_md = gr.Markdown(render_audit(audit_question(
                SEED_QUESTIONS[0]["question"],
                SEED_QUESTIONS[0]["correct_answer"],
                SEED_QUESTIONS[0]["distractors"],
            )))

    generate_btn.click(
        add_question,
        inputs=[question_box, answer_box, subject_dd, difficulty_radio, rationale_box, state],
        outputs=[state, status_md, preview_html, audit_md],
    )

    gr.Markdown("## Export")
    with gr.Row():
        dataset_name_box = gr.Textbox(label="Hub dataset name", value="username/retrieval-mini-eval")
        format_dd = gr.Dropdown(["JSON", "JSONL", "HF Dataset Script"], value="JSON", label="Format")
        export_btn = gr.Button("Generate Export", variant="secondary")
    export_code = gr.Code(label="Benchmark artifact", language="python", lines=16)
    export_btn.click(export_benchmark, inputs=[dataset_name_box, state, format_dd], outputs=export_code)

    gr.Markdown("""
## Why This Is Useful

Evaluation sets fail quietly when distractors are weak, duplicated, or reveal the answer. This Space teaches a better workflow: generate candidates, audit them, keep rationales, and publish the result as a versioned Hugging Face Dataset.
""")
    create_footer("Benchmark Builder")


if __name__ == "__main__":
    # Bind to 0.0.0.0 so the server is reachable inside the Space container.
    app.launch(server_name="0.0.0.0", server_port=7860)
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio>=4.0.0
2
+ huggingface-hub>=0.25.0
shared/components.py ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HF-Master Shared Components
3
+ Reusable Gradio components for all projects
4
+ """
5
+
6
+ import html
7
+ from pathlib import Path
8
+ import gradio as gr
9
+ from typing import List, Tuple, Optional, Dict, Any
10
+
11
+
12
class SharedComponents:
    """Shared UI components for all HF-Master projects."""

    @staticmethod
    def _esc(value: Any) -> str:
        """Escape text before placing it inside shared HTML components."""
        return html.escape(str(value), quote=True)

    @staticmethod
    def _style_tag() -> str:
        """Inline the shared stylesheet for Gradio Spaces that do not pass css=."""
        stylesheet = Path(__file__).with_name("styles.css")
        try:
            return f"<style>{stylesheet.read_text(encoding='utf-8')}</style>"
        except OSError:
            # Render unstyled rather than fail when the CSS file is missing.
            return ""

    @staticmethod
    def create_header(title: str, description: str, emoji: str = "🚀") -> gr.Markdown:
        """Create standardized project header (delegates to the premium hero)."""
        return SharedComponents.create_premium_hero(
            title=title,
            description=description,
            emoji=emoji,
            badge="AI/ML Space",
            highlights=["Interactive demo", "Explainable workflow", "HF-ready"],
        )

    @staticmethod
    def create_footer(version: str = "1.0.0") -> gr.Markdown:
        """Create standardized project footer."""
        return gr.Markdown(f"""
<div class="hf-footer">
    <strong>{SharedComponents._esc(version)}</strong> · Built as a practical AI/ML learning Space for the Hugging Face community.
</div>
""")

    @staticmethod
    def create_premium_hero(
        title: str,
        description: str,
        emoji: str = "🚀",
        badge: str = "Featured Space",
        highlights: Optional[List[str]] = None,
    ) -> gr.HTML:
        """Create a richer landing-page hero for Spaces."""
        escape = SharedComponents._esc
        chip_html = "".join(
            f'<span class="hf-chip">{escape(entry)}</span>'
            for entry in (highlights or [])
        )

        return gr.HTML(f"""
{SharedComponents._style_tag()}
<div class="hf-hero">
    <div class="hf-hero-grid">
        <div class="hf-hero-copy">
            <div class="hf-icon">{escape(emoji)}</div>
            <div>
                <div class="hf-badge">{escape(badge)}</div>
                <h1>{escape(title)}</h1>
                <p>{escape(description)}</p>
            </div>
        </div>
    </div>
    <div class="hf-chip-row">
        {chip_html}
    </div>
</div>
""")

    @staticmethod
    def create_method_panel(items: Dict[str, str]) -> gr.HTML:
        """Create a compact method/pipeline explainer panel."""
        escape = SharedComponents._esc
        card_html = "".join(
            f'<div class="hf-method-card"><span>{escape(label)}</span><p>{escape(text)}</p></div>'
            for label, text in items.items()
        )
        return gr.HTML(f"""{SharedComponents._style_tag()}<div class="hf-method-grid">{card_html}</div>""")

    @staticmethod
    def create_status_badge(status: str) -> str:
        """Map a project status string to a colored dot emoji (default ⚪)."""
        return {
            "complete": "🟢",
            "in-progress": "🟡",
            "planned": "⚪",
            "experimental": "🔴",
        }.get(status.lower(), "⚪")

    @staticmethod
    def create_project_card(
        title: str,
        description: str,
        tech_stack: List[str],
        difficulty: str,
        viral_potential: str
    ) -> str:
        """Create markdown project card."""
        tech_badges = " ".join(f"`{tech}`" for tech in tech_stack)

        return f"""
## {title}

{description}

**Tech Stack:** {tech_badges}

**Difficulty:** {difficulty} | **Viral Potential:** {viral_potential}
"""

    @staticmethod
    def create_risk_chart(risk_factors: Dict[str, float]) -> Any:
        """Create risk factor visualization (horizontal bars, 0-100 scale)."""
        import plotly.graph_objects as go  # local import: plotly is optional

        names = list(risk_factors.keys())
        percentages = [risk_factors[name] * 100 for name in names]

        fig = go.Figure(data=[
            go.Bar(
                x=percentages,
                y=[name.replace('_', ' ').title() for name in names],
                orientation='h',
                marker=dict(
                    color=percentages,
                    colorscale='RdYlGn_r',
                    cmin=0,
                    cmax=100,
                ),
            )
        ])

        fig.update_layout(
            title="Risk Factor Breakdown",
            xaxis_title="Risk Score",
            yaxis_title="Factor",
            height=400,
            template="plotly_white",
        )
        return fig

    @staticmethod
    def create_comparison_chart(items: List[Dict], keys: List[str]) -> Any:
        """Create grouped-bar comparison visualization across `keys`."""
        import plotly.graph_objects as go  # local import: plotly is optional

        fig = go.Figure()
        for index, item in enumerate(items):
            fig.add_trace(go.Bar(
                name=item.get('name', f'Item {index + 1}'),
                x=keys,
                y=[item.get(key, 0) for key in keys],
            ))

        fig.update_layout(barmode='group', height=400)
        return fig

    @staticmethod
    def create_metric_card(label: str, value: str, emoji: str = "📊") -> gr.Markdown:
        """Create metric display card."""
        return gr.Markdown(f"""
### {emoji} {label}

**{value}**
""")

    @staticmethod
    def create_error_display(error: str) -> gr.Markdown:
        """Create error message display."""
        return gr.Markdown(f"""
❌ **Error**

{error}
""")

    @staticmethod
    def create_success_display(message: str) -> gr.Markdown:
        """Create success message display."""
        return gr.Markdown(f"""
✅ **Success**

{message}
""")
212
+
213
+
214
class LoadingSpinner:
    """Loading state display."""

    @staticmethod
    def create_spinner(message: str = "Loading...") -> gr.Markdown:
        """Create a loading spinner with a custom headline message."""
        return gr.Markdown(f"""
⏳ **{message}**

_This may take a moment..._
""")

    @staticmethod
    def create_progress_bar(initial: float = 0) -> gr.Markdown:
        """Create a static progress display showing `initial` percent."""
        return gr.Markdown(f"""
░░░░░░░░░ **{initial}%**
""")
232
+
233
+
234
class TableFormatter:
    """Format data as tables."""

    @staticmethod
    def format_dict_table(data: Dict[str, Any], headers: List[str] = None) -> List:
        """Format a dictionary as table rows: [headers, [key, str(value)], ...]."""
        header_row = headers if headers else ["Key", "Value"]
        body = [[key, str(value)] for key, value in data.items()]
        return [header_row] + body

    @staticmethod
    def create_dataframe(data: List[Dict], columns: List[str] = None) -> List:
        """Create a dataframe-compatible structure: [headers, row, row, ...].

        Headers default to the first row's keys; missing cells become "".
        """
        if not data:
            return []
        header_row = columns if columns else list(data[0].keys())
        return [header_row] + [[entry.get(col, "") for col in header_row] for entry in data]
263
+
264
+
265
class CodeHighlighter:
    """Code display and highlighting."""

    @staticmethod
    def create_code_display(code: str, language: str = "python") -> gr.Code:
        """Create a 20-line code display block."""
        return gr.Code(value=code, language=language, lines=20)

    @staticmethod
    def create_copy_button(code: str) -> gr.Button:
        """Create a copy-to-clipboard button.

        NOTE(review): `code` is currently unused — the button carries no copy
        handler; event wiring is left to the caller. Confirm intended usage.
        """
        return gr.Button("📋 Copy Code")

    @staticmethod
    def create_diff_view(old_code: str, new_code: str) -> Tuple[gr.Code, gr.Code]:
        """Create a side-by-side before/after code view."""
        before = gr.Code(value=old_code, language="python", lines=15, label="Before")
        after = gr.Code(value=new_code, language="python", lines=15, label="After")
        return before, after
289
+
290
+
291
def create_header(title: str, description: str, emoji: str = "🚀") -> gr.Markdown:
    """Module-level convenience wrapper for SharedComponents.create_header."""
    return SharedComponents.create_header(title, description, emoji)


def create_footer(version: str = "1.0.0") -> gr.Markdown:
    """Module-level convenience wrapper for SharedComponents.create_footer."""
    return SharedComponents.create_footer(version)


def create_premium_hero(
    title: str,
    description: str,
    emoji: str = "🚀",
    badge: str = "Featured Space",
    highlights: Optional[List[str]] = None,
) -> gr.HTML:
    """Module-level convenience wrapper for SharedComponents.create_premium_hero."""
    return SharedComponents.create_premium_hero(title, description, emoji, badge, highlights)


def create_method_panel(items: Dict[str, str]) -> gr.HTML:
    """Module-level convenience wrapper for SharedComponents.create_method_panel."""
    return SharedComponents.create_method_panel(items)
311
+
312
+
313
class ProgressTracker:
    """Track multi-step progress through an ordered list of step names."""

    def __init__(self, steps: List[str]):
        self.steps = steps    # ordered step labels
        self.current = 0      # index of the step currently in progress

    def get_status(self) -> str:
        """Return a markdown progress report.

        Fix: every completed step is now prefixed with ✅ — previously only the
        first completed step got the check mark ("✅ " + "\\n".join(...)), and an
        empty completed list still rendered a dangling "✅ ".
        """
        completed = "\n".join(f"✅ {step}" for step in self.steps[:self.current])
        current = f"🔄 {self.steps[self.current]}" if self.current < len(self.steps) else ""
        remaining = "\n".join(f"⬜ {step}" for step in self.steps[self.current + 1:])

        return f"""
## Progress

{completed}
{current}
{remaining}
"""

    def advance(self) -> bool:
        """Move to the next step; return False when already past the last step."""
        if self.current < len(self.steps):
            self.current += 1
            return True
        return False

    def reset(self):
        """Reset progress to the first step."""
        self.current = 0
344
+
345
+
346
def create_tabbed_interface(tabs: Dict[str, Any]) -> gr.Blocks:
    """Create tabbed interface helper.

    NOTE(review): each `tab_content` value is referenced as a bare expression,
    which has no effect for pre-built components — confirm whether callers are
    expected to pass callables instead.
    """
    with gr.Blocks() as demo:
        with gr.Tabs():
            for tab_name, tab_content in tabs.items():
                with gr.Tab(tab_name):
                    tab_content
    return demo
355
+
356
+
357
def create_side_by_side(left_content: Any, right_content: Any) -> Tuple[gr.Column, gr.Column]:
    """Create a two-column side-by-side layout.

    NOTE(review): the content arguments appear as bare expressions inside the
    columns (a no-op for pre-built components) and are returned unchanged —
    confirm whether callers are expected to pass callables.
    """
    with gr.Row():
        with gr.Column():
            left_content
        with gr.Column():
            right_content

    return left_content, right_content
366
+
367
+
368
def create_accordion(items: List[Tuple[str, Any]]) -> gr.Accordion:
    """Create an accordion-style expandable section with titled entries.

    NOTE(review): `content` is referenced as a bare expression — a no-op for
    pre-built components; only the titles are actually rendered here.
    """
    with gr.Accordion("Click to expand") as accordion:
        for title, content in items:
            gr.Markdown(f"### {title}")
            content

    return accordion
shared/styles.css ADDED
@@ -0,0 +1,425 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* HF-Master Shared Styles
2
+ Light research-studio system inspired by MCP Video Localizer. */
3
+
4
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&display=swap');
5
+
6
+ :root {
7
+ --peach: #ffad7a;
8
+ --peach-dark: #e8935c;
9
+ --lavender: #b8a9d9;
10
+ --sky-blue: #7accff;
11
+ --ink: #1f2937;
12
+ --slate: #4b5563;
13
+ --muted: #6b7280;
14
+ --bg-light: #f9fafb;
15
+ --surface: #ffffff;
16
+ --surface-soft: #fff7f1;
17
+ --border-default: #e5e7eb;
18
+ --border-subtle: #f3f4f6;
19
+ --accent-subtle: rgba(255, 173, 122, 0.14);
20
+ --shadow-sm: 0 1px 2px rgba(31, 41, 55, 0.05);
21
+ --shadow-md: 0 8px 24px rgba(31, 41, 55, 0.08);
22
+ --shadow-lg: 0 18px 48px rgba(31, 41, 55, 0.12);
23
+ --radius: 8px;
24
+ }
25
+
26
+ html {
27
+ scroll-behavior: smooth;
28
+ }
29
+
30
+ body,
31
+ .gradio-container {
32
+ background:
33
+ radial-gradient(circle at 7% 4%, rgba(255, 173, 122, 0.22), transparent 28%),
34
+ radial-gradient(circle at 88% 8%, rgba(122, 204, 255, 0.20), transparent 30%),
35
+ linear-gradient(180deg, #ffffff 0%, var(--bg-light) 44%, #f7f2fb 100%) !important;
36
+ color: var(--ink) !important;
37
+ font-family: Inter, "Helvetica Neue", "Segoe UI", system-ui, -apple-system, sans-serif !important;
38
+ font-weight: 400;
39
+ letter-spacing: 0;
40
+ -webkit-font-smoothing: antialiased;
41
+ }
42
+
43
+ .gradio-container {
44
+ max-width: 1180px !important;
45
+ margin: 0 auto !important;
46
+ }
47
+
48
+ .main,
49
+ .block-container {
50
+ background: transparent !important;
51
+ }
52
+
53
+ .block-container {
54
+ max-width: 1180px;
55
+ padding-top: 2rem;
56
+ padding-bottom: 2rem;
57
+ }
58
+
59
+ /* Shared hero used by Gradio helpers and Streamlit HTML. */
60
+ .hf-hero,
61
+ .hero {
62
+ position: relative;
63
+ overflow: hidden;
64
+ background:
65
+ radial-gradient(circle at 20% 8%, rgba(255, 255, 255, 0.52), transparent 26%),
66
+ linear-gradient(135deg, var(--peach) 0%, var(--lavender) 54%, var(--sky-blue) 100%);
67
+ border: 1px solid rgba(255, 255, 255, 0.72);
68
+ border-radius: 16px;
69
+ box-shadow: var(--shadow-lg), 0 0 34px rgba(255, 173, 122, 0.18);
70
+ color: #ffffff;
71
+ margin: 1rem 0 1.25rem 0;
72
+ padding: clamp(1.25rem, 3vw, 2.25rem);
73
+ }
74
+
75
+ .hf-hero::after,
76
+ .hero::after {
77
+ content: "";
78
+ position: absolute;
79
+ inset: auto -12% -45% auto;
80
+ width: 360px;
81
+ height: 360px;
82
+ background: rgba(255, 255, 255, 0.22);
83
+ border-radius: 999px;
84
+ pointer-events: none;
85
+ }
86
+
87
+ .hf-hero-grid,
88
+ .hf-hero-copy {
89
+ position: relative;
90
+ z-index: 1;
91
+ }
92
+
93
+ .hf-hero-copy {
94
+ display: flex;
95
+ align-items: flex-start;
96
+ gap: 1rem;
97
+ }
98
+
99
+ .hf-icon {
100
+ align-items: center;
101
+ background: rgba(255, 255, 255, 0.24);
102
+ border: 1px solid rgba(255, 255, 255, 0.45);
103
+ border-radius: 8px;
104
+ box-shadow: var(--shadow-sm);
105
+ display: inline-flex;
106
+ flex: 0 0 auto;
107
+ font-size: 1.7rem;
108
+ height: 3.75rem;
109
+ justify-content: center;
110
+ width: 3.75rem;
111
+ }
112
+
113
+ .hf-badge {
114
+ background: rgba(255, 255, 255, 0.22);
115
+ border: 1px solid rgba(255, 255, 255, 0.42);
116
+ border-radius: 999px;
117
+ color: rgba(255, 255, 255, 0.96);
118
+ display: inline-flex;
119
+ font-size: 0.76rem;
120
+ font-weight: 800;
121
+ letter-spacing: 0.06em;
122
+ margin-bottom: 0.7rem;
123
+ padding: 0.34rem 0.7rem;
124
+ text-transform: uppercase;
125
+ }
126
+
127
+ .hf-hero h1,
128
+ .hero h1 {
129
+ color: #ffffff !important;
130
+ font-size: clamp(2rem, 4vw, 3.35rem);
131
+ font-weight: 800;
132
+ letter-spacing: 0;
133
+ line-height: 1.04;
134
+ margin: 0 0 0.45rem 0;
135
+ text-shadow: 0 2px 12px rgba(31, 41, 55, 0.18);
136
+ }
137
+
138
+ .hf-hero p,
139
+ .hero p {
140
+ color: rgba(255, 255, 255, 0.96) !important;
141
+ font-size: 1.03rem;
142
+ line-height: 1.65;
143
+ margin: 0;
144
+ max-width: 68ch;
145
+ }
146
+
147
+ .hf-chip-row,
148
+ .pill-row {
149
+ display: flex;
150
+ flex-wrap: wrap;
151
+ gap: 0.55rem;
152
+ margin-top: 1.15rem;
153
+ position: relative;
154
+ z-index: 1;
155
+ }
156
+
157
+ .hf-chip,
158
+ .badge,
159
+ .tech-tag {
160
+ background: rgba(255, 255, 255, 0.24);
161
+ border: 1px solid rgba(255, 255, 255, 0.45);
162
+ border-radius: 999px;
163
+ color: #ffffff;
164
+ display: inline-flex;
165
+ font-size: 0.86rem;
166
+ font-weight: 700;
167
+ padding: 0.42rem 0.75rem;
168
+ }
169
+
170
+ .hf-method-grid {
171
+ display: grid;
172
+ gap: 0.9rem;
173
+ grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
174
+ margin: 1rem 0;
175
+ }
176
+
177
+ .hf-method-card,
178
+ .glass-card,
179
+ .project-card,
180
+ .info-card,
181
+ .metric-card,
182
+ .stat-box,
183
+ .gradio-container .form,
184
+ .gradio-container .panel {
185
+ background: rgba(255, 255, 255, 0.88) !important;
186
+ border: 1px solid var(--border-default) !important;
187
+ border-radius: var(--radius) !important;
188
+ box-shadow: var(--shadow-md) !important;
189
+ }
190
+
191
+ .hf-method-card {
192
+ padding: 1rem;
193
+ }
194
+
195
+ .hf-method-card span {
196
+ color: var(--peach-dark);
197
+ display: block;
198
+ font-size: 0.78rem;
199
+ font-weight: 800;
200
+ letter-spacing: 0.04em;
201
+ margin-bottom: 0.35rem;
202
+ text-transform: uppercase;
203
+ }
204
+
205
+ .hf-method-card p {
206
+ color: var(--slate);
207
+ line-height: 1.55;
208
+ margin: 0;
209
+ }
210
+
211
+ .project-card,
212
+ .info-card,
213
+ .metric-card,
214
+ .stat-box {
215
+ padding: 1rem;
216
+ }
217
+
218
+ h1,
219
+ h2,
220
+ h3,
221
+ h4,
222
+ .markdown-text h1,
223
+ .markdown-text h2,
224
+ .markdown-text h3 {
225
+ color: var(--ink) !important;
226
+ font-family: Inter, "Helvetica Neue", system-ui, sans-serif !important;
227
+ letter-spacing: 0;
228
+ }
229
+
230
+ p,
231
+ li,
232
+ label,
233
+ .markdown-text,
234
+ .markdown-text p,
235
+ .markdown-text span {
236
+ color: var(--slate) !important;
237
+ font-family: Inter, "Helvetica Neue", system-ui, sans-serif !important;
238
+ }
239
+
240
+ strong {
241
+ color: var(--ink);
242
+ font-weight: 700;
243
+ }
244
+
245
+ input,
246
+ select,
247
+ textarea,
248
+ .gr-textbox,
249
+ .gr-dropdown {
250
+ background: #ffffff !important;
251
+ border: 1px solid var(--border-default) !important;
252
+ border-radius: var(--radius) !important;
253
+ color: var(--ink) !important;
254
+ font-family: Inter, "Helvetica Neue", system-ui, sans-serif !important;
255
+ transition: border-color 0.15s ease, box-shadow 0.15s ease !important;
256
+ }
257
+
258
+ input:focus,
259
+ select:focus,
260
+ textarea:focus,
261
+ .gr-textbox:focus {
262
+ border-color: var(--peach) !important;
263
+ box-shadow: 0 0 0 3px var(--accent-subtle) !important;
264
+ outline: none !important;
265
+ }
266
+
267
+ button.primary,
268
+ button[class*="primary"],
269
+ div[data-testid="stButton"] > button {
270
+ background: linear-gradient(135deg, var(--peach) 0%, var(--peach-dark) 100%) !important;
271
+ border: 0 !important;
272
+ border-radius: var(--radius) !important;
273
+ box-shadow: 0 8px 20px rgba(255, 173, 122, 0.28) !important;
274
+ color: #ffffff !important;
275
+ font-family: Inter, "Helvetica Neue", system-ui, sans-serif !important;
276
+ font-weight: 800 !important;
277
+ padding: 0.72rem 1.1rem !important;
278
+ transition: transform 0.18s ease, box-shadow 0.18s ease !important;
279
+ }
280
+
281
+ button.primary:hover,
282
+ button[class*="primary"]:hover,
283
+ div[data-testid="stButton"] > button:hover {
284
+ box-shadow: 0 12px 26px rgba(255, 173, 122, 0.36) !important;
285
+ transform: translateY(-1px) !important;
286
+ }
287
+
288
+ button.secondary,
289
+ button[class*="secondary"] {
290
+ background: #ffffff !important;
291
+ border: 1px solid var(--border-default) !important;
292
+ border-radius: var(--radius) !important;
293
+ color: var(--ink) !important;
294
+ font-weight: 700 !important;
295
+ }
296
+
297
+ code,
298
+ pre {
299
+ border-radius: var(--radius) !important;
300
+ }
301
+
302
+ code {
303
+ background: #fff2e8 !important;
304
+ color: #9a4f1f !important;
305
+ }
306
+
307
+ pre {
308
+ background: #111827 !important;
309
+ border: 1px solid #273244 !important;
310
+ color: #f9fafb !important;
311
+ }
312
+
313
+ table {
314
+ border-collapse: collapse;
315
+ width: 100%;
316
+ }
317
+
318
+ th {
319
+ background: #fff2e8;
320
+ color: var(--ink);
321
+ font-weight: 800;
322
+ }
323
+
324
+ td,
325
+ th {
326
+ border-bottom: 1px solid var(--border-default);
327
+ padding: 0.7rem;
328
+ }
329
+
330
+ blockquote,
331
+ .markdown-text blockquote {
332
+ background: #faf9fc !important;
333
+ border-left: 3px solid var(--lavender) !important;
334
+ border-radius: 0 var(--radius) var(--radius) 0 !important;
335
+ color: var(--slate) !important;
336
+ margin: 0.5rem 0 !important;
337
+ padding: 0.75rem 1rem !important;
338
+ }
339
+
340
+ a {
341
+ color: #2774a9 !important;
342
+ font-weight: 700;
343
+ }
344
+
345
+ .hf-footer {
346
+ border-top: 1px solid var(--border-default);
347
+ color: var(--muted);
348
+ font-size: 0.92rem;
349
+ margin-top: 1.5rem;
350
+ padding: 1rem 0;
351
+ text-align: center;
352
+ }
353
+
354
+ .hf-footer strong {
355
+ color: var(--ink);
356
+ }
357
+
358
+ /* Streamlit shell polish. */
359
+ div[data-testid="stHeader"],
360
+ div[data-testid="stToolbar"] {
361
+ background: transparent !important;
362
+ }
363
+
364
+ div[data-testid="stSidebar"] {
365
+ background: rgba(255, 255, 255, 0.82) !important;
366
+ border-right: 1px solid var(--border-default);
367
+ }
368
+
369
+ div[data-baseweb="input"],
370
+ div[data-baseweb="textarea"],
371
+ div[data-baseweb="select"] {
372
+ background: #ffffff !important;
373
+ }
374
+
375
+ div[data-testid="stTextInput"] input,
376
+ div[data-testid="stTextArea"] textarea,
377
+ div[data-testid="stSelectbox"] div {
378
+ border-radius: var(--radius) !important;
379
+ border-color: var(--border-default) !important;
380
+ }
381
+
382
+ div[data-testid="stMetric"] {
383
+ background: rgba(255, 255, 255, 0.9);
384
+ border: 1px solid var(--border-default);
385
+ border-radius: var(--radius);
386
+ box-shadow: var(--shadow-md);
387
+ padding: 0.8rem 1rem;
388
+ }
389
+
390
+ .stPlotlyChart {
391
+ background: rgba(255, 255, 255, 0.86);
392
+ border: 1px solid var(--border-default);
393
+ border-radius: var(--radius);
394
+ box-shadow: var(--shadow-sm);
395
+ padding: 0.3rem;
396
+ }
397
+
398
+ @keyframes fadeIn {
399
+ from {
400
+ opacity: 0;
401
+ transform: translateY(8px);
402
+ }
403
+ to {
404
+ opacity: 1;
405
+ transform: translateY(0);
406
+ }
407
+ }
408
+
409
+ .hf-hero,
410
+ .hf-method-card,
411
+ .metric-card,
412
+ .info-card {
413
+ animation: fadeIn 0.28s ease-out;
414
+ }
415
+
416
+ @media (max-width: 720px) {
417
+ .hf-hero-copy {
418
+ flex-direction: column;
419
+ }
420
+
421
+ .hf-icon {
422
+ height: 3.2rem;
423
+ width: 3.2rem;
424
+ }
425
+ }
shared/utils.py ADDED
@@ -0,0 +1,366 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HF-Master Shared Utilities
3
+ Helper functions for all projects
4
+ """
5
+
6
import ast
import hashlib
import json
import os
import re
import sqlite3
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
14
+
15
+
16
def load_env(var_name: str, default: Optional[str] = None) -> Optional[str]:
    """Read environment variable *var_name*, falling back to *default* when unset."""
    return os.getenv(var_name, default)
19
+
20
+
21
def load_api_key(provider: str = "openai") -> Optional[str]:
    """Return the API key for *provider* from the environment.

    Provider matching is case-insensitive; unknown providers yield None,
    as does a known provider whose environment variable is unset.
    """
    env_vars = {
        "openai": "OPENAI_API_KEY",
        "anthropic": "ANTHROPIC_API_KEY",
        "huggingface": "HF_TOKEN",
        "cohere": "COHERE_API_KEY",
        "together": "TOGETHER_API_KEY",
    }

    var_name = env_vars.get(provider.lower())
    if var_name is None:
        return None
    return os.getenv(var_name)
36
+
37
+
38
def estimate_token_count(text: str, model: str = "gpt-4") -> int:
    """Rough token estimate: character count divided by a per-model chars/token ratio.

    This is a heuristic, not a real tokenizer — roughly 4 characters per
    token for GPT/Claude families, ~3 for Llama-style tokenizers.
    """
    # Approximate characters-per-token for each model family.
    chars_per_token_by_model = {
        "gpt-4": 4,
        "gpt-3.5": 4,
        "claude": 4,
        "llama": 3,
    }

    ratio = chars_per_token_by_model.get(model, 4)
    return len(text) // ratio
49
+
50
+
51
def estimate_tokens(text: str, model: str = "gpt-4") -> int:
    """Deprecated alias for :func:`estimate_token_count`, kept for older apps."""
    return estimate_token_count(text, model=model)
54
+
55
+
56
def calculate_api_cost(
    model: str,
    input_tokens: int,
    output_tokens: int,
    provider: str = "openai"
) -> float:
    """Estimate the USD cost of one API call.

    Rates are USD per 1,000 tokens. Unknown provider/model combinations
    fall back to a generic $0.01 in / $0.03 out rate.
    """

    # Published per-1K-token prices, keyed by provider then model.
    rate_table = {
        "openai": {
            "gpt-4": {"input": 0.03, "output": 0.06},
            "gpt-3.5-turbo": {"input": 0.001, "output": 0.002},
            "gpt-4-turbo": {"input": 0.01, "output": 0.03}
        },
        "anthropic": {
            "claude-3-opus": {"input": 0.015, "output": 0.075},
            "claude-3-sonnet": {"input": 0.003, "output": 0.015}
        }
    }

    rates = rate_table.get(provider, {}).get(model, {"input": 0.01, "output": 0.03})

    cost_in = (input_tokens / 1000) * rates["input"]
    cost_out = (output_tokens / 1000) * rates["output"]
    return cost_in + cost_out
83
+
84
+
85
def calculate_cost(tokens: int, model: str = "gpt-4", provider: str = "openai") -> float:
    """Deprecated alias: prices *tokens* as input tokens with zero output."""
    return calculate_api_cost(model=model, input_tokens=tokens, output_tokens=0, provider=provider)
88
+
89
+
90
def sanitize_filename(name: str) -> str:
    """Turn an arbitrary string into a safe, lowercase, hyphenated filename.

    Drops everything except word characters, whitespace, and hyphens,
    then collapses whitespace runs into single hyphens.
    """
    cleaned = name.lower().strip()
    cleaned = re.sub(r'[^\w\s-]', '', cleaned)
    return re.sub(r'\s+', '-', cleaned)
96
+
97
+
98
def create_hash(text: str, length: int = 8) -> str:
    """Short hex fingerprint of *text* (MD5 prefix — for IDs, not security)."""
    digest = hashlib.md5(text.encode()).hexdigest()
    return digest[:length]
101
+
102
+
103
def format_duration(seconds: float) -> str:
    """Render a duration as '12.3s', '4.5m', or '6.7h' depending on magnitude."""
    if seconds >= 3600:
        return f"{seconds/3600:.1f}h"
    if seconds >= 60:
        return f"{seconds/60:.1f}m"
    return f"{seconds:.1f}s"
111
+
112
+
113
def format_bytes(bytes: int) -> str:
    """Render a byte count in binary (1024-based) human units, e.g. '2.0 KB'."""
    # NOTE: the parameter shadows the `bytes` builtin; the name is kept for
    # backward compatibility with existing keyword callers.
    size = bytes
    for unit in ('B', 'KB', 'MB', 'GB', 'TB'):
        if size < 1024:
            return f"{size:.1f} {unit}"
        size /= 1024
    return f"{size:.1f} PB"
120
+
121
+
122
def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
    """Truncate *text* to at most *max_length* characters, appending *suffix*.

    Text already within the limit is returned unchanged. FIX: when
    ``max_length < len(suffix)`` the old slice index went negative and
    silently dropped characters from the *end* of the string; the kept
    portion is now clamped at zero so only the suffix is returned.
    """
    if len(text) <= max_length:
        return text
    keep = max(max_length - len(suffix), 0)
    return text[:keep] + suffix
127
+
128
+
129
def parse_dice_notation(notation: str) -> Dict[str, Any]:
    """Parse dice notation like ``2d6+3`` or ``4d6kh3``.

    Returns a dict with ``num_dice``, ``die_size``, ``keep_high`` (the raw
    ``khN`` string or None), and ``modifier``.

    Raises:
        ValueError: if *notation* is not valid dice notation.

    FIX: the regex expects lowercase ``d``/``kh`` but the input was being
    ``.upper()``ed, so *every* notation (e.g. "2d6") became "2D6" and failed
    to match. Normalize to lowercase instead, and use fullmatch so trailing
    garbage is rejected.
    """
    match = re.fullmatch(r'(\d+)d(\d+)(kh\d+)?([+-]\d+)?', notation.strip().lower())
    if not match:
        raise ValueError(f"Invalid dice notation: {notation}")

    num_dice = int(match.group(1))
    die_size = int(match.group(2))
    keep_high = match.group(3)
    modifier = int(match.group(4)) if match.group(4) else 0

    return {
        "num_dice": num_dice,
        "die_size": die_size,
        "keep_high": keep_high,
        "modifier": modifier
    }
146
+
147
+
148
def roll_dice(notation: str) -> List[int]:
    """Roll the dice described by *notation* and return the individual results.

    A ``khN`` suffix keeps only the N highest rolls (e.g. "4d6kh3").
    The flat modifier, if any, is parsed but not applied here.
    """
    import random

    spec = parse_dice_notation(notation)
    results = []
    for _ in range(spec["num_dice"]):
        results.append(random.randint(1, spec["die_size"]))

    if spec["keep_high"]:
        keep_count = int(spec["keep_high"][2:])  # strip the "kh" prefix
        results = sorted(results, reverse=True)[:keep_count]

    return results
160
+
161
+
162
def calculate_modifier(ability_score: int) -> int:
    """D&D 5e ability modifier: floor((score - 10) / 2).

    Floor division handles scores below 10 correctly (e.g. 7 -> -2).
    """
    return (ability_score - 10) // 2
165
+
166
+
167
def validate_ethereum_address(address: str) -> bool:
    """Check that *address* looks like an Ethereum address: 0x + 40 hex chars.

    Format check only — no EIP-55 checksum validation.
    """
    return re.match(r'^0x[a-fA-F0-9]{40}$', address) is not None
171
+
172
+
173
def validate_solana_address(address: str) -> bool:
    """Check that *address* is 32-44 base58 characters (Solana address shape).

    Base58 excludes 0, O, I, and l. Format check only — does not verify
    the address decodes to a valid 32-byte key.
    """
    return re.match(r'^[1-9A-HJ-NP-Za-km-z]{32,44}$', address) is not None
177
+
178
+
179
def extract_urls(text: str) -> List[str]:
    """Return every http(s) URL found in *text*, in order of appearance."""
    url_re = re.compile(r'https?://[^\s<>"{}|\\^`\[\]]+')
    return url_re.findall(text)
183
+
184
+
185
def extract_code_blocks(text: str) -> List[str]:
    """Return the contents of fenced ``` code blocks in markdown *text*.

    An optional language tag after the opening fence is discarded.
    """
    fence = re.compile(r'```(?:\w+)?\n(.*?)```', re.DOTALL)
    return fence.findall(text)
189
+
190
+
191
def _eval_math_node(node: ast.AST) -> float:
    """Recursively evaluate a whitelisted arithmetic AST node.

    Supports numeric literals, unary +/-, and the binary operators
    + - * / // ** (exactly what the character filter allows).
    """
    if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
        return node.value
    if isinstance(node, ast.UnaryOp) and isinstance(node.op, (ast.UAdd, ast.USub)):
        operand = _eval_math_node(node.operand)
        return operand if isinstance(node.op, ast.UAdd) else -operand
    if isinstance(node, ast.BinOp):
        binary_ops = [
            (ast.Add, lambda a, b: a + b),
            (ast.Sub, lambda a, b: a - b),
            (ast.Mult, lambda a, b: a * b),
            (ast.Div, lambda a, b: a / b),
            (ast.FloorDiv, lambda a, b: a // b),
            (ast.Pow, lambda a, b: a ** b),
        ]
        for op_type, apply_op in binary_ops:
            if isinstance(node.op, op_type):
                return apply_op(_eval_math_node(node.left), _eval_math_node(node.right))
    raise ValueError("Unsupported operation in expression")


def parse_math_expression(expr: str) -> float:
    """Safely evaluate a simple arithmetic expression such as "2 + 3 * 4".

    Raises:
        ValueError: if *expr* contains disallowed characters, is not
            syntactically valid, or uses a non-arithmetic construct.

    SECURITY FIX: the original used ``eval`` behind a character filter;
    this now walks the AST and evaluates only whitelisted arithmetic
    nodes, so no interpreter-level evaluation of the input occurs.
    """
    allowed_chars = set("0123456789+-*/.() ")
    if not all(c in allowed_chars for c in expr):
        raise ValueError(f"Unsafe expression: {expr}")
    try:
        tree = ast.parse(expr, mode="eval")
    except SyntaxError as err:
        raise ValueError(f"Invalid expression: {expr}") from err
    return _eval_math_node(tree.body)
197
+
198
+
199
def create_timer(func):
    """Decorator that prints how long each call to *func* takes."""
    import time
    from functools import wraps

    @wraps(func)
    def timed(*args, **kwargs):
        started = time.time()
        output = func(*args, **kwargs)
        elapsed = time.time() - started
        print(f"{func.__name__} took {format_duration(elapsed)}")
        return output

    return timed
213
+
214
+
215
def retry_on_failure(max_attempts: int = 3, delay: float = 1.0):
    """Decorator factory: retry the wrapped function on any exception.

    Waits ``delay * attempt_number`` seconds between attempts (linear
    backoff) and re-raises the last exception once attempts are exhausted.

    Raises:
        ValueError: if ``max_attempts`` is less than 1. FIX: previously a
            non-positive value made the wrapper silently return None
            without ever calling the function.
    """
    from functools import wraps
    import time

    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_attempts):
                try:
                    return func(*args, **kwargs)
                except Exception:
                    # Last attempt: propagate instead of sleeping again.
                    if attempt == max_attempts - 1:
                        raise
                    time.sleep(delay * (attempt + 1))

        return wrapper

    return decorator
234
+
235
+
236
class SimpleCache:
    """In-memory key/value cache with least-recently-used eviction.

    Both reads and writes refresh an entry's access time; when the cache
    is full, the entry with the oldest access time is evicted.
    """

    def __init__(self, max_size: int = 100):
        # key -> cached value
        self.cache: Dict[str, Any] = {}
        self.max_size = max_size
        # key -> last access timestamp, used to pick the LRU victim
        self.access_times: Dict[str, datetime] = {}

    def get(self, key: str) -> Optional[Any]:
        """Return the cached value for *key* (refreshing its recency), or None."""
        if key in self.cache:
            self.access_times[key] = datetime.now()
            return self.cache[key]
        return None

    def set(self, key: str, value: Any):
        """Store *value* under *key*, evicting the LRU entry when full.

        FIX: eviction now only happens when inserting a *new* key while the
        cache is non-empty. Previously (a) ``max_size <= 0`` crashed with
        ``min() arg is an empty sequence`` on the first set, and
        (b) overwriting an existing key at capacity wrongly evicted an
        unrelated entry.
        """
        if key not in self.cache and self.cache and len(self.cache) >= self.max_size:
            oldest_key = min(self.access_times, key=self.access_times.get)
            del self.cache[oldest_key]
            del self.access_times[oldest_key]

        self.cache[key] = value
        self.access_times[key] = datetime.now()

    def clear(self):
        """Drop every cached entry and its bookkeeping."""
        self.cache.clear()
        self.access_times.clear()
265
+
266
+
267
class Database:
    """Thin convenience wrapper around a SQLite database file.

    The connection is opened lazily on first query. Also usable as a
    context manager: ``with Database("data.db") as db: ...``.
    """

    def __init__(self, db_path: str = "data.db"):
        self.db_path = db_path
        # Make sure the containing directory exists before sqlite opens the file.
        Path(db_path).parent.mkdir(parents=True, exist_ok=True)
        self.conn: Optional[sqlite3.Connection] = None

    def __enter__(self) -> "Database":
        """Open the connection on entry so `with` blocks get a live handle."""
        self.connect()
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Always close on exit; exceptions propagate."""
        self.close()

    def connect(self):
        """Open the connection; rows come back name-addressable via sqlite3.Row."""
        self.conn = sqlite3.connect(self.db_path)
        self.conn.row_factory = sqlite3.Row

    def close(self):
        """Close the database connection.

        FIX: reset ``self.conn`` to None so a later query reconnects
        instead of failing on a closed connection object.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def execute(self, query: str, params: tuple = ()) -> sqlite3.Cursor:
        """Execute *query* with bound *params*, connecting lazily if needed."""
        if not self.conn:
            self.connect()
        return self.conn.execute(query, params)

    def commit(self):
        """Commit the current transaction, if a connection is open."""
        if self.conn:
            self.conn.commit()

    def fetchall(self, query: str, params: tuple = ()) -> List[Dict]:
        """Run *query* and return every row as a plain dict."""
        cursor = self.execute(query, params)
        return [dict(row) for row in cursor.fetchall()]

    def fetchone(self, query: str, params: tuple = ()) -> Optional[Dict]:
        """Run *query* and return the first row as a dict, or None."""
        cursor = self.execute(query, params)
        row = cursor.fetchone()
        return dict(row) if row else None

    def create_table(self, name: str, columns: Dict[str, str]):
        """Create *name* (if missing) with the given column -> type mapping.

        NOTE(security): *name* and *columns* are interpolated directly into
        the SQL — call this only with trusted identifiers, never user input.
        """
        cols = ", ".join(f"{col} {decl}" for col, decl in columns.items())
        self.execute(f"CREATE TABLE IF NOT EXISTS {name} ({cols})")
        self.commit()
312
+
313
+
314
def load_json_file(filepath: str) -> Dict:
    """Load and parse a JSON file.

    FIX: reads as UTF-8 explicitly — the previous platform-default
    encoding could mis-decode files written elsewhere (e.g. on Windows).
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)
318
+
319
+
320
def save_json_file(data: Dict, filepath: str):
    """Serialize *data* as pretty-printed JSON, creating parent dirs as needed.

    FIX: writes as UTF-8 explicitly so output doesn't depend on the
    platform's default encoding.
    """
    Path(filepath).parent.mkdir(parents=True, exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)
325
+
326
+
327
def merge_dicts(*dicts: Dict) -> Dict:
    """Merge dictionaries left to right; later dicts win on key conflicts."""
    merged: Dict = {}
    for source in dicts:
        for key, value in source.items():
            merged[key] = value
    return merged
333
+
334
+
335
def flatten_list(nested: List[Any]) -> List[Any]:
    """Flatten arbitrarily nested lists into one flat list, preserving order.

    Iterative (explicit stack) rather than recursive, so very deep
    nesting cannot hit the interpreter recursion limit.
    """
    flat: List[Any] = []
    # Reverse so popping from the end yields items in original order.
    stack = list(reversed(nested))
    while stack:
        item = stack.pop()
        if isinstance(item, list):
            stack.extend(reversed(item))
        else:
            flat.append(item)
    return flat
344
+
345
+
346
def chunk_text(text: str, chunk_size: int, overlap: int = 0) -> List[str]:
    """Split *text* into chunks of *chunk_size* characters, each sharing
    *overlap* trailing characters with the next chunk.

    Raises:
        ValueError: if ``chunk_size <= 0`` or ``overlap`` is negative or
            not smaller than ``chunk_size``. FIX: previously
            ``overlap >= chunk_size`` made the start index never advance,
            looping forever while appending chunks.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must be non-negative and smaller than chunk_size")

    chunks = []
    start = 0
    step = chunk_size - overlap  # guaranteed positive by the checks above
    while start < len(text):
        chunks.append(text[start:start + chunk_size])
        start += step

    return chunks
357
+
358
+
359
def get_project_root() -> Path:
    """Return the project root: two directory levels above this module."""
    here = Path(__file__)
    return here.parent.parent
362
+
363
+
364
def ensure_dir(path: str):
    """Create *path* (including parents) if missing; no-op when it exists."""
    target = Path(path)
    target.mkdir(parents=True, exist_ok=True)