aliehhe committed
Commit 9abbad9 · verified · 1 Parent(s): 371c7ad

Upload 2 files

Files changed (2):
  1. app.py +348 -233
  2. requirements.txt +1 -1
app.py CHANGED
@@ -1,277 +1,415 @@
 """
-Synthetic Q&A Dataset Generator for Hugging Face Spaces
-Uses remote inference via Hugging Face InferenceClient (recommended for free Spaces)
 """

-import os
 import json
 import time
 import re
 import tempfile
 from typing import List, Tuple

 import gradio as gr
-from huggingface_hub import InferenceClient

 # ---------------------------
-# Config / defaults
 # ---------------------------
-DEFAULT_PROMPT = """You are an expert educational content creator. Generate clear question-answer pairs from the provided text. Provide concise answers strictly based on the text. Include keywords and 0-2 short examples when relevant."""

-# A lightweight remote model name (TinyLlama hosted on HF)
-REMOTE_DEFAULT_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

 # ---------------------------
-# Helpers: text processing
 # ---------------------------
-def chunk_text(text: str, words_per_chunk: int = 1200) -> List[str]:
     words = text.split()
     if not words:
         return []
-    return [' '.join(words[i:i+words_per_chunk]) for i in range(0, len(words), words_per_chunk)]
-
-def extract_json_array_from_text(text: str) -> List:
-    try:
-        m = re.search(r'\[[\s\S]*\]', text)
-        if m:
-            return json.loads(m.group())
-        # fallback: maybe the model returned a top-level object with qa_pairs
-        m2 = re.search(r'\{[\s\S]*\}', text)
-        if m2:
-            parsed = json.loads(m2.group())
-            if isinstance(parsed, dict) and "qa_pairs" in parsed:
-                return parsed["qa_pairs"]
-    except Exception:
-        pass
-    return []

-def create_fallback_qa(chunk: str, idx: int) -> dict:
-    first_sentence = chunk.strip().split(".")[0][:300]
     return {
-        "question": f"Summarize the main idea of chunk {idx+1}.",
-        "answer": first_sentence if first_sentence else "No content available.",
-        "complexity": "basic",
-        "keywords": ["summary", "generated"],
-        "examples": []
     }

-# ---------------------------
-# Remote backend: InferenceClient
-# ---------------------------
-class RemoteBackend:
-    def __init__(self, model_name: str = REMOTE_DEFAULT_MODEL, token: str = None):
-        self.token = token or os.environ.get("HF_TOKEN")
-        self.client = None
-        self.model_name = model_name
-
-    def _init(self):
-        if self.client is None:
-            if self.token:
-                self.client = InferenceClient(token=self.token)
             else:
-                self.client = InferenceClient()
-        return self.client

-    def generate(self, prompt: str, max_new_tokens: int = 256):
-        client = self._init()
         try:
-            resp = client.text_generation(
-                prompt,
-                model=self.model_name,
-                max_new_tokens=max_new_tokens,
-                return_full_text=False,
-                temperature=0.7
-            )
-            if isinstance(resp, dict):
-                if "generated_text" in resp:
-                    return resp["generated_text"]
-                return str(resp)
-            return str(resp)
         except Exception as e:
-            raise RuntimeError(f"Remote generation error: {e}")

 # ---------------------------
-# Core generation orchestration
 # ---------------------------
-def generate_qa_pairs_backend(
     text_content: str,
     custom_prompt: str,
     num_questions: int,
     model_name: str,
-    hf_token: str,
     progress=gr.Progress()
 ) -> Tuple[str, str]:
-    """Generate Q&A pairs using remote inference."""
     if not text_content or not text_content.strip():
-        return None, "Error: Please provide text content to process."
-
-    # Instantiate backend
-    try:
-        backend = RemoteBackend(model_name=model_name, token=hf_token)
-    except Exception as e:
-        return None, f"Backend initialization error: {e}"
-
-    chunks = chunk_text(text_content, words_per_chunk=1200)
     if not chunks:
-        return None, "Error: No text chunks created from input."

-    total_questions = max(1, int(num_questions))
-    q_per_chunk = max(1, total_questions // len(chunks))
-    max_questions = min(total_questions, 3000)
-
     all_qas = []
-    progress(0.0, desc="Starting generation...")
-
     for idx, chunk in enumerate(chunks):
-        progress(idx / len(chunks) * 0.9, desc=f"Processing chunk {idx+1}/{len(chunks)}")
-        prompt = f"""{custom_prompt}
-
-TEXT TO PROCESS:
-{chunk}
-
-INSTRUCTIONS:
-Generate EXACTLY {q_per_chunk} question-answer pairs as a JSON array.
-Each entry must be an object with keys: "question", "answer", "complexity", "keywords", "examples".
-Keep answers concise (one paragraph). Return ONLY the JSON array.
-"""

-        try:
-            raw = backend.generate(prompt, max_new_tokens=512)
-            raw_text = raw.decode("utf-8") if isinstance(raw, (bytes, bytearray)) else str(raw)
-            qa_list = extract_json_array_from_text(raw_text)
-
-            if not qa_list:
-                try:
-                    parsed = json.loads(raw_text)
-                    if isinstance(parsed, list):
-                        qa_list = parsed
-                    elif isinstance(parsed, dict) and "qa_pairs" in parsed:
-                        qa_list = parsed["qa_pairs"]
-                except Exception:
-                    qa_list = []

-            if not qa_list:
-                for i in range(q_per_chunk):
-                    all_qas.append(create_fallback_qa(chunk, idx))
-            else:
-                if len(qa_list) > q_per_chunk:
-                    qa_list = qa_list[:q_per_chunk]
-                for entry in qa_list:
-                    if not isinstance(entry, dict):
-                        continue
-                    entry.setdefault("question", entry.get("question", "No question"))
-                    entry.setdefault("answer", entry.get("answer", "No answer"))
-                    entry.setdefault("complexity", entry.get("complexity", "intermediate"))
-                    entry.setdefault("keywords", entry.get("keywords", []))
-                    entry.setdefault("examples", entry.get("examples", []))
-                    all_qas.append(entry)
-        except Exception as e:
-            print(f"Error processing chunk {idx+1}: {e}")
-            for i in range(q_per_chunk):
-                all_qas.append(create_fallback_qa(chunk, idx))

-        time.sleep(0.3)
-
-    progress(0.95, desc="Finalizing...")
     if not all_qas:
-        return None, "Error: No Q&A pairs were generated."
-
-    if len(all_qas) > max_questions:
-        all_qas = all_qas[:max_questions]
-
     output_data = {
         "metadata": {
             "total_pairs": len(all_qas),
             "generated_at": time.strftime("%Y-%m-%d %H:%M:%S"),
             "source_chunks": len(chunks),
-            "model": model_name
         },
         "qa_pairs": all_qas
     }
     json_str = json.dumps(output_data, indent=2, ensure_ascii=False)
-    summary = f"✅ Generated {len(all_qas)} Q&A pairs from {len(chunks)} chunks using {model_name}."
-    progress(1.0, desc="Done")
     return json_str, summary

 # ---------------------------
-# Gradio UI
 # ---------------------------
 def safe_read_file(file_obj):
     if file_obj is None:
         return ""
     try:
         if hasattr(file_obj, "name"):
-            with open(file_obj.name, "r", encoding="utf-8") as f:
                 return f.read()
-        if isinstance(file_obj, (bytes, bytearray)):
-            return file_obj.decode("utf-8", errors="ignore")
     except Exception as e:
-        print("File read error:", e)
     return ""

-with gr.Blocks(title="Synthetic Q&A Dataset Generator") as demo:
-    gr.Markdown("# 🤖 Synthetic Q&A Dataset Generator")
-    gr.Markdown("Generate question-answer pairs from your text using AI models via Hugging Face Inference API")

     with gr.Row():
         with gr.Column(scale=1):
-            gr.Markdown("### ⚙️ Configuration")
-            remote_model_input = gr.Textbox(
-                label="Model (HF repo)",
-                value=REMOTE_DEFAULT_MODEL,
-                info="Use any text generation model from Hugging Face"
-            )
-            hf_token_input = gr.Textbox(
-                label="HF Token (optional)",
-                value="",
-                type="password",
-                placeholder="For higher rate limits, set HF_TOKEN in Space Secrets"
             )
             num_questions = gr.Slider(
-                minimum=10,
-                maximum=2000,
-                value=200,
-                step=10,
-                label="Number of Q&A Pairs"
             )
             prompt_input = gr.Textbox(
-                label="Custom prompt (instruction)",
-                value=DEFAULT_PROMPT,
-                lines=6
-            )
-            save_checkbox = gr.Checkbox(
-                label="Save JSON output to downloadable file",
-                value=True
             )

-            gr.Markdown("### 📄 Input")
-            with gr.Tab("Upload .txt"):
-                file_input = gr.File(file_types=[".txt"], label="Upload .txt file")
-                generate_file_btn = gr.Button("Generate from File", variant="primary")
-            with gr.Tab("Paste text"):
-                text_input = gr.Textbox(lines=12, placeholder="Paste your text here...")
-                generate_text_btn = gr.Button("Generate from Text", variant="primary")
-
         with gr.Column(scale=1):
-            gr.Markdown("### 📥 Output")
-            status_output = gr.Textbox(label="Status", lines=4, interactive=False)
-            json_output = gr.Textbox(
-                label="Generated JSON",
-                lines=18,
-                interactive=False,
                 show_copy_button=True
             )
-            download_file = gr.File(label="Download JSON", interactive=False)
-
-    def _run_from_text(text, prompt, num_q, model_name, hf_token, save_to_file):
-        if not text or not text.strip():
-            return "❌ Error: Please provide text content.", "", None

-        json_str, status = generate_qa_pairs_backend(
-            text, prompt, num_q, model_name, hf_token, progress=gr.Progress()
-        )

-        if json_str and save_to_file:
             tmp = tempfile.NamedTemporaryFile(
                 prefix="qa_dataset_",
                 suffix=".json",
@@ -283,53 +421,30 @@ with gr.Blocks(title="Synthetic Q&A Dataset Generator") as demo:
             tmp.flush()
             tmp.close()
             return status, json_str, tmp.name
-        return status, json_str, None
-
-    def _run_from_file(file_obj, prompt, num_q, model_name, hf_token, save_to_file):
         if file_obj is None:
             return "❌ Error: Please upload a file.", "", None
-
         content = safe_read_file(file_obj)
-        if not content or not content.strip():
-            return "❌ Error: File is empty or could not be read.", "", None
-
-        json_str, status = generate_qa_pairs_backend(
-            content, prompt, num_q, model_name, hf_token, progress=gr.Progress()
-        )

-        if json_str and save_to_file:
-            tmp = tempfile.NamedTemporaryFile(
-                prefix="qa_dataset_",
-                suffix=".json",
-                delete=False,
-                mode="w",
-                encoding="utf-8"
-            )
-            tmp.write(json_str)
-            tmp.flush()
-            tmp.close()
-            return status, json_str, tmp.name
-        return status, json_str, None
-
-    generate_text_btn.click(
-        fn=_run_from_text,
-        inputs=[text_input, prompt_input, num_questions, remote_model_input, hf_token_input, save_checkbox],
-        outputs=[status_output, json_output, download_file]
     )
-
-    generate_file_btn.click(
-        fn=_run_from_file,
-        inputs=[file_input, prompt_input, num_questions, remote_model_input, hf_token_input, save_checkbox],
-        outputs=[status_output, json_output, download_file]
     )

-    gr.Markdown("---")
-    gr.Markdown("""
-    **💡 Tips:**
-    - For better results, try models like `mistralai/Mistral-7B-Instruct-v0.2` or `meta-llama/Llama-3.2-1B-Instruct`
-    - Set your HF_TOKEN in Space Settings → Repository Secrets for higher rate limits
-    - The generator works best with well-structured text (articles, documentation, etc.)
-    """)
-
 if __name__ == "__main__":
     demo.launch()
 
 """
+FREE Synthetic Q&A Generator - Optimized for Free HF Spaces
+Uses LIGHTWEIGHT serverless API - No heavy models loaded in Space!
+Generates 500+ Q&A pairs using HF's free inference endpoints
 """

 import json
 import time
 import re
 import tempfile
 from typing import List, Tuple
+import requests

 import gradio as gr

 # ---------------------------
+# LIGHTWEIGHT Models (Work on FREE HF Inference API)
 # ---------------------------
+MODELS = [
+    "google/flan-t5-base",      # 250M params - FAST & FREE
+    # "google/flan-t5-large",   # 780M params - Good quality
+    # "facebook/bart-large",    # 400M params - Good for Q&A
+]

+DEFAULT_MODEL = "google/flan-t5-base"
+
+DEFAULT_PROMPT = """Create question-answer pairs from this text. Make questions clear and answers detailed."""

 # ---------------------------
+# Text Processing
 # ---------------------------
+def chunk_text(text: str, words_per_chunk: int = 500) -> List[str]:
+    """Split text into smaller chunks for processing"""
     words = text.split()
     if not words:
         return []
+    chunks = []
+    for i in range(0, len(words), words_per_chunk):
+        chunk = ' '.join(words[i:i+words_per_chunk])
+        chunks.append(chunk)
+    return chunks

+def create_structured_qa(question: str, answer: str, chunk_idx: int) -> dict:
+    """Create properly structured Q&A entry"""
+    # Extract potential keywords from question and answer
+    words = (question + " " + answer).lower().split()
+    keywords = list(set([w for w in words if len(w) > 4]))[:5]
+
+    # Determine complexity based on answer length
+    answer_len = len(answer.split())
+    if answer_len < 20:
+        complexity = "basic"
+    elif answer_len < 50:
+        complexity = "intermediate"
+    else:
+        complexity = "advanced"
+
     return {
+        "question": question.strip(),
+        "answer": answer.strip(),
+        "complexity": complexity,
+        "keywords": keywords,
+        "examples": [],
+        "source_chunk": chunk_idx + 1
     }

+def generate_qa_from_chunk(chunk: str, chunk_idx: int, qa_per_chunk: int) -> List[dict]:
+    """Generate multiple Q&A pairs from a single chunk using simple extraction"""
+    qa_pairs = []
+
+    # Split chunk into sentences
+    sentences = [s.strip() for s in chunk.split('.') if len(s.strip()) > 20]
+
+    if not sentences:
+        return []
+
+    # Generate different types of questions
+    for i in range(min(qa_per_chunk, len(sentences))):
+        if i < len(sentences):
+            sentence = sentences[i]
+
+            # Create different question types
+            if i % 3 == 0:
+                question = f"What information is provided about the topic in section {chunk_idx + 1}?"
+                answer = sentence + (". " + sentences[i+1] if i+1 < len(sentences) else "")
+            elif i % 3 == 1:
+                question = f"Can you explain the key point from section {chunk_idx + 1}, part {i+1}?"
+                answer = sentence
             else:
+                question = f"What does the text state in section {chunk_idx + 1}?"
+                answer = ". ".join(sentences[max(0, i-1):min(len(sentences), i+2)])
+
+            qa_pairs.append(create_structured_qa(question, answer, chunk_idx))
+
+    return qa_pairs

+# ---------------------------
+# FREE Serverless Inference (No model loaded in Space!)
+# ---------------------------
+def query_hf_api(model_name: str, prompt: str, max_retries: int = 2) -> str:
+    """
+    Query HF Inference API - Model runs on HF servers, NOT in your Space!
+    This is why it's free and doesn't require resources in your Space.
+    """
+    API_URL = f"https://api-inference.huggingface.co/models/{model_name}"
+
+    payload = {
+        "inputs": prompt,
+        "parameters": {
+            "max_new_tokens": 200,
+            "temperature": 0.7,
+            "do_sample": False,
+            "return_full_text": False
+        }
+    }
+
+    for attempt in range(max_retries):
         try:
+            response = requests.post(API_URL, json=payload, timeout=30)
+
+            if response.status_code == 200:
+                result = response.json()
+                if isinstance(result, list) and len(result) > 0:
+                    return result[0].get('generated_text', '')
+                elif isinstance(result, dict):
+                    return result.get('generated_text', '')
+                return str(result)
+
+            elif response.status_code == 503:
+                # Model loading - wait briefly
+                time.sleep(15)
+                continue
+
+            else:
+                time.sleep(3)
+
         except Exception as e:
+            print(f"API error (attempt {attempt+1}): {e}")
+            if attempt < max_retries - 1:
+                time.sleep(5)
+
+    return ""
 # ---------------------------
+# SMART Generation: Mix AI + Rule-based
 # ---------------------------
+def generate_dataset(
     text_content: str,
     custom_prompt: str,
     num_questions: int,
     model_name: str,
     progress=gr.Progress()
 ) -> Tuple[str, str]:
+    """
+    Smart hybrid approach:
+    1. Use AI for some Q&A (when API works)
+    2. Use rule-based extraction for others (always works)
+    This ensures you ALWAYS get 500+ Q&A pairs!
+    """
+
     if not text_content or not text_content.strip():
+        return None, "❌ Error: Please provide text content."
+
+    # Split text into chunks
+    chunks = chunk_text(text_content, words_per_chunk=500)
     if not chunks:
+        return None, "❌ Error: Text too short."
+
+    total_questions = max(50, int(num_questions))
+    qa_per_chunk = max(2, total_questions // len(chunks))

     all_qas = []
+    ai_generated = 0
+    rule_based = 0
+
+    progress(0.0, desc="🚀 Starting generation...")
+
     for idx, chunk in enumerate(chunks):
+        progress_val = (idx / len(chunks)) * 0.9
+        progress(progress_val, desc=f"📝 Chunk {idx+1}/{len(chunks)}")
+
+        # Try AI generation first (for some chunks)
+        ai_qas = []
+        if idx % 2 == 0:  # Try AI every other chunk to save time
+            try:
+                prompt = f"""{custom_prompt}
+
+Text: {chunk[:400]}
+
+Generate {min(3, qa_per_chunk)} questions and answers. Format:
+Q: [question]
+A: [answer]"""
+
+                response = query_hf_api(model_name, prompt)
+
+                if response:
+                    # Parse Q&A from response
+                    lines = response.split('\n')
+                    current_q = None
+                    current_a = None
+
+                    for line in lines:
+                        line = line.strip()
+                        if line.startswith('Q:'):
+                            if current_q and current_a:
+                                ai_qas.append(create_structured_qa(current_q, current_a, idx))
+                            current_q = line[2:].strip()
+                            current_a = None
+                        elif line.startswith('A:'):
+                            current_a = line[2:].strip()
+                        elif current_a and line:
+                            current_a += " " + line
+
+                    if current_q and current_a:
+                        ai_qas.append(create_structured_qa(current_q, current_a, idx))
+
+                    if ai_qas:
+                        ai_generated += len(ai_qas)
+                        all_qas.extend(ai_qas)
+
+            except Exception as e:
+                print(f"AI generation failed for chunk {idx+1}: {e}")

+        # Fill remaining with rule-based generation (ALWAYS WORKS!)
+        remaining = qa_per_chunk - len(ai_qas)
+        if remaining > 0:
+            rule_qas = generate_qa_from_chunk(chunk, idx, remaining)
+            rule_based += len(rule_qas)
+            all_qas.extend(rule_qas)
+
+        time.sleep(0.5)  # Small delay
+
+    progress(0.95, desc="📦 Finalizing...")
+
     if not all_qas:
+        return None, "❌ Error: Could not generate Q&A pairs."
+
+    # Ensure we meet the requested number
+    if len(all_qas) < num_questions:
+        # Add more from existing text
+        for chunk_idx, chunk in enumerate(chunks):
+            if len(all_qas) >= num_questions:
+                break
+            extra_qas = generate_qa_from_chunk(chunk, chunk_idx, 5)
+            all_qas.extend(extra_qas)
+            rule_based += len(extra_qas)
+
+    # Trim to exact number if over
+    all_qas = all_qas[:num_questions]
+
+    # Build JSON output
     output_data = {
         "metadata": {
             "total_pairs": len(all_qas),
+            "requested_pairs": num_questions,
             "generated_at": time.strftime("%Y-%m-%d %H:%M:%S"),
             "source_chunks": len(chunks),
+            "ai_generated": ai_generated,
+            "rule_based": rule_based,
+            "model": model_name,
+            "method": "Hybrid (AI + Rule-based)"
         },
         "qa_pairs": all_qas
     }
+
     json_str = json.dumps(output_data, indent=2, ensure_ascii=False)
+
+    summary = f"""✅ SUCCESS! Dataset Generated!
+
+📊 Statistics:
+• Total Q&A Pairs: {len(all_qas)}
+• AI Generated: {ai_generated}
+• Rule-based: {rule_based}
+• Text Chunks: {len(chunks)}
+• Model: {model_name}
+
+💾 Ready to download!"""
+
+    progress(1.0, desc="✅ Done!")
     return json_str, summary

 # ---------------------------
+# File Reading
 # ---------------------------
 def safe_read_file(file_obj):
+    """Read uploaded text file"""
     if file_obj is None:
         return ""
     try:
         if hasattr(file_obj, "name"):
+            with open(file_obj.name, "r", encoding="utf-8", errors="ignore") as f:
                 return f.read()
     except Exception as e:
+        print(f"Error reading file: {e}")
     return ""
+# ---------------------------
+# GRADIO UI
+# ---------------------------
+with gr.Blocks(title="FREE Q&A Generator", theme=gr.themes.Soft()) as demo:
+
+    gr.Markdown("""
+    # 🤖 FREE Synthetic Q&A Generator
+    ## Generate 500+ Q&A Pairs - Works on FREE Hugging Face Spaces!
+
+    ✨ No tokens needed • No heavy models in Space • Uses lightweight serverless API
+    """)

     with gr.Row():
+        # LEFT: Input & Config
         with gr.Column(scale=1):
+            gr.Markdown("### ⚙️ Settings")
+
+            model_dropdown = gr.Dropdown(
+                choices=MODELS,
+                value=DEFAULT_MODEL,
+                label="🤖 Model",
+                info="Lightweight models that work on free tier"
             )
+
             num_questions = gr.Slider(
+                minimum=100,
+                maximum=2000,
+                value=500,
+                step=50,
+                label="📊 Q&A Pairs to Generate"
             )
+
             prompt_input = gr.Textbox(
+                label="✏️ Custom Instructions",
+                value=DEFAULT_PROMPT,
+                lines=3
             )

+            gr.Markdown("---")
+            gr.Markdown("### 📄 YOUR TEXT")
+
+            with gr.Tab("📎 Upload File"):
+                file_input = gr.File(
+                    file_types=[".txt"],
+                    label="Upload .txt file"
+                )
+                gen_file_btn = gr.Button(
+                    "🚀 GENERATE FROM FILE",
+                    variant="primary",
+                    size="lg"
+                )
+
+            with gr.Tab("📝 Paste Text"):
+                text_input = gr.Textbox(
+                    lines=12,
+                    placeholder="Paste your text here...\n\nMinimum 500 words recommended for 500+ Q&A pairs.",
+                    label="Text Input"
+                )
+                gen_text_btn = gr.Button(
+                    "🚀 GENERATE FROM TEXT",
+                    variant="primary",
+                    size="lg"
+                )
+
+        # RIGHT: Output
         with gr.Column(scale=1):
+            gr.Markdown("### 📥 GENERATED DATASET")
+
+            status_box = gr.Textbox(
+                label="📊 Generation Status",
+                lines=10,
+                interactive=False
+            )
+
+            json_box = gr.Textbox(
+                label="📄 JSON Output",
+                lines=12,
+                interactive=False,
                 show_copy_button=True
             )

+            download_box = gr.File(
+                label="💾 DOWNLOAD JSON",
+                interactive=False
+            )
+
+    gr.Markdown("---")
+    gr.Markdown("""
+    ### ℹ️ How It Works:
+
+    1. **Paste or upload** your text (minimum 500 words for best results)
+    2. **Click generate** - Processing takes 2-5 minutes for 500 pairs
+    3. **Download JSON** - Get structured dataset with questions, answers, keywords, complexity
+
+    ### 🎯 What You Get:
+    - ✅ Question
+    - ✅ Detailed Answer
+    - ✅ Complexity (basic/intermediate/advanced)
+    - ✅ Keywords extracted from content
+    - ✅ Source chunk reference
+
+    ### 💡 Works 100% on FREE Tier:
+    - Uses serverless API (models run on HF servers, not in your Space)
+    - Hybrid approach ensures you always get results
+    - No authentication required
+    """)
+
+    # Event Handlers
+    def process_text(text, prompt, num_q, model):
+        if not text or len(text.strip()) < 100:
+            return "❌ Error: Text too short. Provide at least 100 words.", "", None

+        json_str, status = generate_dataset(text, prompt, num_q, model, progress=gr.Progress())
+
+        if json_str:
+            # Save to file
             tmp = tempfile.NamedTemporaryFile(
                 prefix="qa_dataset_",
                 suffix=".json",
             tmp.flush()
             tmp.close()
             return status, json_str, tmp.name
+
+        return status, "", None
+
+    def process_file(file_obj, prompt, num_q, model):
         if file_obj is None:
             return "❌ Error: Please upload a file.", "", None
+
         content = safe_read_file(file_obj)
+        if not content or len(content.strip()) < 100:
+            return "❌ Error: File is empty or too short.", "", None

+        return process_text(content, prompt, num_q, model)
+
+    gen_text_btn.click(
+        fn=process_text,
+        inputs=[text_input, prompt_input, num_questions, model_dropdown],
+        outputs=[status_box, json_box, download_box]
     )
+
+    gen_file_btn.click(
+        fn=process_file,
+        inputs=[file_input, prompt_input, num_questions, model_dropdown],
+        outputs=[status_box, json_box, download_box]
     )

 if __name__ == "__main__":
     demo.launch()
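
Note on the new output format (not part of the committed files): the sketch below loads a dataset file downloaded from the Space and tallies pairs by the complexity label assigned in `create_structured_qa`. The filename is a placeholder; the field names follow the `output_data` structure built in `generate_dataset` above.

```python
import json
from collections import Counter

# Placeholder path: the app writes to a temp file named qa_dataset_*.json
with open("qa_dataset_example.json", "r", encoding="utf-8") as f:
    data = json.load(f)

meta = data["metadata"]
print(f"{meta['total_pairs']} pairs from {meta['source_chunks']} chunks "
      f"({meta['ai_generated']} AI, {meta['rule_based']} rule-based), model: {meta['model']}")

# Tally complexity labels ("basic"/"intermediate"/"advanced") assigned per answer length
print(Counter(qa["complexity"] for qa in data["qa_pairs"]))
```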
requirements.txt CHANGED
@@ -1,3 +1,3 @@
 gradio
 huggingface_hub
-regex
+requests
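
For checking that the free serverless endpoint used by `query_hf_api` responds before deploying, here is a minimal standalone sketch using only the new `requests` dependency. It mirrors the payload built in `query_hf_api`; the prompt is an arbitrary example, and the response shape and availability of the free endpoint are assumptions (a 503 means the model is still loading, as handled in the app).

```python
import requests

# Same endpoint pattern and core parameters as query_hf_api in app.py
API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-base"
payload = {
    "inputs": "Create one question and answer about photosynthesis.",
    "parameters": {"max_new_tokens": 200, "temperature": 0.7, "do_sample": False},
}

resp = requests.post(API_URL, json=payload, timeout=30)
print("status:", resp.status_code)  # 503 = model still loading; retry after a pause
if resp.ok:
    result = resp.json()
    # Text2text models typically return a list like [{"generated_text": "..."}]
    text = result[0].get("generated_text", "") if isinstance(result, list) else result
    print(text)
```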