Caden Shokat committed on
Commit
14c03a7
·
1 Parent(s): 49a5af2

processing/chunking updates

Browse files
src/processing/generate_qas.py CHANGED
@@ -1,70 +1,103 @@
1
- import os, json, glob, time
2
  from typing import List, Dict
3
  from dotenv import load_dotenv
 
4
  from load_chunks import load_all_chunks
5
  from openai import OpenAI
6
 
7
  openai = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
8
  NUM_QUESTIONS = 4
9
  SLEEP = 5
10
- MODEL = "gpt-4o-mini"
11
 
12
def make_prompt(chunk_text: str) -> str:
    """Build the user prompt asking for NUM_QUESTIONS factual questions about *chunk_text*."""
    template = """
You are creating reading-comprehension questions.
Given only the text below, generate exactly {n} short, factual questions
that can be answered *only* from that text.
Return your output as a JSON array of objects with fields:
- id: integer question number (1..{n})
- question: the question text

Text:
\"\"\"
{text}
\"\"\"
"""
    return template.format(n=NUM_QUESTIONS, text=chunk_text)
26
-
27
def generate(prompt: str) -> List[Dict]:
    """Send *prompt* to the chat API and return the parsed JSON question list.

    Retries indefinitely when the model's reply is not valid JSON,
    sleeping one second between attempts.

    Args:
        prompt: Full user prompt produced by make_prompt().

    Returns:
        The JSON array parsed from the model's reply.
    """
    while True:
        try:
            resp = openai.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": "You output JSON only, no extra text."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.7,
                # Rough budget: ~50 tokens per requested question.
                max_tokens=NUM_QUESTIONS * 50,
            )
            text = resp.choices[0].message.content.strip()
            return json.loads(text)
        except json.JSONDecodeError as e:
            # Model occasionally adds prose around the JSON; retry until clean.
            # Fix: message previously read "Failted".
            print("Failed to parse JSON, retrying...", e)
            time.sleep(1)
44
 
45
def main():
    """Generate questions for every chunk and write them to a JSONL file."""
    chunks = load_all_chunks("../../../dataset/chunks/*.json")

    output_records = []
    for chunk in chunks:
        qas = generate(make_prompt(chunk["chunk"]))
        output_records.extend(
            {
                "doc_id": chunk.get("doc_id"),
                "section": chunk.get("section"),
                "chunk_id": chunk["chunk_id"],
                "question_id": qa["id"],
                "question": qa["question"],
            }
            for qa in qas
        )
        # Gentle pacing between API calls.
        time.sleep(0.5)

    with open("synthetic_qa_pairs.jsonl", "w", encoding="utf-8") as out:
        out.writelines(json.dumps(rec, ensure_ascii=False) + "\n" for rec in output_records)

    print(f"Done — generated {len(output_records)} questions across {len(chunks)} chunks.")
68
 
69
  if __name__ == "__main__":
70
  main()
 
1
+ import os, json, glob, time, re
2
  from typing import List, Dict
3
  from dotenv import load_dotenv
4
+ import argparse
5
  from load_chunks import load_all_chunks
6
  from openai import OpenAI
7
 
8
  openai = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
9
  NUM_QUESTIONS = 4
10
  SLEEP = 5
11
+ model = "gpt-4o-mini"
12
 
13
def make_prompt(chunk_text: str) -> str:
    """Wrap *chunk_text* in the user prompt that requests strict-JSON output."""
    instructions = (
        "\nGenerate according to the above rules. Return **only** json. "
        "**All** string fields must be valid JSON strings wrapped in double quotes.\n\n"
    )
    chunk_section = 'Here is the text chunk:\n\n"""\n' + chunk_text + '\n"""\n\n\n'
    return instructions + chunk_section
 
 
19
 
20
def generate(model: str, prompt: str, max_retries: int = 5) -> List[Dict]:
    """Ask *model* for QA pairs and parse the reply as a JSON array.

    The model sometimes wraps its answer in Markdown code fences or adds
    stray prose, so the reply is cleaned before parsing:
      1. strip a leading ```/```json fence and a trailing ``` fence,
      2. keep only the outermost [...] span when one is present.

    Args:
        model: OpenAI chat model name (e.g. "gpt-4o-mini").
        prompt: User prompt produced by make_prompt().
        max_retries: Malformed responses tolerated before giving up
            (the previous version retried forever and printed raw debug
            output on every call).

    Returns:
        The parsed JSON array of question objects.

    Raises:
        json.JSONDecodeError: if every attempt returns unparseable JSON.
    """
    system_msg = """
You are an expert at generating reading-comprehension questions in **strict JSON** form.
Given the user’s chunk, you will output **only** a JSON array of objects—no commentary, no extra text.
Each object must have:
- question : the question text
- answer_span : the exact sentence from the chunk that answers this question
Output exactly 4 questions.
"""
    for attempt in range(max_retries):
        try:
            resp = openai.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_msg},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.2,
                # Rough budget: ~100 tokens per requested question.
                max_tokens=NUM_QUESTIONS * 100,
            )
            raw = resp.choices[0].message.content.strip()
            # Remove Markdown code fences the model sometimes adds.
            raw = re.sub(r"^```(?:json)?\s*", "", raw)
            raw = re.sub(r"```$", "", raw).strip()
            # Keep only the JSON array if extra text surrounds it.
            m = re.search(r"\[.*\]", raw, flags=re.S)
            if m:
                raw = m.group(0)
            return json.loads(raw)
        except json.JSONDecodeError as e:
            if attempt == max_retries - 1:
                raise  # exhausted retries; surface the parse failure
            print("Failed to parse JSON, retrying...", e)
            time.sleep(1)
58
 
59
def main():
    """CLI entry point: read chunk files, generate QA pairs, write JSONL.

    For every chunk record, requests questions from the model and writes one
    JSON line per question carrying the question, its answer span, and the
    ids needed to trace it back to the source chunk.
    """
    parser = argparse.ArgumentParser(
        description="Generate QA pairs from chunk JSON files via GPT-4"
    )
    parser.add_argument("chunks_glob",
                        help="Glob pattern for chunk JSON files (e.g. 'chunks/**/*.json')")
    parser.add_argument("output",
                        help="Output JSONL file for QA pairs")
    parser.add_argument("--model", default=model,
                        # Fix: help previously claimed "gpt-4" while the default is gpt-4o-mini.
                        help=f"OpenAI model to use (default: {model})")
    parser.add_argument("--sleep", type=float, default=0.5,
                        help="Seconds to sleep between requests (default: 0.5)")
    args = parser.parse_args()

    # Validate the key via the environment. The module-level client was
    # already constructed from this variable, so mutating the client's
    # api_key here (as the old code did) would be too late to matter.
    if not os.getenv("OPENAI_API_KEY"):
        parser.error("Please set OPENAI_API_KEY environment variable")

    chunks = load_all_chunks(args.chunks_glob)
    print(f"Loaded {len(chunks)} chunks.")

    with open(args.output, "w", encoding="utf-8") as out_f:
        total = 0
        for rec in chunks:
            qas = generate(args.model, make_prompt(rec["text"]))
            # question_id is 1-based within each chunk; global_id is 0-based overall.
            for i, qa in enumerate(qas, start=1):
                out = {
                    "global_id": total,
                    "doc_id": rec["doc_id"],
                    "chunk_id": rec["chunk_id"],
                    "question_id": i,
                    "question": qa["question"],
                    "answer_span": qa["answer_span"],
                    "chunk": rec.get('text'),
                }
                out_f.write(json.dumps(out, ensure_ascii=False) + "\n")
                total += 1
            time.sleep(args.sleep)

    print(f"Done — generated {total} questions across {len(chunks)} chunks into '{args.output}'.")
101
 
102
  if __name__ == "__main__":
103
  main()
src/processing/load_chunks.py CHANGED
@@ -2,8 +2,12 @@ from typing import List, Dict
2
  import json, glob
3
 
4
def load_all_chunks(glob_pattern: str) -> List[Dict]:
    """Load and concatenate every chunk record matched by *glob_pattern*.

    Each matched file must contain a JSON array; the arrays are flattened
    into a single list (empty when nothing matches).

    Args:
        glob_pattern: Glob pattern (``**`` is honored) for chunk JSON files.

    Returns:
        All records from all matched files, in glob order.
    """
    all_chunks = []
    for path in glob.glob(glob_pattern, recursive=True):
        # 'with' closes the handle promptly; the original json.load(open(...))
        # leaked the file object to the garbage collector.
        with open(path, encoding="utf-8") as f:
            all_chunks.extend(json.load(f))
    return all_chunks
 
 
 
 
 
2
  import json, glob
3
 
4
def load_all_chunks(glob_pattern: str) -> List[Dict]:
    """Load chunk records from every JSON file matching *glob_pattern*.

    Each file must hold a JSON array of records, and every record must
    carry the keys "doc_id", "chunk_id" and "text".

    Args:
        glob_pattern: Glob pattern (``**`` is honored) for chunk JSON files.

    Returns:
        All validated records, in glob order.

    Raises:
        ValueError: if any record is missing a required key.
    """
    required = ("doc_id", "chunk_id", "text")
    chunks = []
    for path in glob.glob(glob_pattern, recursive=True):
        # Context manager closes the file even if json.load raises; the
        # original json.load(open(...)) left the handle to the GC.
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        for rec in data:
            if any(key not in rec for key in required):
                raise ValueError(f"Missing required keys in chunk record: {rec}")
            chunks.append(rec)
    return chunks
src/processing/output.jsonl ADDED
The diff for this file is too large to render. See raw diff