Lukeam committed on
Commit 1bf5b03 · verified · 1 Parent(s): 3169eee

Upload qanda_gen_script.py with huggingface_hub

Files changed (1)
  1. qanda_gen_script.py +156 -0
qanda_gen_script.py ADDED
@@ -0,0 +1,156 @@
+ import json
+ from pathlib import Path
+ from datetime import datetime
+ import hashlib
+
+ class QAGenerator:
+     def __init__(self):
+         # Fix paths to stay within aa_book directory
+         self.base_dir = Path(__file__).parent  # current directory (aa_book)
+         self.output_dir = self.base_dir / 'processed_data'
+         self.qa_dir = self.output_dir / 'qa_pairs'
+         self.raw_dir = self.output_dir / 'raw_extractions'
+         print(f"Looking for raw extractions in: {self.raw_dir}")
+         self.qa_dir.mkdir(parents=True, exist_ok=True)
+         self.manifest = []
+
+     def add_to_manifest(self, input_file, output_file, process_type, metadata):
+         """Track transformations in manifest"""
+         manifest_entry = {
+             'timestamp': datetime.now().isoformat(),
+             'input_file': str(input_file),
+             'output_file': str(output_file),
+             'process_type': process_type,
+             'metadata': metadata
+         }
+         self.manifest.append(manifest_entry)
+
+     def generate_qa_pairs(self, text, source_info):
+         """Generate Q&A pairs from text"""
+         qa_pairs = []
+
+         # Split into sections (chapters, paragraphs)
+         sections = text.split('\n\n')
+
+         for i, section in enumerate(sections):
+             if len(section.strip()) < 100:  # Skip short sections
+                 continue
+
+             # Generate different types of questions
+             qa_pairs.extend([
+                 {
+                     'question': f"What are the main points discussed in this section of {source_info['title']}?",
+                     'answer': section.strip(),
+                     'source': source_info,
+                     'section_index': i,
+                     'qa_type': 'main_points',
+                     'timestamp': datetime.now().isoformat()
+                 },
+                 {
+                     'question': f"Can you summarize the key concepts from this passage in {source_info['title']}?",
+                     'answer': section.strip(),
+                     'source': source_info,
+                     'section_index': i,
+                     'qa_type': 'summary',
+                     'timestamp': datetime.now().isoformat()
+                 }
+             ])
+
+             # Add specific AA-related questions if relevant keywords are found
+             if any(word in section.lower() for word in ['step', 'tradition', 'recovery', 'sobriety']):
+                 qa_pairs.append({
+                     'question': f"What recovery principles or concepts are discussed in this section of {source_info['title']}?",
+                     'answer': section.strip(),
+                     'source': source_info,
+                     'section_index': i,
+                     'qa_type': 'aa_specific',
+                     'timestamp': datetime.now().isoformat()
+                 })
+
+         return qa_pairs
+
+     def process_all_sources(self):
+         """Process all extracted texts into QA pairs"""
+         # Update path to look in the correct location
+         raw_dir = self.output_dir / 'raw_extractions'
+
+         if not raw_dir.exists():
+             raise FileNotFoundError(f"Directory not found: {raw_dir}. Please run extract_pdfs.py first.")
+
+         all_qa_pairs = []
+         sources_processed = []
+
+         for raw_file in raw_dir.glob('*_raw.json'):
+             print(f"\nProcessing {raw_file.name}...")
+
+             with open(raw_file, 'r', encoding='utf-8') as f:
+                 raw_data = json.load(f)
+
+             # Create source info
+             source_info = {
+                 'title': raw_data['filename'],
+                 'extraction_date': raw_data['extraction_date'],
+                 'total_pages': raw_data['total_pages']
+             }
+
+             # Combine all page text
+             full_text = ' '.join(
+                 page['text'] for page in raw_data['pages']
+                 if 'text' in page
+             )
+
+             # Generate QA pairs
+             qa_pairs = self.generate_qa_pairs(full_text, source_info)
+
+             # Save source-specific QA pairs
+             source_output = self.qa_dir / f"{raw_file.stem.replace('_raw', '')}_qa.jsonl"
+             with open(source_output, 'w', encoding='utf-8') as f:
+                 for pair in qa_pairs:
+                     f.write(json.dumps(pair) + '\n')
+
+             # Add to manifest
+             self.add_to_manifest(
+                 raw_file,
+                 source_output,
+                 'qa_generation',
+                 {
+                     'pairs_generated': len(qa_pairs),
+                     'source': source_info['title']
+                 }
+             )
+
+             all_qa_pairs.extend(qa_pairs)
+             sources_processed.append(source_info)
+
+             print(f"Generated {len(qa_pairs)} Q&A pairs")
+
+         # Save combined QA pairs
+         combined_output = self.qa_dir / 'combined_qa.jsonl'
+         with open(combined_output, 'w', encoding='utf-8') as f:
+             # Write metadata first
+             metadata = {
+                 'timestamp': datetime.now().isoformat(),
+                 'total_pairs': len(all_qa_pairs),
+                 'sources': sources_processed
+             }
+             f.write(json.dumps(metadata) + '\n')
+
+             # Write all QA pairs
+             for pair in all_qa_pairs:
+                 f.write(json.dumps(pair) + '\n')
+
+         # Save QA generation manifest
+         manifest_file = self.qa_dir / 'qa_generation_manifest.json'
+         with open(manifest_file, 'w', encoding='utf-8') as f:
+             json.dump(self.manifest, f, indent=2)
+
+         print("\nQ&A Generation Summary:")
+         print(f"Total sources processed: {len(sources_processed)}")
+         print(f"Total Q&A pairs generated: {len(all_qa_pairs)}")
+         print(f"Individual source files saved in: {self.qa_dir}")
+         print(f"Combined Q&A pairs saved as: {combined_output}")
+         print(f"Provenance data saved as: {manifest_file}")
+
+ if __name__ == "__main__":
+     generator = QAGenerator()
+     generator.process_all_sources()
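
For reference, below is a minimal, hypothetical smoke test for the uploaded script. It is a sketch and not part of the commit: it assumes the snippet sits in the same aa_book directory as qanda_gen_script.py, and it infers the raw-extraction layout (filename, extraction_date, total_pages, and a pages list of dicts with a text key) from the fields the script reads; the sample file name and page text are placeholders.

# Hypothetical usage sketch, not part of the commit. Assumes this file lives next to
# qanda_gen_script.py and that extract_pdfs.py writes JSON with the fields used below.
import json
from datetime import datetime
from pathlib import Path

from qanda_gen_script import QAGenerator

raw_dir = Path(__file__).parent / 'processed_data' / 'raw_extractions'
raw_dir.mkdir(parents=True, exist_ok=True)

sample = {
    'filename': 'sample_book.pdf',  # placeholder; becomes source_info['title']
    'extraction_date': datetime.now().isoformat(),
    'total_pages': 1,
    # Single placeholder page; the text must exceed 100 characters or generate_qa_pairs skips it,
    # and it contains 'recovery'/'sobriety' so the aa_specific question is also produced.
    'pages': [{'text': 'Recovery and sobriety are discussed at length in this placeholder passage. ' * 3}],
}
(raw_dir / 'sample_book_raw.json').write_text(json.dumps(sample), encoding='utf-8')

# Writes qa_pairs/sample_book_qa.jsonl, qa_pairs/combined_qa.jsonl,
# and qa_pairs/qa_generation_manifest.json under processed_data/.
QAGenerator().process_all_sources()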