Upload qanda_gen_script.py with huggingface_hub
Browse files- qanda_gen_script.py +156 -0
qanda_gen_script.py
ADDED
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
from pathlib import Path
|
3 |
+
from datetime import datetime
|
4 |
+
import hashlib
|
5 |
+
|
6 |
+
class QAGenerator:
    """Generate question/answer training pairs from raw PDF text extractions.

    Reads ``*_raw.json`` files produced by ``extract_pdfs.py`` from
    ``processed_data/raw_extractions`` and writes per-source and combined
    JSONL Q&A files plus a provenance manifest into
    ``processed_data/qa_pairs``.
    """

    # Sections shorter than this (after stripping) are skipped entirely.
    MIN_SECTION_LEN = 100

    # Keywords whose presence in a section triggers an extra AA-specific question.
    AA_KEYWORDS = ('step', 'tradition', 'recovery', 'sobriety')

    def __init__(self):
        # Fix paths to stay within aa_book directory
        self.base_dir = Path(__file__).parent  # current directory (aa_book)
        self.output_dir = self.base_dir / 'processed_data'
        self.qa_dir = self.output_dir / 'qa_pairs'
        self.raw_dir = self.output_dir / 'raw_extractions'
        print(f"Looking for raw extractions in: {self.raw_dir}")
        self.qa_dir.mkdir(parents=True, exist_ok=True)
        # Provenance log: one entry per input -> output transformation.
        self.manifest = []

    def add_to_manifest(self, input_file, output_file, process_type, metadata):
        """Record one input -> output transformation in the provenance manifest.

        Args:
            input_file: path of the file that was read.
            output_file: path of the file that was written.
            process_type: short label for the transformation (e.g. 'qa_generation').
            metadata: free-form dict with extra details about the step.
        """
        self.manifest.append({
            'timestamp': datetime.now().isoformat(),
            'input_file': str(input_file),
            'output_file': str(output_file),
            'process_type': process_type,
            'metadata': metadata,
        })

    def generate_qa_pairs(self, text, source_info):
        """Generate Q&A pairs from *text*.

        Splits *text* on blank lines into sections, skips sections shorter
        than ``MIN_SECTION_LEN``, and emits two generic pairs per section
        plus one extra pair when the section mentions any ``AA_KEYWORDS``.

        Args:
            text: full document text.
            source_info: dict describing the source; must contain 'title'.

        Returns:
            List of Q&A pair dicts (question, answer, source, section_index,
            qa_type, timestamp).
        """
        qa_pairs = []
        title = source_info['title']

        for i, section in enumerate(text.split('\n\n')):
            body = section.strip()
            if len(body) < self.MIN_SECTION_LEN:  # Skip short sections
                continue

            # Stamp once per section so all pairs for it share one timestamp.
            stamp = datetime.now().isoformat()
            base = {
                'answer': body,
                'source': source_info,
                'section_index': i,
                'timestamp': stamp,
            }

            # Generate different types of questions.
            qa_pairs.append({
                'question': f"What are the main points discussed in this section of {title}?",
                'qa_type': 'main_points',
                **base,
            })
            qa_pairs.append({
                'question': f"Can you summarize the key concepts from this passage in {title}?",
                'qa_type': 'summary',
                **base,
            })

            # Add specific AA-related questions if relevant keywords are found.
            if any(word in body.lower() for word in self.AA_KEYWORDS):
                qa_pairs.append({
                    'question': f"What recovery principles or concepts are discussed in this section of {title}?",
                    'qa_type': 'aa_specific',
                    **base,
                })

        return qa_pairs

    @staticmethod
    def _write_jsonl(path, records, header=None):
        """Write *records* (dicts) to *path* as JSON Lines, one object per line.

        If *header* is given, it is written as the first line (used for the
        combined file's leading metadata record).
        """
        with open(path, 'w', encoding='utf-8') as f:
            if header is not None:
                f.write(json.dumps(header) + '\n')
            for record in records:
                f.write(json.dumps(record) + '\n')

    def process_all_sources(self):
        """Process all extracted texts into QA pairs.

        Reads every ``*_raw.json`` in the raw-extractions directory, writes
        one ``*_qa.jsonl`` per source, a combined JSONL (metadata record
        first), and a provenance manifest.

        Raises:
            FileNotFoundError: if the raw-extractions directory is missing.
        """
        # Reuse the path computed in __init__ instead of rebuilding it here
        # (the original duplicated the 'raw_extractions' literal).
        if not self.raw_dir.exists():
            raise FileNotFoundError(
                f"Directory not found: {self.raw_dir}. Please run extract_pdfs.py first."
            )

        all_qa_pairs = []
        sources_processed = []

        for raw_file in self.raw_dir.glob('*_raw.json'):
            print(f"\nProcessing {raw_file.name}...")

            with open(raw_file, 'r', encoding='utf-8') as f:
                raw_data = json.load(f)

            source_info = {
                'title': raw_data['filename'],
                'extraction_date': raw_data['extraction_date'],
                'total_pages': raw_data['total_pages'],
            }

            # Concatenate the text of every page that has one.
            full_text = ' '.join(
                page['text'] for page in raw_data['pages'] if 'text' in page
            )

            qa_pairs = self.generate_qa_pairs(full_text, source_info)

            # Per-source JSONL, named after the raw file minus its '_raw' suffix.
            source_output = self.qa_dir / f"{raw_file.stem.replace('_raw', '')}_qa.jsonl"
            self._write_jsonl(source_output, qa_pairs)

            self.add_to_manifest(
                raw_file,
                source_output,
                'qa_generation',
                {
                    'pairs_generated': len(qa_pairs),
                    'source': source_info['title'],
                },
            )

            all_qa_pairs.extend(qa_pairs)
            sources_processed.append(source_info)
            print(f"Generated {len(qa_pairs)} Q&A pairs")

        # Combined JSONL: a metadata record first, then every pair.
        combined_output = self.qa_dir / 'combined_qa.jsonl'
        self._write_jsonl(
            combined_output,
            all_qa_pairs,
            header={
                'timestamp': datetime.now().isoformat(),
                'total_pairs': len(all_qa_pairs),
                'sources': sources_processed,
            },
        )

        # Save QA generation manifest (provenance for every transformation).
        manifest_file = self.qa_dir / 'qa_generation_manifest.json'
        with open(manifest_file, 'w', encoding='utf-8') as f:
            json.dump(self.manifest, f, indent=2)

        print("\nQ&A Generation Summary:")
        print(f"Total sources processed: {len(sources_processed)}")
        print(f"Total Q&A pairs generated: {len(all_qa_pairs)}")
        print(f"Individual source files saved in: {self.qa_dir}")
        print(f"Combined Q&A pairs saved as: {combined_output}")
        print(f"Provenance data saved as: {manifest_file}")
if __name__ == "__main__":
    # Script entry point: build the generator and run the full pipeline.
    qa_generator = QAGenerator()
    qa_generator.process_all_sources()
|