import json
# Input and output files
input_file = "cleaned_big_book.jsonl"
output_file = "qa_dataset.jsonl"
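
# A minimal sketch of the record shapes this script assumes (the example values
# below are illustrative, not taken from the real files):
#   input  line: {"chapter": "Chapter 5", "text": "Rarely have we seen..."}
#   output line: one prompt/response/metadata object as built by create_qa_pair below
# Only the "chapter" and "text" keys are read from each input record.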

def create_qa_pair(prompt, response, source_info, location="chapter_overview", max_length=300):
    """
    Creates a formatted Q&A pair; the response is truncated to max_length characters.
    """
    # Truncate the response to max_length characters
    cleaned_response = response[:max_length] if response else ""
    # Create a single clean object
    return {
        "prompt": f"### Question: {prompt}\n\n### Answer:",
        "response": cleaned_response,
        "metadata": {
            "book": "Alcoholics Anonymous",
            "chapter": source_info["chapter"],
            "section": location,
            "edition": "First 164 pages",
            "type": "primary_text"
        }
    }
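
# For reference, a record returned by create_qa_pair looks roughly like this
# (illustrative values):
# {
#     "prompt": "### Question: What is HOW IT WORKS about?\n\n### Answer:",
#     "response": "<first 300 characters of the chapter text>",
#     "metadata": {"book": "Alcoholics Anonymous", "chapter": "HOW IT WORKS",
#                  "section": "chapter_overview", "edition": "First 164 pages",
#                  "type": "primary_text"}
# }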

def generate_qa_pairs(chapter, text, source_info):
    """Generates comprehensive Q&A pairs"""
    qa_pairs = []
    paragraphs = [p for p in text.split('\n\n') if len(p.strip()) > 100]
    # 1. Basic Questions (for all chapters)
    qa_pairs.extend([
        create_qa_pair(f"What is {chapter} about?", text, source_info),
        create_qa_pair(f"What are the main principles discussed in {chapter}?", text, source_info),
        create_qa_pair(f"What are the key takeaways from {chapter}?", text, source_info)
    ])
    # 2. Recovery-Specific Questions
    qa_pairs.extend([
        create_qa_pair(f"How does {chapter} help someone stay sober?", text, source_info),
        create_qa_pair(f"What solutions to alcoholism are presented in {chapter}?", text, source_info),
        create_qa_pair(f"What role does surrender play in {chapter}?", text, source_info)
    ])
    # 3. Emotional/Mental Questions
    qa_pairs.extend([
        create_qa_pair(f"What fears are addressed in {chapter}?", text, source_info),
        create_qa_pair(f"How does {chapter} deal with resentment?", text, source_info),
        create_qa_pair(f"What mental shifts are suggested in {chapter}?", text, source_info)
    ])
    # 4. Spiritual Growth Questions
    qa_pairs.extend([
        create_qa_pair(f"How does {chapter} address spiritual growth?", text, source_info),
        create_qa_pair(f"What spiritual principles are discussed in {chapter}?", text, source_info),
        create_qa_pair(f"How does {chapter} help develop faith?", text, source_info)
    ])
    # 5. Practical Action Questions
    qa_pairs.extend([
        create_qa_pair(f"What specific actions are recommended in {chapter}?", text, source_info),
        create_qa_pair(f"What daily practices are suggested in {chapter}?", text, source_info),
        create_qa_pair(f"What habits need changing according to {chapter}?", text, source_info)
    ])
    # 6. Fellowship Questions
    qa_pairs.extend([
        create_qa_pair(f"How does {chapter} discuss helping others?", text, source_info),
        create_qa_pair(f"What role does sponsorship play in {chapter}?", text, source_info),
        create_qa_pair(f"How does {chapter} address working with newcomers?", text, source_info)
    ])
    # 7. Personal Experience Questions
    qa_pairs.extend([
        create_qa_pair(f"What personal experiences are shared in {chapter}?", text, source_info),
        create_qa_pair(f"What transformations are described in {chapter}?", text, source_info),
        create_qa_pair(f"What struggles and victories are mentioned in {chapter}?", text, source_info)
    ])
    # 8. Relationship Questions
    qa_pairs.extend([
        create_qa_pair(f"How does {chapter} address family relationships?", text, source_info),
        create_qa_pair(f"What guidance about relationships is given in {chapter}?", text, source_info),
        create_qa_pair(f"How does {chapter} discuss making amends?", text, source_info)
    ])
    # 9. Common Obstacles Questions
    qa_pairs.extend([
        create_qa_pair(f"What obstacles to recovery are discussed in {chapter}?", text, source_info),
        create_qa_pair(f"How does {chapter} address denial?", text, source_info),
        create_qa_pair(f"What solutions to common problems are offered in {chapter}?", text, source_info)
    ])
    # 10. Chapter-Specific Questions
    if chapter == "THE DOCTOR'S OPINION":
        qa_pairs.extend([
            create_qa_pair(f"What medical perspective is shared in {chapter}?", text, source_info),
            create_qa_pair("How does the doctor describe alcoholism?", text, source_info),
            create_qa_pair("What physical aspects of alcoholism are discussed?", text, source_info)
        ])
    elif chapter in ["BILL'S STORY", "A VISION FOR YOU"]:
        qa_pairs.extend([
            create_qa_pair("What was the turning point in this story?", text, source_info),
            create_qa_pair("How did spiritual experience play a role?", text, source_info),
            create_qa_pair("What was the progression of alcoholism described?", text, source_info)
        ])
    elif chapter in ["HOW IT WORKS", "INTO ACTION", "WORKING WITH OTHERS"]:
        qa_pairs.extend([
            create_qa_pair(f"What specific steps are outlined in {chapter}?", text, source_info),
            create_qa_pair("How should one practice these principles?", text, source_info),
            create_qa_pair(f"What actions are essential according to {chapter}?", text, source_info)
        ])
    # 11. Add paragraph-specific questions for substance
    if len(paragraphs) > 2:
        for i, para in enumerate(paragraphs[:3]):
            qa_pairs.append(
                create_qa_pair(f"What key point is made in paragraph {i+1} of {chapter}?",
                               para, source_info)
            )
    return qa_pairs
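
# Per chapter this yields 27 general pairs (categories 1-9), plus 3 chapter-specific
# pairs for the chapters singled out above, plus up to 3 paragraph-level pairs.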

def extract_key_concepts(text):
    """Extract important AA concepts from text"""
    concepts = [
        "recovery", "sobriety", "alcoholism", "spiritual", "fellowship",
        "steps", "program", "healing", "hope", "solution", "experience",
        "strength", "faith", "willingness", "honesty", "humility",
        "surrender", "acceptance", "service", "meditation", "prayer",
        "amends", "inventory", "powerlessness", "unity", "sponsorship"
    ]
    return [c for c in concepts if c.lower() in text.lower()]
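
# Note: extract_key_concepts is not called by the main block below; it is a helper
# for downstream filtering. Example (illustrative):
#   extract_key_concepts("Faith and honesty bring hope.") -> ["hope", "faith", "honesty"]
#   (results follow the order of the concepts list, not their order in the text)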

def find_relevant_excerpt(text, concept, max_length=300):
    """Find relevant text portion for a concept."""
    sentences = text.split('.')
    for sentence in sentences:
        if concept.lower() in sentence.lower():
            return sentence[:max_length]
    return text[:max_length]
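
# Note: find_relevant_excerpt is also unused by the main block; it returns the first
# sentence that mentions the concept (the split drops the trailing period). Example:
#   find_relevant_excerpt("We admitted we were powerless. We kept going.", "powerless")
#   -> "We admitted we were powerless"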

def clean_chapter_name(chapter):
    """Clean up chapter names"""
    # First remove any trailing periods and spaces
    chapter = chapter.strip(". ")
    # Extract number if it's in "Chapter X" format
    if chapter.startswith("Chapter "):
        chapter = chapter.split(" ")[1]
    # Map numbers to proper names (using actual Big Book chapter names)
    chapter_map = {
        "1": "BILL'S STORY",
        "2": "THERE IS A SOLUTION",
        "3": "MORE ABOUT ALCOHOLISM",
        "4": "WE AGNOSTICS",
        "5": "HOW IT WORKS",
        "6": "INTO ACTION",
        "7": "WORKING WITH OTHERS",
        "8": "TO WIVES",
        "9": "THE FAMILY AFTERWARD",
        "10": "TO EMPLOYERS",
        "11": "A VISION FOR YOU",
        "12": "A WAY OUT",
        "000": "THE DOCTOR'S OPINION",
        "32": "FOREWORD",
        "1935": "HISTORICAL NOTE"
    }
    return chapter_map.get(chapter, chapter)
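
# Examples of the mapping (illustrative inputs):
#   clean_chapter_name("Chapter 5") -> "HOW IT WORKS"
#   clean_chapter_name("000")       -> "THE DOCTOR'S OPINION"
#   clean_chapter_name("FOREWORD.") -> "FOREWORD"  (unmapped names pass through, minus trailing ". ")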

# Main processing
if __name__ == "__main__":
    qa_data = []
    processed_chapters = set()  # Keep track of chapters we've already processed
    try:
        print(f"Reading from {input_file}...")
        with open(input_file, "r") as f:
            for line in f:
                line = line.strip()
                if not line:  # Skip blank lines in the JSONL file
                    continue
                entry = json.loads(line)
                original_chapter = entry.get("chapter", "Unnamed Chapter")
                chapter = clean_chapter_name(original_chapter)
                # Skip if we've already processed this chapter
                if chapter in processed_chapters:
                    print(f"Skipping duplicate chapter: {original_chapter} -> {chapter}")
                    continue
                processed_chapters.add(chapter)
                text = entry.get("text", "")
                print(f"Processing chapter {len(processed_chapters)}: {original_chapter} -> {chapter}")
                source_info = {
                    "chapter": chapter,
                    "text_type": "chapter_content"
                }
                qa_pairs = generate_qa_pairs(chapter, text, source_info)
                qa_data.extend(qa_pairs)
                print(f"Generated {len(qa_pairs)} Q&A pairs for {chapter}")
        print(f"\nWriting {len(qa_data)} Q&A pairs to {output_file}")
        with open(output_file, "w") as f:
            for qa in qa_data:
                f.write(json.dumps(qa) + "\n")
        print("\nFinal Statistics:")
        print(f"Total unique chapters processed: {len(processed_chapters)}")
        print(f"Total Q&A pairs generated: {len(qa_data)}")
    except FileNotFoundError:
        print(f"Error: Could not find input file '{input_file}'")
    except Exception as e:
        print(f"Error: {e}")
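
# To run (the filename is an assumption; use whatever name this script is saved under):
#   python build_qa_dataset.py
# The script expects cleaned_big_book.jsonl in the working directory and writes
# qa_dataset.jsonl to the same directory.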