File size: 9,288 Bytes
87ce387
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
import json

# Input and output files (relative paths, resolved against the current working directory)
input_file: str = "cleaned_big_book.jsonl"  # read line by line by the main block; each line is parsed as one JSON object
output_file: str = "qa_dataset.jsonl"  # written by the main block, one JSON-encoded Q&A record per line

def create_qa_pair(prompt, response, source_info, location="chapter_overview", max_length=300):
    """
    Build one Q&A record: a templated prompt, a length-capped answer and metadata.

    Args:
        prompt: Question text inserted into the "### Question: ... ### Answer:" template.
        response: Answer text. A falsy value (None, "") produces an empty answer.
        source_info: Mapping that must contain a "chapter" key.
        location: Stored verbatim under metadata["section"].
        max_length: Maximum number of characters kept from ``response``.

    Returns:
        A dict with the keys "prompt", "response" and "metadata".

    Raises:
        KeyError: If ``source_info`` has no "chapter" entry.
    """
    # Plain character cut (it may end mid-word); falsy input collapses to "".
    if response:
        answer = response[:max_length]
    else:
        answer = ""

    question = f"### Question: {prompt}\n\n### Answer:"

    # Apart from chapter and section, the metadata is fixed for every record.
    metadata = {
        "book": "Alcoholics Anonymous",
        "chapter": source_info["chapter"],
        "section": location,
        "edition": "First 164 pages",
        "type": "primary_text",
    }

    record = {"prompt": question, "response": answer, "metadata": metadata}
    return record

def generate_qa_pairs(chapter, text, source_info):
    """Build the list of Q&A records for one chapter.

    Every general and chapter-specific question is paired with the same
    ``text`` through ``create_qa_pair`` using its default section and length
    cap. If the chapter has at least three paragraphs longer than 100
    characters (after stripping), one extra question is added for each of the
    first three, answered with that paragraph.

    Args:
        chapter: Chapter title; inserted into the question wording and
            compared against the chapter-specific groups below.
        text: Chapter text; paragraphs are separated by blank lines ("\\n\\n").
        source_info: Passed through unchanged to ``create_qa_pair``.

    Returns:
        A list of Q&A dicts in a fixed order: general questions,
        chapter-specific questions, then paragraph questions.
    """
    # Paragraphs worth a dedicated question: more than 100 non-padding characters.
    long_paragraphs = [
        block for block in text.split('\n\n') if len(block.strip()) > 100
    ]

    # Questions asked of every chapter, listed in output order.
    general_templates = (
        # 1. Basic questions
        "What is {chapter} about?",
        "What are the main principles discussed in {chapter}?",
        "What are the key takeaways from {chapter}?",
        # 2. Recovery-specific questions
        "How does {chapter} help someone stay sober?",
        "What solutions to alcoholism are presented in {chapter}?",
        "What role does surrender play in {chapter}?",
        # 3. Emotional/mental questions
        "What fears are addressed in {chapter}?",
        "How does {chapter} deal with resentment?",
        "What mental shifts are suggested in {chapter}?",
        # 4. Spiritual growth questions
        "How does {chapter} address spiritual growth?",
        "What spiritual principles are discussed in {chapter}?",
        "How does {chapter} help develop faith?",
        # 5. Practical action questions
        "What specific actions are recommended in {chapter}?",
        "What daily practices are suggested in {chapter}?",
        "What habits need changing according to {chapter}?",
        # 6. Fellowship questions
        "How does {chapter} discuss helping others?",
        "What role does sponsorship play in {chapter}?",
        "How does {chapter} address working with newcomers?",
        # 7. Personal experience questions
        "What personal experiences are shared in {chapter}?",
        "What transformations are described in {chapter}?",
        "What struggles and victories are mentioned in {chapter}?",
        # 8. Relationship questions
        "How does {chapter} address family relationships?",
        "What guidance about relationships is given in {chapter}?",
        "How does {chapter} discuss making amends?",
        # 9. Common obstacles questions
        "What obstacles to recovery are discussed in {chapter}?",
        "How does {chapter} address denial?",
        "What solutions to common problems are offered in {chapter}?",
    )

    # 10. Extra questions for particular chapters; only the first matching
    # group applies.
    chapter_specific = (
        (
            ("THE DOCTOR'S OPINION",),
            (
                "What medical perspective is shared in {chapter}?",
                "How does the doctor describe alcoholism?",
                "What physical aspects of alcoholism are discussed?",
            ),
        ),
        (
            ("BILL'S STORY", "A VISION FOR YOU"),
            (
                "What was the turning point in this story?",
                "How did spiritual experience play a role?",
                "What was the progression of alcoholism described?",
            ),
        ),
        (
            ("HOW IT WORKS", "INTO ACTION", "WORKING WITH OTHERS"),
            (
                "What specific steps are outlined in {chapter}?",
                "How should one practice these principles?",
                "What actions are essential according to {chapter}?",
            ),
        ),
    )

    templates = list(general_templates)
    for titles, extra_templates in chapter_specific:
        if chapter in titles:
            templates.extend(extra_templates)
            break

    qa_pairs = [
        create_qa_pair(template.format(chapter=chapter), text, source_info)
        for template in templates
    ]

    # 11. Paragraph-specific questions, only when three or more long
    # paragraphs exist.
    if len(long_paragraphs) >= 3:
        for number, para in enumerate(long_paragraphs[:3], start=1):
            question = f"What key point is made in paragraph {number} of {chapter}?"
            qa_pairs.append(create_qa_pair(question, para, source_info))

    return qa_pairs

def extract_key_concepts(text):
    """Return the AA vocabulary terms that occur in ``text``.

    Matching is a case-insensitive substring test, so a term also matches
    inside a longer word (for example "steps" inside "footsteps").

    Args:
        text: Text to scan. ``None`` or an empty string yields an empty list.

    Returns:
        The matching terms, lowercase, in the fixed order of the vocabulary
        listed below.
    """
    # All entries are already lowercase, so they can be compared directly
    # against the lowered text.
    concepts = (
        "recovery", "sobriety", "alcoholism", "spiritual", "fellowship",
        "steps", "program", "healing", "hope", "solution", "experience",
        "strength", "faith", "willingness", "honesty", "humility",
        "surrender", "acceptance", "service", "meditation", "prayer",
        "amends", "inventory", "powerlessness", "unity", "sponsorship",
    )
    if not text:
        return []

    # Lower the text once rather than once per concept.
    lowered_text = text.lower()
    return [concept for concept in concepts if concept in lowered_text]

def find_relevant_excerpt(text, concept, max_length=300):
    """Return a short excerpt of ``text`` that mentions ``concept``.

    The text is split on "." and the first piece containing ``concept``
    (case-insensitive substring match) is returned with surrounding
    whitespace removed, cut to ``max_length`` characters. When no piece
    matches, the first ``max_length`` characters of ``text`` are returned.

    Args:
        text: Text to search. ``None`` or an empty string yields "".
        concept: Term to look for.
        max_length: Maximum number of characters in the returned excerpt.

    Returns:
        The excerpt as a string (without the terminating period).
    """
    if not text:
        return ""

    needle = concept.lower()  # lowered once, not once per sentence
    for sentence in text.split('.'):
        if needle in sentence.lower():
            # Splitting on "." leaves the space/newline that followed the
            # previous period; strip it so it does not use up the length budget.
            return sentence.strip()[:max_length]
    return text[:max_length]

def clean_chapter_name(chapter):
    """Normalise a raw chapter label to a chapter title.

    Surrounding periods and spaces are removed. For labels of the form
    "Chapter X ..." only the token X is kept, without any "." or ":"
    attached to it. The result is then looked up in the label-to-title table;
    labels not in the table are returned as cleaned.

    Args:
        chapter: Raw chapter label (string).

    Returns:
        The mapped title, or the cleaned label when no mapping exists.
    """
    # Remove leading/trailing periods and spaces.
    chapter = chapter.strip(". ")

    # Extract the number from the "Chapter X" format. Splitting on any run of
    # whitespace keeps "Chapter  5" from yielding an empty token, and the
    # punctuation strip lets "Chapter 5: ..." / "Chapter 5. ..." map too.
    if chapter.startswith("Chapter "):
        tokens = chapter.split()
        if len(tokens) > 1:
            chapter = tokens[1].strip(".:")

    # Label -> title table.
    # NOTE(review): "000", "32" and "1935" are presumably labels produced by
    # the upstream cleaning step for front matter - confirm against the input.
    chapter_map = {
        "1": "BILL'S STORY",
        "2": "THERE IS A SOLUTION",
        "3": "MORE ABOUT ALCOHOLISM",
        "4": "WE AGNOSTICS",
        "5": "HOW IT WORKS",
        "6": "INTO ACTION",
        "7": "WORKING WITH OTHERS",
        "8": "TO WIVES",
        "9": "THE FAMILY AFTERWARD",
        "10": "TO EMPLOYERS",
        "11": "A VISION FOR YOU",
        "12": "A WAY OUT",
        "000": "THE DOCTOR'S OPINION",
        "32": "FOREWORD",
        "1935": "HISTORICAL NOTE",
    }

    return chapter_map.get(chapter, chapter)

# Main processing
if __name__ == "__main__":
    # Script entry point: read chapters from the input JSONL, generate Q&A
    # pairs once per distinct cleaned chapter name, and write them to the
    # output JSONL. Any failure is reported on stdout instead of raising.
    qa_data = []
    processed_chapters = set()  # cleaned chapter names already handled

    try:
        print(f"Reading from {input_file}...")
        # Explicit UTF-8 so decoding does not depend on the platform's
        # default locale encoding.
        with open(input_file, "r", encoding="utf-8") as f:
            for line_number, line in enumerate(f, 1):
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue

                try:
                    entry = json.loads(line)
                except json.JSONDecodeError as err:
                    # Re-raise with the line number so the bad record can be
                    # located; the handler below prints the message.
                    raise ValueError(
                        f"Invalid JSON on line {line_number} of '{input_file}': {err}"
                    ) from err

                # `or` also covers an explicit null / empty value in the record.
                original_chapter = entry.get("chapter") or "Unnamed Chapter"
                chapter = clean_chapter_name(original_chapter)

                # Only the first record seen for each cleaned chapter name is used.
                if chapter in processed_chapters:
                    print(f"Skipping duplicate chapter: {original_chapter} -> {chapter}")
                    continue

                processed_chapters.add(chapter)
                text = entry.get("text") or ""

                print(f"Processing chapter {len(processed_chapters)}: {original_chapter} -> {chapter}")

                source_info = {
                    "chapter": chapter,
                    "text_type": "chapter_content"
                }

                qa_pairs = generate_qa_pairs(chapter, text, source_info)
                qa_data.extend(qa_pairs)
                print(f"Generated {len(qa_pairs)} Q&A pairs for {chapter}")

        print(f"\nWriting {len(qa_data)} Q&A pairs to {output_file}")
        with open(output_file, "w", encoding="utf-8") as f:
            for qa in qa_data:
                f.write(json.dumps(qa) + "\n")

        print("\nFinal Statistics:")
        print(f"Total unique chapters processed: {len(processed_chapters)}")
        print(f"Total Q&A pairs generated: {len(qa_data)}")

    except FileNotFoundError:
        print(f"Error: Could not find input file '{input_file}'")
    except Exception as e:
        # Top-level boundary of the script: report the failure and exit normally.
        print(f"Error: {str(e)}")