import json
# Input and output files
input_file = "cleaned_big_book.jsonl"
output_file = "qa_dataset.jsonl"
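
# A minimal sketch of the record shapes this script assumes (the example values
# below are illustrative, not taken from the real files):
#   input  line: {"chapter": "Chapter 5", "text": "Rarely have we seen..."}
#   output line: one prompt/response/metadata object as built by create_qa_pair below
# Only the "chapter" and "text" keys are read from each input record.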

def create_qa_pair(prompt, response, source_info, location="chapter_overview", max_length=300):
    """
    Creates a formatted Q&A pair; the response is truncated to max_length characters.
    """
    # Truncate the response to max_length characters
    cleaned_response = response[:max_length] if response else ""
    # Create a single clean object
    return {
        "prompt": f"### Question: {prompt}\n\n### Answer:",
        "response": cleaned_response,
        "metadata": {
            "book": "Alcoholics Anonymous",
            "chapter": source_info["chapter"],
            "section": location,
            "edition": "First 164 pages",
            "type": "primary_text"
        }
    }
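
# For reference, a record returned by create_qa_pair looks roughly like this
# (illustrative values):
# {
#     "prompt": "### Question: What is HOW IT WORKS about?\n\n### Answer:",
#     "response": "<first 300 characters of the chapter text>",
#     "metadata": {"book": "Alcoholics Anonymous", "chapter": "HOW IT WORKS",
#                  "section": "chapter_overview", "edition": "First 164 pages",
#                  "type": "primary_text"}
# }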

def generate_qa_pairs(chapter, text, source_info):
    """Generates comprehensive Q&A pairs"""
    qa_pairs = []
    paragraphs = [p for p in text.split('\n\n') if len(p.strip()) > 100]
    # 1. Basic Questions (for all chapters)
    qa_pairs.extend([
        create_qa_pair(f"What is {chapter} about?", text, source_info),
        create_qa_pair(f"What are the main principles discussed in {chapter}?", text, source_info),
        create_qa_pair(f"What are the key takeaways from {chapter}?", text, source_info)
    ])
    # 2. Recovery-Specific Questions
    qa_pairs.extend([
        create_qa_pair(f"How does {chapter} help someone stay sober?", text, source_info),
        create_qa_pair(f"What solutions to alcoholism are presented in {chapter}?", text, source_info),
        create_qa_pair(f"What role does surrender play in {chapter}?", text, source_info)
    ])
    # 3. Emotional/Mental Questions
    qa_pairs.extend([
        create_qa_pair(f"What fears are addressed in {chapter}?", text, source_info),
        create_qa_pair(f"How does {chapter} deal with resentment?", text, source_info),
        create_qa_pair(f"What mental shifts are suggested in {chapter}?", text, source_info)
    ])
    # 4. Spiritual Growth Questions
    qa_pairs.extend([
        create_qa_pair(f"How does {chapter} address spiritual growth?", text, source_info),
        create_qa_pair(f"What spiritual principles are discussed in {chapter}?", text, source_info),
        create_qa_pair(f"How does {chapter} help develop faith?", text, source_info)
    ])
    # 5. Practical Action Questions
    qa_pairs.extend([
        create_qa_pair(f"What specific actions are recommended in {chapter}?", text, source_info),
        create_qa_pair(f"What daily practices are suggested in {chapter}?", text, source_info),
        create_qa_pair(f"What habits need changing according to {chapter}?", text, source_info)
    ])
    # 6. Fellowship Questions
    qa_pairs.extend([
        create_qa_pair(f"How does {chapter} discuss helping others?", text, source_info),
        create_qa_pair(f"What role does sponsorship play in {chapter}?", text, source_info),
        create_qa_pair(f"How does {chapter} address working with newcomers?", text, source_info)
    ])
    # 7. Personal Experience Questions
    qa_pairs.extend([
        create_qa_pair(f"What personal experiences are shared in {chapter}?", text, source_info),
        create_qa_pair(f"What transformations are described in {chapter}?", text, source_info),
        create_qa_pair(f"What struggles and victories are mentioned in {chapter}?", text, source_info)
    ])
    # 8. Relationship Questions
    qa_pairs.extend([
        create_qa_pair(f"How does {chapter} address family relationships?", text, source_info),
        create_qa_pair(f"What guidance about relationships is given in {chapter}?", text, source_info),
        create_qa_pair(f"How does {chapter} discuss making amends?", text, source_info)
    ])
    # 9. Common Obstacles Questions
    qa_pairs.extend([
        create_qa_pair(f"What obstacles to recovery are discussed in {chapter}?", text, source_info),
        create_qa_pair(f"How does {chapter} address denial?", text, source_info),
        create_qa_pair(f"What solutions to common problems are offered in {chapter}?", text, source_info)
    ])
    # 10. Chapter-Specific Questions
    if chapter == "THE DOCTOR'S OPINION":
        qa_pairs.extend([
            create_qa_pair(f"What medical perspective is shared in {chapter}?", text, source_info),
            create_qa_pair("How does the doctor describe alcoholism?", text, source_info),
            create_qa_pair("What physical aspects of alcoholism are discussed?", text, source_info)
        ])
    elif chapter in ["BILL'S STORY", "A VISION FOR YOU"]:
        qa_pairs.extend([
            create_qa_pair("What was the turning point in this story?", text, source_info),
            create_qa_pair("How did spiritual experience play a role?", text, source_info),
            create_qa_pair("What was the progression of alcoholism described?", text, source_info)
        ])
    elif chapter in ["HOW IT WORKS", "INTO ACTION", "WORKING WITH OTHERS"]:
        qa_pairs.extend([
            create_qa_pair(f"What specific steps are outlined in {chapter}?", text, source_info),
            create_qa_pair("How should one practice these principles?", text, source_info),
            create_qa_pair(f"What actions are essential according to {chapter}?", text, source_info)
        ])
    # 11. Add paragraph-specific questions for substance
    if len(paragraphs) > 2:
        for i, para in enumerate(paragraphs[:3]):
            qa_pairs.append(
                create_qa_pair(f"What key point is made in paragraph {i+1} of {chapter}?",
                               para, source_info)
            )
    return qa_pairs
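
# Per chapter this yields 27 general pairs (categories 1-9), plus 3 chapter-specific
# pairs for the chapters singled out above, plus up to 3 paragraph-level pairs.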

def extract_key_concepts(text):
    """Extract important AA concepts from text"""
    concepts = [
        "recovery", "sobriety", "alcoholism", "spiritual", "fellowship",
        "steps", "program", "healing", "hope", "solution", "experience",
        "strength", "faith", "willingness", "honesty", "humility",
        "surrender", "acceptance", "service", "meditation", "prayer",
        "amends", "inventory", "powerlessness", "unity", "sponsorship"
    ]
    return [c for c in concepts if c.lower() in text.lower()]
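
# Note: extract_key_concepts is not called by the main block below; it is a helper
# for downstream filtering. Example (illustrative):
#   extract_key_concepts("Faith and honesty bring hope.") -> ["hope", "faith", "honesty"]
#   (results follow the order of the concepts list, not their order in the text)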

def find_relevant_excerpt(text, concept, max_length=300):
    """Find relevant text portion for a concept."""
    sentences = text.split('.')
    for sentence in sentences:
        if concept.lower() in sentence.lower():
            return sentence[:max_length]
    return text[:max_length]
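
# Note: find_relevant_excerpt is also unused by the main block; it returns the first
# sentence that mentions the concept (the split drops the trailing period). Example:
#   find_relevant_excerpt("We admitted we were powerless. We kept going.", "powerless")
#   -> "We admitted we were powerless"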

def clean_chapter_name(chapter):
    """Clean up chapter names"""
    # First remove any trailing periods and spaces
    chapter = chapter.strip(". ")
    # Extract number if it's in "Chapter X" format
    if chapter.startswith("Chapter "):
        chapter = chapter.split(" ")[1]
    # Map numbers to proper names (using actual Big Book chapter names)
    chapter_map = {
        "1": "BILL'S STORY",
        "2": "THERE IS A SOLUTION",
        "3": "MORE ABOUT ALCOHOLISM",
        "4": "WE AGNOSTICS",
        "5": "HOW IT WORKS",
        "6": "INTO ACTION",
        "7": "WORKING WITH OTHERS",
        "8": "TO WIVES",
        "9": "THE FAMILY AFTERWARD",
        "10": "TO EMPLOYERS",
        "11": "A VISION FOR YOU",
        "12": "A WAY OUT",
        "000": "THE DOCTOR'S OPINION",
        "32": "FOREWORD",
        "1935": "HISTORICAL NOTE"
    }
    return chapter_map.get(chapter, chapter)
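
# Examples of the mapping (illustrative inputs):
#   clean_chapter_name("Chapter 5") -> "HOW IT WORKS"
#   clean_chapter_name("000")       -> "THE DOCTOR'S OPINION"
#   clean_chapter_name("FOREWORD.") -> "FOREWORD"  (unmapped names pass through, minus trailing ". ")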

# Main processing
if __name__ == "__main__":
    qa_data = []
    processed_chapters = set()  # Keep track of chapters we've already processed
    try:
        print(f"Reading from {input_file}...")
        with open(input_file, "r") as f:
            for line in f:
                line = line.strip()
                if not line:  # Skip blank lines in the JSONL file
                    continue
                entry = json.loads(line)
                original_chapter = entry.get("chapter", "Unnamed Chapter")
                chapter = clean_chapter_name(original_chapter)
                # Skip if we've already processed this chapter
                if chapter in processed_chapters:
                    print(f"Skipping duplicate chapter: {original_chapter} -> {chapter}")
                    continue
                processed_chapters.add(chapter)
                text = entry.get("text", "")
                print(f"Processing chapter {len(processed_chapters)}: {original_chapter} -> {chapter}")
                source_info = {
                    "chapter": chapter,
                    "text_type": "chapter_content"
                }
                qa_pairs = generate_qa_pairs(chapter, text, source_info)
                qa_data.extend(qa_pairs)
                print(f"Generated {len(qa_pairs)} Q&A pairs for {chapter}")
        print(f"\nWriting {len(qa_data)} Q&A pairs to {output_file}")
        with open(output_file, "w") as f:
            for qa in qa_data:
                f.write(json.dumps(qa) + "\n")
        print("\nFinal Statistics:")
        print(f"Total unique chapters processed: {len(processed_chapters)}")
        print(f"Total Q&A pairs generated: {len(qa_data)}")
    except FileNotFoundError:
        print(f"Error: Could not find input file '{input_file}'")
    except Exception as e:
        print(f"Error: {e}")
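
# To run (the filename is an assumption; use whatever name this script is saved under):
#   python build_qa_dataset.py
# The script expects cleaned_big_book.jsonl in the working directory and writes
# qa_dataset.jsonl to the same directory.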