File size: 1,470 Bytes
bd161ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from docx import Document
from typing import List, Tuple
import os

def extract_questions_from_docx(docx_path: str) -> List[str]:
    """Extract question-like paragraphs from a .docx file, in document order.

    A paragraph is kept when its stripped text is non-empty and at least one
    of the following holds:

    * the text ends with ``?``;
    * the text starts with a capital ``Q`` (note: this also matches ordinary
      words that begin with "Q");
    * the paragraph's style name starts with ``List``.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The stripped text of every matching paragraph, without duplicates,
        ordered by first appearance in the document.
    """
    doc = Document(docx_path)
    questions = []
    # Single pass so list-style items stay interleaved with the other
    # questions instead of being appended after all of them.
    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue
        is_question = text.endswith('?') or text.startswith('Q')
        if not is_question:
            # Defensive: a missing style or style name counts as "not a list"
            # rather than raising AttributeError.
            style = para.style
            style_name = (style.name if style is not None else None) or ''
            is_question = style_name.startswith('List')
        if is_question:
            questions.append(text)
    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(questions))

def extract_instructions_from_docx(docx_path: str) -> str:
    """Return the instruction text of a .docx file as one newline-joined string.

    Every paragraph is stripped of surrounding whitespace; empty paragraphs
    and paragraphs whose text ends with ``?`` are left out. No other
    filtering is applied.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The remaining paragraph texts joined with ``\\n`` in document order
        (an empty string when nothing remains).
    """
    stripped_texts = (
        paragraph.text.strip() for paragraph in Document(docx_path).paragraphs
    )
    return '\n'.join(
        line for line in stripped_texts if line and not line.endswith('?')
    )

def find_docx_file_in_folder(folder_path: str) -> str:
    """Return the path of the first usable .docx file in a folder.

    Entries are examined in sorted name order so the result is the same on
    every run (``os.listdir`` makes no ordering guarantee). The extension
    match is case-insensitive. Two kinds of entries are skipped even though
    their name ends in ``.docx``:

    * names starting with ``~$`` (the owner/lock files Word leaves next to
      an open document), and
    * anything that is not a regular file, e.g. a directory.

    Args:
        folder_path: Directory to search (not recursive).

    Returns:
        ``folder_path`` joined with the name of the first matching file.

    Raises:
        FileNotFoundError: If the folder contains no usable .docx file.
    """
    for fname in sorted(os.listdir(folder_path)):
        if fname.startswith('~$') or not fname.lower().endswith('.docx'):
            continue
        candidate = os.path.join(folder_path, fname)
        if os.path.isfile(candidate):
            return candidate
    raise FileNotFoundError(f"No .docx file found in {folder_path}")