|
import os
from collections import Counter

import docx
from docx.document import Document as _Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph

from src.domain.requirements_paragraphs import Requirement_Paragraph
|
|
|
class WordReader:
    """Read a .docx file and flatten its body into ``Requirement_Paragraph`` objects.

    Top-level paragraphs and tables are visited in document order. Every
    non-blank paragraph becomes one ``Requirement_Paragraph``; every non-blank
    table is collapsed into a single pipe-delimited text block and likewise
    becomes one ``Requirement_Paragraph``.
    """

    # Heuristic number of characters assumed to fit on one page; used only by
    # ``estimate_page_number``.
    AVG_CHARS_PER_PAGE = 2000

    def __init__(self, path):
        """Store *path* and eagerly parse the document into ``self.paragraphs``.

        Raises:
            FileNotFoundError: If *path* does not exist.
            ValueError: If the document cannot be read or parsed.
        """
        self.path = path
        self.paragraphs = self.get_paragraphs()

    def iter_block_items(self, parent):
        """Yield the paragraphs and tables directly under *parent*, in order.

        Args:
            parent: A python-docx document or a table cell.

        Yields:
            A ``Paragraph`` or ``Table`` wrapper for each direct child element
            of that kind; any other child element is skipped.

        Raises:
            ValueError: If *parent* is neither a document nor a table cell.
        """
        if isinstance(parent, _Document):
            parent_elm = parent.element.body
        elif isinstance(parent, _Cell):
            parent_elm = parent._tc
        else:
            raise ValueError("Unsupported parent type")

        for child in parent_elm.iterchildren():
            if isinstance(child, CT_P):
                yield Paragraph(child, parent)
            elif isinstance(child, CT_Tbl):
                yield Table(child, parent)

    def get_paragraphs(self):
        """Parse the file at ``self.path`` into a list of ``Requirement_Paragraph``.

        Items receive consecutive ids starting at 0. A paragraph's page is
        estimated from the number of paragraph characters that precede it.

        Returns:
            list: One ``Requirement_Paragraph`` per non-blank paragraph or
            table, in document order.

        Raises:
            FileNotFoundError: If ``self.path`` does not exist.
            ValueError: If anything fails while reading or converting the
                document; the original exception is chained as ``__cause__``.
        """
        if not os.path.exists(self.path):
            raise FileNotFoundError(f"The file {self.path} does not exist.")

        try:
            doc = docx.Document(self.path)
            paragraph_objects = []
            paragraph_id = 0
            page_id = 1
            total_characters = 0

            for block in self.iter_block_items(doc):
                if isinstance(block, Paragraph):
                    paragraph_info = self.extract_paragraph_info(block)
                    if not paragraph_info:
                        continue  # blank paragraph
                    text = paragraph_info['text']
                    style = paragraph_info['style']
                    # The estimate is taken before this paragraph's own
                    # characters are added to the running total.
                    page_id = self.estimate_page_number(total_characters)
                    total_characters += len(text)
                elif isinstance(block, Table):
                    text, style = self.table_to_paragraph(block)
                    if not text.strip():
                        continue  # table without any text
                    # NOTE(review): a table reuses the page of the most recent
                    # paragraph and its text does not count towards the page
                    # estimate -- confirm this is intended.
                else:
                    continue

                paragraph_objects.append(
                    Requirement_Paragraph(
                        text=text,
                        font_style=style,
                        id_=paragraph_id,
                        page_id=page_id,
                    )
                )
                paragraph_id += 1

            return paragraph_objects
        except Exception as e:
            # Keep the ValueError contract, but chain the cause so the real
            # traceback is not lost.
            raise ValueError(
                f"Error reading the .docx file. Original error: {str(e)}"
            ) from e

    def determine_predominant_style(self, styles):
        """Return the most frequent entry of *styles*.

        Args:
            styles: Iterable of style names; repeated names count once per
                occurrence.

        Returns:
            The name with the highest count (the first one encountered wins a
            tie), or the string ``"None"`` when *styles* is empty.
        """
        style_counts = Counter(styles)
        return max(style_counts, key=style_counts.get, default="None")

    def estimate_page_number(self, total_characters):
        """Return a 1-based page estimate for a position *total_characters* in."""
        return total_characters // self.AVG_CHARS_PER_PAGE + 1

    def extract_paragraph_info(self, paragraph):
        """Collect the text, style name and per-run formatting of *paragraph*.

        Returns:
            ``None`` when the paragraph text is empty or whitespace only.
            Otherwise a dict with keys ``'text'`` (the paragraph text),
            ``'style'`` (the style name, or ``'None'`` when the paragraph has
            no style) and ``'runs'`` (one dict per run with ``'text'``,
            ``'font_name'``, ``'font_size'``, ``'bold'``, ``'italic'`` and
            ``'underline'``). ``'font_size'`` is ``run.font.size.pt``, or
            ``None`` when the run has no size set.
        """
        if not paragraph.text.strip():
            return None

        paragraph_style = paragraph.style.name if paragraph.style else 'None'

        runs = [
            {
                'text': run.text,
                'font_name': run.font.name,
                'font_size': run.font.size.pt if run.font.size else None,
                'bold': run.bold,
                'italic': run.italic,
                'underline': run.underline,
            }
            for run in paragraph.runs
        ]

        return {
            'text': paragraph.text,
            'style': paragraph_style,
            'runs': runs,
        }

    def table_to_paragraph(self, table):
        """Flatten *table* into pipe-delimited text plus its predominant style.

        Within a cell, the run texts of each paragraph are concatenated and
        the paragraphs are separated by a single space. Every cell is followed
        by ``" | "`` and every row ends with a newline; surrounding whitespace
        is stripped from the final text.

        Returns:
            tuple: ``(table_text, predominant_style)`` where
            ``predominant_style`` is the style name used by the most
            paragraphs in the table (``"None"`` for a table with no
            paragraphs).
        """
        # A list, not a set: the frequency of each style is what decides which
        # one is predominant, so duplicates must be kept.
        table_styles = []
        row_texts = []

        for row in table.rows:
            cell_texts = []
            for cell in row.cells:
                paragraph_texts = []
                for paragraph in cell.paragraphs:
                    table_styles.append(
                        paragraph.style.name if paragraph.style else 'None'
                    )
                    paragraph_texts.append(
                        "".join(run.text for run in paragraph.runs)
                    )
                cell_texts.append(" ".join(paragraph_texts).strip())

            row_text = "".join(f"{cell_text} | " for cell_text in cell_texts)
            if row_text:
                # Only trailing whitespace is trimmed per row; leading
                # whitespace is removed once, from the text as a whole.
                row_texts.append(row_text.rstrip())

        predominant_style = self.determine_predominant_style(table_styles)
        return "\n".join(row_texts).strip(), predominant_style

    def print_paragraphs_and_tables(self):
        """Print the text of every extracted item, each followed by a rule.

        The file is re-read through ``get_paragraphs()``. Tables are already
        flattened to text at that point, so they are printed the same way as
        paragraphs. This is a best-effort helper: any failure is reported on
        stdout as ``Error: ...`` instead of being raised.
        """
        try:
            print("start")
            for item in self.get_paragraphs():
                # get_paragraphs() returns Requirement_Paragraph objects, not
                # dicts, so dict-style access would fail for every item.
                # NOTE(review): the ``text`` attribute name is assumed from the
                # constructor keyword; str(item) is used if it is absent.
                text = getattr(item, 'text', None)
                print("Paragraph:", item if text is None else text)
                print('-' * 40)
        except Exception as e:
            print(f"Error: {str(e)}")
|
|