Spaces:
Runtime error
Runtime error
import docx | |
import os | |
# sys.path.append('path to app') | |
# import docx | |
# import os | |
# import sys | |
from src.model.paragraph import Paragraph | |
class WordReader: | |
def __init__(self, path): | |
self.path = path | |
self.paragraphs = self.get_word_paragraphs() | |
def get_word_paragraphs(self): | |
""" | |
Fetches paragraphs from a Word document. | |
Returns: | |
list: List of Paragraph objects from the document. | |
""" | |
if not os.path.exists(self.path): | |
raise FileNotFoundError(f"The file {self.path} does not exist.") | |
try: | |
doc = docx.Document(self.path) | |
paragraphs = self.to_paragraph_objects(doc.paragraphs) # Convert to Paragraph objects | |
return paragraphs | |
except Exception as e: | |
raise ValueError(f"Error reading the .docx file. Original error: {str(e)}") | |
def determine_style(self, paragraph): | |
""" | |
Determines the style of the paragraph based on its attributes. | |
Returns: | |
str: Style of the paragraph. | |
""" | |
# Check for heading styles first | |
if paragraph.style.name.startswith('Heading 1'): | |
return "title1" | |
elif paragraph.style.name.startswith('Heading 2'): | |
return "title2" | |
elif paragraph.style.name.startswith('Heading 3'): | |
return "title3" | |
elif paragraph.style.name.startswith('Heading 4'): | |
return "title4" | |
elif paragraph.style.name.startswith('Heading 5'): | |
return "title5" | |
# If not a heading, check the runs within the paragraph | |
for run in paragraph.runs: | |
font = run.font | |
fontname = font.name | |
size = font.size | |
# Convert size to points (from twips) | |
if size: | |
size_in_points = size.pt | |
# Map based on font name and size as in the PDF reader | |
if fontname == "XFQKGD+Consolas": | |
return "code" | |
elif (size_in_points >= 9 and size_in_points < 11.5) or fontname == "Wingdings-Regular": | |
return "content" | |
# If none of the above conditions match, default to 'content' | |
return "content" | |
def to_paragraph_objects(self, doc_paragraphs): | |
""" | |
Convert docx paragraphs to Paragraph objects for further processing. | |
""" | |
paragraph_objects = [] | |
for idx, paragraph in enumerate(doc_paragraphs): | |
style = self.determine_style(paragraph) | |
# Assuming page_id is always 1 for simplicity, change as needed. | |
p_obj = Paragraph(text=paragraph.text, font_style=style, id_=idx, page_id=1) | |
paragraph_objects.append(p_obj) | |
paragraphs = self.rearrange_paragraphs(paragraph_objects) | |
return paragraphs | |
def rearrange_paragraphs(self, paragraphs : [Paragraph]): | |
#associate paragraphs with the same font style | |
i = 0 | |
while i < len(paragraphs): | |
paragraphs[i] = paragraphs[i].rearrange_paragraph() | |
i+=1 | |
return paragraphs | |
def display_paragraphs(self): | |
""" | |
Prints the paragraphs from the document to the console. | |
""" | |
for paragraph in self.paragraphs: | |
print(paragraph.text) | |
print('-' * 40) # separator for clarity | |
# if __name__ == '__main__': | |
# reader = WordReader("Illumio_Core_REST_API_Developer_Guide_23.3.docx") | |
# reader.display_paragraphs() | |