# %%
import re

import fitz
import pandas as pd

# %%
document_path = "data/PROPUESTA-DE-BORRADOR-CONSTITUCIONAL-14.05.22-1.pdf"

# %%
# `re.split` with a capturing group returns
# [text_before_first_match, match_1, text_1, match_2, text_2, ...];
# the first element (text before any heading) is dropped with this offset.
skip_header_offset = 1

# Article heading: a number, ".-", an optional space, "Artículo", then the
# shortest run of characters ending at the first "." or "-".
regex_article = re.compile(r"(\d+\.- ?Artículo.+?(?:\.|-))")

# Chapter heading: "CAPÍTULO (COM <n>) " at the start of a line, followed by
# the next line up to (not including) its trailing " \n".
regex_chapters = re.compile(r"(?<=\n)(CAPÍTULO \(COM \d+\) \n.+?)(?= \n)")

# %%
# Extract the text of every page and record, for each article heading found
# on a page, the page number on which it appears.
page_texts = []
page_article = {}
pdf_page_offset = 1  # number pages starting at 1 instead of 0

with fitz.open(document_path) as doc:
    for page_idx, page in enumerate(doc, pdf_page_offset):
        text = page.get_text()
        page_texts.append(text)
        # If the same heading text is found on several pages, the last
        # page seen wins (plain dict assignment).
        for article in regex_article.findall(text):
            page_article[article] = page_idx

# Join once instead of growing a string page by page.
document = "".join(page_texts)

len(page_article)

# %%
# Split the full text into chapters keyed by their heading.
chapters = {}
chapter_name = "header"
splited_chapters = regex_chapters.split(document)

for chapter in splited_chapters[skip_header_offset:]:
    if chapter.startswith("CAPÍTULO"):
        # Heading piece: turn the two-line heading into "CAPÍTULO (...): title".
        chapter_name = chapter.replace(" \n", ": ")
    else:
        # Body piece: belongs to the most recently seen heading.
        chapters[chapter_name] = chapter

len(chapters), chapters.keys()

# %%
minimum_article_length = 65


def format_article(article):
    """Clean the raw body text of one article into a single-line string.

    The body is split into paragraphs on the "\\n \\n" separator. Inside each
    paragraph, newlines and "*" characters are removed and surrounding
    whitespace is stripped. Paragraphs whose cleaned length is at most
    ``minimum_article_length`` are dropped, unless they start with
    "El Estado". The surviving paragraphs are joined with single spaces.

    Args:
        article: Raw text that follows an article heading.

    Returns:
        The cleaned article text; an empty string if no paragraph is kept.
    """
    # Leading "-" and spaces are stripped first; presumably these are left
    # over because the heading regex stops at the first "." or "-".
    paragraphs = article.lstrip("- ").split("\n \n")

    kept_paragraphs = []
    for paragraph in paragraphs:
        cleaned = paragraph.replace("\n", "").replace("*", "").strip()
        is_article_single = cleaned.startswith("El Estado")
        is_article_too_short = len(cleaned) <= minimum_article_length
        if is_article_too_short and not is_article_single:
            continue
        kept_paragraphs.append(cleaned)

    return " ".join(kept_paragraphs)


# %%
# Split every chapter into articles and build one record per article body.
chapter_articles = []

for chapter_name, chapter in chapters.items():
    article_name = "header"
    splited_articles = regex_article.split(chapter)

    for article in splited_articles[skip_header_offset:]:
        if regex_article.match(article):
            # Heading piece: remember it for the body piece that follows.
            article_name = article
            continue

        data = {
            "chapter_name": chapter_name,
            # None when the heading was not found within a single page's text.
            "article_page": page_article.get(article_name),
            "article_name": article_name,
            "article": format_article(article),
        }
        chapter_articles.append(data)

# %%
df_document = pd.DataFrame(chapter_articles)

# Leading digits of the heading become the article number.
df_document["article_number"] = (
    df_document["article_name"]
    .str.extract(r"(^\d+)", expand=False)
)

# Keep only the text after the "<number>.- " prefix, without trailing "."/"-".
df_document["article_name"] = (
    df_document["article_name"]
    .str.extract(r"^\d+\.- ?(.*)", expand=False)
    .str.rstrip(".-")
)

df_document.head()

# %%
df_document.to_csv("data/articles.csv", index=False)

# %%