File size: 11,377 Bytes
dbf97ba 954fd40 dbf97ba 9f63e0a 0b9c07f 9f63e0a 0b9c07f dbf97ba fa09874 04fa692 cf618bb fa09874 5f421ac fa09874 dbf97ba 3e45198 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 |
import pdfplumber
import re
import os
# Extract text as newline-delimited paragraphs, excluding tables and graphs
def extract_and_format_paragraphs(pdf_path, *, header_ratio=0.075, footer_ratio=0.15):
    """Extract the text of a PDF as newline-separated paragraphs.

    Every page (or page column) is filtered line by line: header and footer
    lines, footnote runs and "Chart ... Source:/Note:" sections are dropped.
    The surviving lines are re-joined so that a sentence broken across lines,
    columns or pages ends up in one piece.

    Files whose base name contains "minutes" are read as two columns (left
    half, then right half) with the top and bottom of the page cropped away;
    all other files are read page by page without cropping.

    Args:
        pdf_path: Path of the PDF file; passed to ``pdfplumber.open``.
        header_ratio: Fraction of the page height cropped from the top in
            two-column mode.
        footer_ratio: Fraction of the page height cropped from the bottom in
            two-column mode.

    Returns:
        The cleaned text as a single stripped string, blocks separated by
        newlines.
    """
    # Lines starting with one of these markers are dropped as headers/footers.
    header_pattern = re.compile(r"^(ECB-PUBLIC|Title|Document|Header)", re.IGNORECASE)
    footer_pattern = re.compile(r"^(Page \d+ of \d+|Footer|Document|Note:|Source:|the 75th and 25th percentiles|\|\d+)", re.IGNORECASE)
    # A footnote line starts with digits, a space, one digit and a space.
    footnote_pattern = re.compile(r"^\d+ \d{1} ", re.IGNORECASE)
    # Everything from a "Chart" line up to the next "Source:"/"Note:" line is removed.
    start_marker_pattern = re.compile(r"^Chart", re.IGNORECASE)
    end_marker_pattern = re.compile(r"^(Source:|Note:)", re.IGNORECASE)

    # NOTE(review): the abbreviation list is not visible in the reviewed
    # source; these entries mirror the abbreviations named in the previously
    # commented-out exceptions pattern (e.g., i.e., etc., a.k.a.) -- confirm.
    abbreviations = [r"e\.g", r"i\.e", r"etc", r"a\.k\.a"]
    # Compiled once here instead of on every call of the helpers below.
    abbreviation_re = re.compile(rf"({'|'.join(abbreviations)})\.")
    sentence_end_re = re.compile(r'[\.\!\?]\s*$')

    def remove_abbreviation_periods(text):
        """Drop the period that follows a known abbreviation."""
        return abbreviation_re.sub(r"\1", text)

    def is_end_of_sentence(text):
        """Return True if *text* ends with '.', '!' or '?'.

        The period after a known abbreviation does not count as a sentence end.
        """
        return bool(sentence_end_re.search(remove_abbreviation_periods(text.strip())))

    def clean_text(text):
        """Filter one chunk of raw page text and re-join its lines."""
        paragraph_lines = []

        def append_line_to_paragraph(line):
            """Extend the open paragraph with *line*, or start a new one."""
            stripped = line.strip()
            if paragraph_lines and not is_end_of_sentence(paragraph_lines[-1]):
                if paragraph_lines[-1].endswith("-"):
                    # Word hyphenated across the line break: drop the hyphen
                    # and join without a space.
                    paragraph_lines[-1] = paragraph_lines[-1][:-1] + stripped
                else:
                    paragraph_lines[-1] += ' ' + stripped
            else:
                paragraph_lines.append(stripped)

        in_removal_section = False
        skip_line = False
        for line in text.split('\n'):
            # Track whether we are inside a chart section.
            if start_marker_pattern.match(line):
                in_removal_section = True
            if in_removal_section and end_marker_pattern.match(line):
                # The marker line itself is still dropped by footer_pattern below.
                in_removal_section = False

            # A footnote runs from its first line to the line that ends its sentence.
            if footnote_pattern.match(line):
                skip_line = True
            if skip_line:
                if is_end_of_sentence(line):
                    skip_line = False
                continue

            if in_removal_section or header_pattern.match(line) or footer_pattern.match(line):
                continue
            if line.strip():
                append_line_to_paragraph(line)

        return "\n".join(paragraph_lines)

    two_column = "minutes" in os.path.basename(pdf_path).lower()
    finished = []  # chunks whose last sentence is complete
    pending = ""   # chunk that may still be continued by the next one

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            if two_column:
                top = page.height * header_ratio
                bottom = page.height - page.height * footer_ratio
                middle = page.width / 2
                boxes = ((0, top, middle, bottom), (middle, top, page.width, bottom))
                raw_chunks = [page.within_bbox(box).extract_text() or "" for box in boxes]
            else:
                raw_chunks = [page.extract_text() or ""]

            for raw in raw_chunks:
                cleaned = clean_text(raw)
                if not cleaned:
                    # Nothing survived filtering; keep the pending chunk open.
                    continue
                if pending and not is_end_of_sentence(pending):
                    # Sentence continues across the column/page boundary.
                    pending += " " + cleaned
                else:
                    if pending:
                        finished.append(pending)
                    pending = cleaned

    if pending:
        finished.append(pending)
    return "\n".join(finished).strip()
# Cleaning: cut unnecessary information such as the annex and the intro
def find_text_range(text, start_keywords, end_keywords):
    """Find the index range delimited by start and end keywords.

    Matching is case-insensitive.

    Args:
        text: The text to search.
        start_keywords: Iterable of keywords; the start index is the largest
            first-occurrence position among them (0 if none is found).
        end_keywords: Iterable of keywords; the end index is the earliest
            occurrence of any of them at or after the start index
            (``len(text)`` if none is found).

    Returns:
        A ``(start_index, end_index)`` tuple suitable for slicing *text*.
    """
    # Lower-case once instead of once per keyword.
    # NOTE: indices are computed on the lower-cased copy and applied to the
    # original text by callers; this assumes lower-casing preserves length.
    lowered = text.lower()

    start_index = 0
    for start_keyword in start_keywords:
        keyword_index = lowered.find(start_keyword.lower())
        if keyword_index != -1 and keyword_index > start_index:
            start_index = keyword_index

    end_index = len(text)  # Default to end of text
    for end_keyword in end_keywords:
        # Search from start_index so that an end keyword appearing before the
        # start (e.g. in a table of contents) cannot yield end < start.
        keyword_index = lowered.find(end_keyword.lower(), start_index)
        if keyword_index != -1 and keyword_index < end_index:
            end_index = keyword_index

    return start_index, end_index
def extract_relevant_text(text, start_index, end_index):
    """Return ``text[start_index:end_index]`` with surrounding whitespace removed.

    Args:
        text: The source string.
        start_index: Index of the first character to keep.
        end_index: Index one past the last character to keep.

    Returns:
        The stripped slice; an empty string if the slice is empty.
    """
    return text[start_index:end_index].strip()
# Split the extracted text into a list of paragraphs
def split_text_into_paragraphs(extracted_text, min_length):
    """Split text into paragraphs on newlines and merge single-sentence ones.

    Args:
        extracted_text: Text whose paragraphs are separated by one or more
            newlines.
        min_length: Paragraphs whose stripped length is not greater than this
            value are discarded before merging.

    Returns:
        A list of stripped paragraph strings. A paragraph consisting of a
        single sentence is joined (with a space) to the paragraph after it.
    """
    def count_sentences(text):
        """Count the sentences in *text* by splitting after '.', '!' or '?'."""
        return len(re.split(r'(?<=[.!?])\s+', text.strip()))

    def merge_single_sentence_paragraphs(paragraphs):
        """Merge each single-sentence paragraph with the paragraph after it."""
        merged_paragraphs = []
        i = 0
        while i < len(paragraphs):
            para = paragraphs[i].strip()
            if not para:
                i += 1
                continue

            if count_sentences(para) == 1 and i + 1 < len(paragraphs):
                next_para = paragraphs[i + 1].strip()
                if next_para:
                    merged_paragraphs.append(para + ' ' + next_para)
                    i += 2  # The next paragraph has been consumed by the merge.
                else:
                    # Next paragraph is empty: keep the current one as is.
                    merged_paragraphs.append(para)
                    i += 1
            else:
                # More than one sentence, or the last paragraph.
                merged_paragraphs.append(para)
                i += 1
        return merged_paragraphs

    paragraphs = re.split(r'\n+', extracted_text.strip())
    # Drop paragraphs that are too short before merging.
    filtered_paragraphs = [p for p in paragraphs if len(p.strip()) > min_length]
    return merge_single_sentence_paragraphs(filtered_paragraphs)