|
import re |
|
|
|
class AnalysisCleaner:
    """Clean up analysis text: drop repeated paragraphs and flatten lists.

    The set of paragraphs already seen is stored on the instance and is
    never cleared, so de-duplication carries over between calls made on the
    same instance. Create a fresh ``AnalysisCleaner`` for each independent
    document if that carry-over is not wanted.
    """

    # A list marker only counts when it is followed by whitespace (or ends
    # the line). Without that requirement the patterns would eat the first
    # character of ordinary text such as "**bold**", "-5 degrees" or "3.14".
    _BULLET_RE = re.compile(r'^\s*[*\-•](?:\s+|$)')
    _NUMBERED_RE = re.compile(r'^\s*\d+\.(?:\s+|$)')
    _HEADER_PREFIX = '###'

    def __init__(self):
        # Normalized (lower-cased, whitespace-collapsed) paragraphs that
        # remove_duplicates() has already emitted.
        self.seen_paragraphs = set()

    def remove_duplicates(self, text: str) -> str:
        """Remove duplicate paragraphs while preserving order.

        Paragraphs are separated by a blank line (``'\\n\\n'``). Two
        paragraphs are considered duplicates when they are equal after
        lower-casing and collapsing all whitespace runs to single spaces.
        The first occurrence is kept verbatim; empty paragraphs are dropped.

        Paragraphs recorded by earlier calls on this instance also count as
        already seen.

        Args:
            text: The raw text to de-duplicate.

        Returns:
            The surviving paragraphs joined by a blank line.
        """
        unique_paragraphs = []

        for paragraph in text.split('\n\n'):
            # Comparison key only; the original paragraph text is what is kept.
            normalized = ' '.join(paragraph.lower().split())
            if not normalized or normalized in self.seen_paragraphs:
                continue
            self.seen_paragraphs.add(normalized)
            unique_paragraphs.append(paragraph)

        return '\n\n'.join(unique_paragraphs)

    def reorganize_content(self, text: str) -> str:
        """Convert bullet points into flowing paragraphs.

        Each line has a leading bullet marker (``*``, ``-`` or ``•``) and
        then a leading ``N.`` list number removed, provided the marker is
        followed by whitespace. Consecutive non-blank lines are joined with
        single spaces into one paragraph. A blank line ends the current
        paragraph, and a line starting with ``###`` ends the current
        paragraph and is emitted as its own block.

        Args:
            text: The text to reflow, one list item or sentence per line.

        Returns:
            The paragraphs and headers joined by a blank line.
        """
        current_paragraph = []
        flowing_text = []

        def flush():
            # Emit the paragraph collected so far, if any, and start a new one.
            if current_paragraph:
                flowing_text.append(' '.join(current_paragraph))
                current_paragraph.clear()

        for line in text.split('\n'):
            # Bullet first, then number, so "- 1. item" loses both markers.
            cleaned_line = self._BULLET_RE.sub('', line)
            cleaned_line = self._NUMBERED_RE.sub('', cleaned_line)
            # Strip so joined lines do not carry stray double spaces and
            # indented headers are still recognised.
            cleaned_line = cleaned_line.strip()

            if not cleaned_line:
                flush()
            elif cleaned_line.startswith(self._HEADER_PREFIX):
                flush()
                flowing_text.append(cleaned_line)
            else:
                current_paragraph.append(cleaned_line)

        flush()
        return '\n\n'.join(flowing_text)

    def clean_analysis(self, text: str) -> str:
        """Apply all cleanup steps.

        Runs paragraph de-duplication, then list-to-paragraph reflow, then
        collapses any run of three or more newlines down to one blank line.

        Args:
            text: The raw analysis text.

        Returns:
            The cleaned text.
        """
        cleaned = self.remove_duplicates(text)
        cleaned = self.reorganize_content(cleaned)
        return re.sub(r'\n{3,}', '\n\n', cleaned)