Tabular-LLM-Study-Forward-Simulation

Running

File size: 2,038 Bytes

import os
import re
from pathlib import Path
from bs4 import BeautifulSoup


def process_html_file(file_path, output_path):
    """Trim the "in the table: ..." clause from a page's Statement heading.

    The file at ``file_path`` is read as UTF-8 and parsed with the
    ``html.parser`` backend. The first ``<h3>`` that contains a ``<span>``
    whose string is exactly ``Statement:`` is replaced by a freshly built
    ``<h3>`` holding a new label span followed by the statement text, cut
    just before the first ``in the table:``. Attributes of the original
    heading and span are not carried over to the rebuilt tags.

    The serialized document is always written to ``output_path`` as UTF-8,
    including when no Statement heading is found.

    Args:
        file_path: Path of the HTML file to read.
        output_path: Path the resulting HTML is written to.
    """
    label = 'Statement:'

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    soup = BeautifulSoup(content, 'html.parser')

    # Find the Statement line
    statement_tag = soup.find(
        lambda tag: tag.name == "h3" and tag.find("span", string=label)
    )

    if statement_tag:
        # Detach the label span before reading the text. get_text() walks
        # every descendant, so leaving the span in place would put
        # "Statement:" into the extracted text and the rebuilt heading would
        # show the label twice. The old heading is discarded below, so
        # mutating it here is harmless.
        statement_tag.find("span", string=label).extract()

        # Extract the statement text (label excluded)
        statement_text = statement_tag.get_text(strip=True)

        # Remove "in the table:" and everything after it; DOTALL lets the
        # match run across line breaks inside the heading.
        new_statement = re.sub(r'\s*in the table:.*$', '', statement_text, flags=re.DOTALL)

        # Reconstruct the h3 tag with the modified content
        new_h3 = soup.new_tag('h3')
        new_span = soup.new_tag('span')
        new_span.string = label
        new_h3.append(new_span)
        new_h3.append(f" {new_statement}")

        # Replace the old h3 tag with the new one
        statement_tag.replace_with(new_h3)

    # Write the modified content
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(str(soup))


def process_directory(input_dir, output_dir):
    """Run ``process_html_file`` over the HTML files of four fixed subfolders.

    For each of ``TP``, ``TN``, ``FP`` and ``FN`` under ``input_dir``, every
    ``*.html`` file directly inside it is processed into the same-named
    subfolder of ``output_dir`` under the same file name. Output subfolders
    are created as needed. A subfolder missing from ``input_dir`` is reported
    with a warning and skipped, and no output subfolder is created for it.

    Args:
        input_dir: Root directory containing the input subfolders.
        output_dir: Root directory that receives the processed files.
    """
    source_root = Path(input_dir)
    target_root = Path(output_dir)

    for label in ('TP', 'TN', 'FP', 'FN'):
        source_dir = source_root / label

        if source_dir.exists():
            target_dir = target_root / label
            target_dir.mkdir(parents=True, exist_ok=True)

            for html_path in source_dir.glob('*.html'):
                destination = target_dir / html_path.name
                process_html_file(html_path, destination)
                print(f"Processed: {html_path} -> {destination}")
        else:
            print(f"Warning: {source_dir} does not exist. Skipping.")


# Source tree to read from and destination tree to write to
input_directory, output_directory = "htmls_DATER_mod", "htmls_DATER_mod2"

# Rewrite every page found under the source tree into the destination tree
process_directory(input_dir=input_directory, output_dir=output_directory)

print("Processing complete. Modified files are in the output directory.")