Spaces:

technicolor
/

InteractiveSurvey

Sleeping

File size: 14,383 Bytes

import os
import re
import json
import subprocess
import glob
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.schema import Document
import shutil
import tempfile
from .path_utils import get_path

class DocumentLoading:
    def convert_pdf_to_md(self, pdf_file, output_dir="output", method="auto"):
        """Convert a single PDF to Markdown by invoking the ``mineru`` CLI.

        The Markdown file is expected at
        ``<output_dir>/<pdf base name>/<method>/<pdf base name>.md``. When that
        file already exists the conversion is skipped.

        Conversion failures are reported on stdout instead of being raised:
        a non-zero ``mineru`` exit status, a ``mineru`` executable that cannot
        be launched, or a run that does not produce the expected Markdown
        file. In each case the per-document output folder is removed if it
        exists.

        Parameters
        ----------
        pdf_file : str
            Path of the PDF to convert.
        output_dir : str
            Directory passed to ``mineru -o``.
        method : str
            Value passed to ``mineru -m``; also the sub-folder in which the
            Markdown file is looked up.

        Returns
        -------
        None
        """
        base_name = os.path.splitext(os.path.basename(pdf_file))[0]
        target_dir = os.path.join(output_dir, base_name)
        md_file_path = os.path.join(target_dir, method, f"{base_name}.md")
        print("The md file path is: ", md_file_path)

        if os.path.exists(md_file_path):
            print(f"Markdown file for {pdf_file} already exists at {md_file_path}. Skipping conversion.", flush=True)
            return

        command = ["mineru", "-p", pdf_file, "-o", output_dir, "-m", method]
        try:
            subprocess.run(command, check=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # CalledProcessError: mineru exited with a non-zero status.
            # OSError: the executable could not be started (e.g. not installed).
            print(f"An error occurred during conversion: {e}")
            # Remove the folder only if the failed run actually created it.
            if os.path.exists(target_dir):
                print(f"Cleaning up incomplete folder: {target_dir}")
                shutil.rmtree(target_dir)
            return

        # mineru exited successfully; verify the Markdown file was produced.
        if os.path.exists(md_file_path):
            print(f"Successfully converted {pdf_file} to markdown format in {target_dir}.")
            return

        print(f"Conversion failed: Markdown file not found at {md_file_path}. Cleaning up folder...")
        # The folder may never have been created, so guard the removal.
        if os.path.exists(target_dir):
            shutil.rmtree(target_dir)
    # new
    def convert_pdf_to_md_new(self, pdf_dir, output_dir="output", method="auto"):
        """Run ``mineru`` on every ``*.pdf`` found directly inside ``pdf_dir``.

        A PDF is skipped when ``<output_dir>/<pdf base name>`` already exists.
        A non-zero ``mineru`` exit status is printed and the loop carries on
        with the next file. Nothing is returned.

        Parameters
        ----------
        pdf_dir : str
            Directory that is searched (non-recursively) for PDF files.
        output_dir : str
            Directory passed to ``mineru -o``.
        method : str
            Value passed to ``mineru -m``.
        """
        for source_pdf in glob.glob(os.path.join(pdf_dir, "*.pdf")):
            stem, _extension = os.path.splitext(os.path.basename(source_pdf))
            result_dir = os.path.join(output_dir, stem)

            # An existing output folder is treated as "already converted".
            if os.path.exists(result_dir):
                print(f"Folder for {source_pdf} already exists in {output_dir}. Skipping conversion.")
                continue

            mineru_cmd = ["mineru", "-p", source_pdf, "-o", output_dir, "-m", method]
            try:
                subprocess.run(mineru_cmd, check=True)
            except subprocess.CalledProcessError as e:
                print(f"An error occurred: {e}")
            else:
                print(f"Successfully converted {source_pdf} to markdown format in {result_dir}.")

    def batch_convert_pdfs(pdf_files, output_dir="output", method="auto", max_workers=None):
        # Create a process pool to run the conversion in parallel
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            # Submit each PDF file to the process pool for conversion
            futures = [executor.submit(convert_pdf_to_md, pdf, output_dir, method) for pdf in pdf_files]

            # Optionally, you can monitor the status of each future as they complete
            for future in futures:
                try:
                    future.result()  # This will raise any exceptions that occurred during the processing
                except Exception as exc:
                    print(f"An error occurred during processing: {exc}")

    def extract_information_from_md(self, md_text):
        """Split plain text derived from Markdown into coarse paper sections.

        Paragraphs are assumed to be separated by blank lines (``\\n\\n``).
        Every value is whitespace-stripped; a section whose pattern does not
        match is reported as ``"N/A"``.

        Parameters
        ----------
        md_text : str
            Text to analyse.

        Returns
        -------
        dict
            Keys ``title``, ``authors``, ``abstract``, ``introduction`` and
            ``main_content``:

            * title        - text before the first blank line.
            * authors      - text between the first blank line and the
                             "Abstract" heading paragraph.
            * abstract     - the "Abstract" heading paragraph plus the
                             paragraph after it, with the leading "Abstract"
                             word and any leading non-letters removed.
            * introduction - text between an "Introduction" heading and the
                             next paragraph that starts like a second-section
                             heading (``2`` / ``II``).
            * main_content - everything before a "References" heading, or the
                             whole text when there is none.
        """
        def section(pattern, group_index):
            # Stripped capture group of the first match, or the placeholder.
            found = re.search(pattern, md_text, re.DOTALL)
            return found.group(group_index).strip() if found else "N/A"

        title = section(r'^(.*?)(\n\n|\Z)', 1)

        authors = section(
            r'\n\n(.*?)(\n\n[aA][\s]*[bB][\s]*[sS][\s]*[tT][\s]*[rR][\s]*[aA][\s]*[cC][\s]*[tT][^\n]*\n\n)',
            1,
        )

        # Take the whole match (heading included) so that an abstract written
        # on the heading line itself is kept, then peel the heading off.
        abstract = section(
            r'(\n\n[aA][\s]*[bB][\s]*[sS][\s]*[tT][\s]*[rR][\s]*[aA][\s]*[cC][\s]*[tT][^\n]*\n\n)(.*?)(\n\n|\Z)',
            0,
        )
        for leading_noise in (
            r'^[aA]\s*[bB]\s*[sS]\s*[tT]\s*[rR]\s*[aA]\s*[cC]\s*[tT][^\w]*',
            r'^[^a-zA-Z]*',
        ):
            abstract = re.sub(leading_noise, '', abstract)

        introduction = section(
            r'\n\n([1I][\.\- ]?\s*)?[Ii]\s*[nN]\s*[tT]\s*[rR]\s*[oO]\s*[dD]\s*[uU]\s*[cC]\s*[tT]\s*[iI]\s*[oO]\s*[nN][\.\- ]?\s*\n\n(.*?)'
            r'(?=\n\n(?:([2I][I]|\s*2)[^\n]*?\n\n|\n\n(?:[2I][I][^\n]*?\n\n)))',
            2,
        )

        main_content = section(
            r'(.*?)(\n\n([3I][\.\- ]?\s*)?[Rr][Ee][Ff][Ee][Rr][Ee][Nn][Cc][Ee][Ss][^\n]*\n\n|\Z)',
            1,
        )

        return {
            "title": title,
            "authors": authors,
            "abstract": abstract,
            "introduction": introduction,
            "main_content": main_content,
        }
    
    def process_md_file(self, md_file_path, survey_id):
        """Parse one Markdown file, store its sections as JSON, return the introduction.

        The file is loaded with ``UnstructuredMarkdownLoader`` (exactly one
        ``Document`` is expected), split by ``extract_information_from_md``
        and written to ``get_path('txt', survey_id, '<name>.json')``, where
        ``<name>`` is the Markdown file's base name with file-name-unsafe
        characters and underscores replaced by spaces.

        Parameters
        ----------
        md_file_path : str
            Path of the Markdown file.
        survey_id : str
            Survey identifier passed to ``get_path``.

        Returns
        -------
        str
            The extracted introduction (``"N/A"`` when none was found).
        """
        documents = UnstructuredMarkdownLoader(md_file_path).load()
        assert len(documents) == 1, "Expected exactly one document in the markdown file."
        document = documents[0]
        assert isinstance(document, Document), "The loaded data is not of type Document."

        sections = self.extract_information_from_md(document.page_content)
        # An abstract shorter than 10 characters is treated as missing and
        # replaced by the title.
        if len(sections["abstract"]) < 10:
            sections["abstract"] = sections['title']

        # Build the JSON file name from the Markdown file name.
        stem = os.path.splitext(os.path.basename(md_file_path))[0].strip()
        to_space = str.maketrans(dict.fromkeys('<>:"/\\|?*_', ' '))
        safe_stem = stem.translate(to_space)

        os.makedirs(get_path('txt', survey_id), exist_ok=True)
        json_path = get_path('txt', survey_id, f'{safe_stem}.json')
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(sections, f, ensure_ascii=False, indent=4)
        return sections['introduction']
    
    def process_md_file_full(self, md_file_path, survey_id):
        """Parse one Markdown file, store its sections as JSON, return the full text.

        Works like ``process_md_file`` (same loading, extraction and JSON
        output under ``get_path('txt', survey_id)``) but returns more text.

        Parameters
        ----------
        md_file_path : str
            Path of the Markdown file.
        survey_id : str
            Survey identifier passed to ``get_path``.

        Returns
        -------
        str
            ``abstract + introduction + main_content`` concatenated directly,
            without any separator.
        """
        loaded = UnstructuredMarkdownLoader(md_file_path).load()
        assert len(loaded) == 1, "Expected exactly one document in the markdown file."
        assert isinstance(loaded[0], Document), "The loaded data is not of type Document."

        info = self.extract_information_from_md(loaded[0].page_content)
        # Fall back to the title when the abstract is (nearly) empty.
        if len(info["abstract"]) < 10:
            info["abstract"] = info['title']

        # File name for the JSON dump: Markdown base name with unsafe
        # characters (and underscores) turned into spaces.
        file_stem = os.path.splitext(os.path.basename(md_file_path))[0].strip()
        file_stem = file_stem.translate(
            str.maketrans(dict.fromkeys(('<', '>', ':', '"', '/', '\\', '|', '?', '*', '_'), ' '))
        )

        os.makedirs(get_path('txt', survey_id), exist_ok=True)
        with open(get_path('txt', survey_id, f'{file_stem}.json'), 'w', encoding='utf-8') as f:
            json.dump(info, f, ensure_ascii=False, indent=4)

        return "".join((info['abstract'], info['introduction'], info['main_content']))

    def load_pdf(self, pdf_file, survey_id, mode):
        """
        Convert one PDF to Markdown with ``mineru`` (if needed) and return its text.

        Parameters
        ----------
        pdf_file : str
            Absolute path of the PDF file.
        survey_id : str
            ID of the current survey; used to organise the output directories.
        mode : str
            Mode passed in by the front end. It controls whether the
            introduction or the full text is extracted and may be one of
            intro / full / auto / txt / ocr.

        Returns
        -------
        str or None
            The introduction when ``mode == 'intro'``, otherwise the full text
            (abstract + introduction + main content). ``None`` when the
            conversion fails.

        Design
        ------
        * mineru only supports auto / txt / ocr. A valid mineru method is
          passed through unchanged; anything else falls back to 'auto'. This
          decouples the mineru method from the front end's intro/full notion.
        * read_type decides what is returned:
              - mode == 'intro'  ->  only the introduction
              - anything else    ->  the full text (abstract+intro+main)
        * A failed conversion (non-zero exit status, ``mineru`` could not be
          launched, or no Markdown file produced) is reported on stdout, the
          per-document output folder is removed if it exists, and ``None`` is
          returned.
        """
        valid_mineru_methods = ['auto', 'txt', 'ocr']
        if mode in valid_mineru_methods:
            mineru_method = mode
            read_type = 'full'
        else:
            mineru_method = 'auto'  # default mineru parsing method
            read_type = 'intro' if mode == 'intro' else 'full'
        # Pick the reader once instead of repeating the branch at each return.
        read_markdown = self.process_md_file if read_type == 'intro' else self.process_md_file_full

        md_root = get_path('md', survey_id)
        base_name = os.path.splitext(os.path.basename(pdf_file))[0]
        target_dir = os.path.join(md_root, base_name)
        # mineru places the Markdown file at <target_dir>/<mineru_method>/<name>.md
        md_file_path = os.path.join(target_dir, mineru_method, f"{base_name}.md")
        print("The md file path is: ", md_file_path)

        if os.path.exists(md_file_path):
            print(f"Markdown file for {pdf_file} already exists at {md_file_path}. Skipping conversion.", flush=True)
            return read_markdown(md_file_path, survey_id)

        command = ["mineru", "-p", pdf_file, "-o", md_root, "-m", mineru_method]
        try:
            subprocess.run(command, check=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # CalledProcessError: mineru exited with a non-zero status.
            # OSError: the executable could not be started (e.g. not installed).
            print(f"An error occurred during conversion: {e}")
            # Remove the folder only if the failed run actually created it.
            if os.path.exists(target_dir):
                print(f"Cleaning up incomplete folder: {target_dir}")
                shutil.rmtree(target_dir)
            return None

        # mineru exited successfully; verify the Markdown file was produced.
        if not os.path.exists(md_file_path):
            print(f"Conversion failed: Markdown file not found at {md_file_path}. Cleaning up folder...")
            # The folder may never have been created, so guard the removal.
            if os.path.exists(target_dir):
                shutil.rmtree(target_dir)
            return None

        print(f"Successfully converted {pdf_file} to markdown format in {target_dir}.")
        return read_markdown(md_file_path, survey_id)
    
    def load_pdf_new(self, pdf_dir, survey_id):
        """Run ``mineru`` (method ``auto``) on every ``*.pdf`` directly inside ``pdf_dir``.

        Output is written below ``get_path('md', survey_id)``. A PDF whose
        folder ``<that directory>/<pdf base name>`` already exists is skipped.
        A non-zero ``mineru`` exit status is printed and the loop moves on to
        the next file. Nothing is returned.

        Parameters
        ----------
        pdf_dir : str
            Directory that is searched (non-recursively) for PDF files.
        survey_id : str
            Survey identifier passed to ``get_path``.
        """
        for source_pdf in glob.glob(os.path.join(pdf_dir, "*.pdf")):
            stem = os.path.splitext(os.path.basename(source_pdf))[0]
            result_dir = os.path.join(get_path('md', survey_id), stem)

            # An existing output folder is treated as "already converted".
            if os.path.exists(result_dir):
                print(f"Folder for {source_pdf} already exists in {get_path('md', survey_id)}. Skipping conversion.")
                continue

            mineru_cmd = ["mineru", "-p", source_pdf, "-o", get_path('md', survey_id), "-m", "auto"]
            try:
                subprocess.run(mineru_cmd, check=True)
            except subprocess.CalledProcessError as e:
                print(f"An error occurred: {e}")
            else:
                print(f"Successfully converted {source_pdf} to markdown format in {result_dir}.")

    def parallel_load_pdfs(self, pdf_files, survey_id, max_workers=4):
        # Create a process pool to run the conversion in parallel
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            # Submit each PDF file to the process pool for conversion
            futures = [executor.submit(self.load_pdf, pdf, survey_id, "auto") for pdf in pdf_files]

            # Optionally, you can monitor the status of each future as they complete
            for future in futures:
                try:
                    future.result()  # This will raise any exceptions that occurred during the processing
                except Exception as exc:
                    print(f"An error occurred during processing: {exc}")

    def ensure_non_empty_introduction(self, introduction, full_text):
        """Return ``introduction``, or a prefix of ``full_text`` if it is too short.

        An introduction with fewer than 50 characters is replaced by the
        first 1000 characters of ``full_text``.
        """
        return introduction if len(introduction) >= 50 else full_text[:1000]

    def extract_information_from_md_new(self, md_text):
        # Title extraction
        title_match = re.search(r'^(.*?)(\n\n|\Z)', md_text, re.DOTALL)
        title = title_match.group(1).strip() if title_match else "N/A"

        # Authors extraction
        authors_match = re.search(
            r'\n\n(.*?)(\n\n[aA][\s]*[bB][\s]*[sS][\s]*[tT][\s]*[rR][\s]*[aA][\s]*[cC][\s]*[tT][^\n]*\n\n)', 
            md_text, 
            re.DOTALL
        )
        authors = authors_match.group(1).strip() if authors_match else "N/A"

        # Abstract extraction
        abstract_match = re.search(
            r'(\n\n[aA][\s]*[bB][\s]*[sS][\s]*[tT][\s]*[rR][\s]*[aA][\s]*[cC][\s]*[tT][^\n]*\n\n)(.*?)(\n\n|\Z)', 
            md_text, 
            re.DOTALL
        )
        abstract = abstract_match.group(0).strip() if abstract_match else "N/A"
        abstract = re.sub(r'^[aA]\s*[bB]\s*[sS]\s*[tT]\s*[rR]\s*[aA]\s*[cC]\s*[tT][^\w]*', '', abstract)
        abstract = re.sub(r'^[^a-zA-Z]*', '', abstract)

        # Introduction extraction
        introduction_match = re.search(
            r'\n\n([1I][\.\- ]?\s*)?[Ii]\s*[nN]\s*[tT]\s*[rR]\s*[oO]\s*[dD]\s*[uU]\s*[cC]\s*[tT]\s*[iI]\s*[oO]\s*[nN][\.\- ]?\s*\n\n(.*?)'
            r'(?=\n\n(?:([2I][I]|\s*2)[^\n]*?\n\n|\n\n(?:[2I][I][^\n]*?\n\n)))',
            md_text, 
            re.DOTALL
        )
        introduction = introduction_match.group(2).strip() if introduction_match else "N/A"

        # Main content extraction
        main_content_match = re.search(
            r'(.*?)(\n\n([3I][\.\- ]?\s*)?[Rr][Ee][Ff][Ee][Rr][Ee][Nn][Cc][Ee][Ss][^\n]*\n\n|\Z)', 
            md_text, 
            re.DOTALL
        )
        
        if main_content_match:
            main_content = main_content_match.group(1).strip()
        else:
            main_content = "N/A"

        extracted_data = {
            "title": title,
            "authors": authors,
            "abstract": abstract,
            "introduction": introduction,
            "main_content": main_content
        }
        return extracted_data