File size: 2,750 Bytes
7a919c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from deepdoc.parser import RAGFlowPdfParser, PlainParser
import os
from PIL import Image
import json
class PDFprocess():
    """Run a PDF through the RAGFlow parser and persist the results to disk.

    For one input PDF, ``process`` writes three sibling files into
    ``<workdir>/preprocess`` (plain text, an HTML preview and a JSON dump of
    the table entries) plus a folder of saved images.
    """

    def __init__(self, repodir, workdir):
        """Store the input/output locations and build the PDF parser.

        Args:
            repodir: Directory that contains the PDF files to process.
            workdir: Working directory; outputs go to ``<workdir>/preprocess``.
        """
        self.repodir = repodir
        self.preprocessdir = os.path.join(workdir, 'preprocess')
        self.pdf_parser = RAGFlowPdfParser()

    @staticmethod
    def _ensure_parent_dir(file_path):
        """Create the directory that will contain ``file_path`` if needed."""
        parent = os.path.dirname(file_path)
        # dirname is '' for a bare file name; os.makedirs('') would raise.
        if parent:
            os.makedirs(parent, exist_ok=True)

    def _save_image(self, image, path, name):
        """Save ``image`` as ``<path>/<name>`` and return the resulting path.

        Args:
            image: Object exposing ``save(file_path)`` (presumably a PIL image).
            path: Target directory; created when it does not exist.
            name: File name to use inside ``path``.

        Returns:
            The full path the image was saved to.
        """
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(path, exist_ok=True)
        image_path = os.path.join(path, name)
        image.save(image_path)
        return image_path

    def save_all_image(self, preprocessdir, tables):
        """Save every image in ``tables`` and swap it for its file path.

        Args:
            preprocessdir: Base directory; images go to
                ``<preprocessdir>/saved_images/image_<index>.png``.
            tables: Mutable sequence of ``(image, text)`` pairs.

        Returns:
            The same ``tables`` object, updated in place so that each entry is
            ``(image_path, text)``.
        """
        image_folder = os.path.join(preprocessdir, 'saved_images')
        # Entries are replaced by index only; the list length never changes,
        # so updating while enumerating is safe.
        for index, (image, text) in enumerate(tables):
            image_path = self._save_image(image, image_folder, f'image_{index}.png')
            tables[index] = (image_path, text)
        return tables

    def create_html_file(self, tables, html_file_path):
        """Write an HTML page showing each image followed by its text.

        Args:
            tables: Iterable of ``(image_path, text)`` pairs. ``text`` is
                emitted verbatim (it is not escaped).
            html_file_path: Destination file; parent directories are created.
        """
        parts = ['<html><body>\n']
        parts.extend(
            f'<img src="{image_path}" alt="Image">\n{text}\n'
            for image_path, text in tables
        )
        parts.append('</body></html>')

        self._ensure_parent_dir(html_file_path)
        # Explicit UTF-8 so non-ASCII text does not depend on the OS locale.
        with open(html_file_path, 'w', encoding='utf-8') as file:
            file.write(''.join(parts))

    def process(self, pdffilename):
        """Parse one PDF from ``repodir`` and write text, images, HTML and JSON.

        Outputs are named after ``pdffilename`` with its ``.pdf`` extension
        (matched case-insensitively) replaced by ``.txt`` / ``.html`` /
        ``.json``; a name without a ``.pdf`` extension gets the new suffix
        appended instead, so the three outputs never collide.

        Args:
            pdffilename: File name of the PDF, relative to ``repodir``.
        """
        pdf_file_path = os.path.join(self.repodir, pdffilename)
        text_content, tables = self.pdf_parser(pdf_file_path, need_image=False, zoomin=3, return_html=True)

        # Only strip a real trailing '.pdf' extension; str.replace would also
        # rewrite '.pdf' in the middle of a name and miss upper-case '.PDF'.
        root, ext = os.path.splitext(pdffilename)
        stem = root if ext.lower() == '.pdf' else pdffilename

        text_file_path = os.path.join(self.preprocessdir, stem + '.txt')
        self._ensure_parent_dir(text_file_path)
        with open(text_file_path, 'w', encoding='utf-8') as f:
            f.write(text_content)

        image_folder = os.path.join(self.preprocessdir, f'{pdffilename}_images')
        tables = self.save_all_image(image_folder, tables)

        html_file_path = os.path.join(self.preprocessdir, stem + '.html')
        self.create_html_file(tables, html_file_path)

        json_file_path = os.path.join(self.preprocessdir, stem + '.json')
        self._ensure_parent_dir(json_file_path)
        # ensure_ascii=False emits raw non-ASCII characters, so the file must
        # be opened as UTF-8 explicitly.
        with open(json_file_path, 'w', encoding='utf-8') as f:
            json.dump(tables, f, indent=4, ensure_ascii=False)
if __name__ == '__main__':
    # Ad-hoc manual run against a hard-coded local directory.
    # NOTE(review): the second constructor argument is used as ``workdir`` and
    # the constructor appends 'preprocess' to it, so with this value the output
    # lands in '.../preprocess/preprocess' — confirm that this is intended.
    source_root = '/Users/chen/Downloads/ReviewAgent2'
    work_root = '/Users/chen/Downloads/ReviewAgent2/preprocess'
    PDFprocess(source_root, work_root).process('test.pdf')