File size: 3,793 Bytes
cbc1d23
8ba144e
 
 
 
 
 
 
 
 
 
cbc1d23
7589954
8ba144e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9fd45d8
 
 
 
 
 
 
 
 
 
8ba144e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cbc1d23
8ba144e
 
 
bff547d
8ba144e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cbc1d23
8ba144e
bff547d
8ba144e
 
7589954
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import run
import util
import docx
from docx.oxml.ns import qn
from docx.shared import Pt,RGBColor
import fitz
import os
from fpdf import FPDF
import run
from BERT_inference import BertClassificationModel


def text_dump_to_lines(text,topic_num,max_length):
    """Run the summarisation pipeline on raw text and export the result.

    The text is passed through ``util.seg``; the resulting lines go through
    ``run.texClear``, and both are handed to ``run.textToAb`` together with
    ``topic_num`` and ``max_length`` (each coerced with ``int``).

    Args:
        text: Input text to process.
        topic_num: Value convertible to ``int``; forwarded to ``run.textToAb``.
        max_length: Value convertible to ``int``; forwarded to ``run.textToAb``.

    Returns:
        A 5-tuple ``(keys_text, output_text, txt_path, docx_path, pdf_path)``:
        the keys and the output each joined with newlines, followed by the
        paths returned by ``dump_to_txt``, ``dump_to_docx`` and
        ``dump_to_pdf`` for the output.
    """
    segmented = util.seg(text)
    cleaned = run.texClear(segmented)
    print(cleaned)

    keys, output = run.textToAb(
        cleaned,
        segmented,
        int(topic_num),
        int(max_length),
    )
    key_block = "\n".join(keys)
    summary_block = "\n".join(output)
    print(keys,output)

    # Export in the same order as the returned paths: txt, docx, pdf.
    txt_path = dump_to_txt(output)
    docx_path = dump_to_docx(output)
    pdf_path = dump_to_pdf(output)
    return key_block, summary_block, txt_path, docx_path, pdf_path

def _nonblank_lines(text):
    """Split *text* on newlines, strip each piece and drop the empty ones."""
    stripped = (piece.strip() for piece in text.split("\n"))
    return [piece for piece in stripped if piece != '']


def file_dump_to_lines(file,topic_num,max_length):
    """Read a txt, docx or pdf file and run it through ``text_dump_to_lines``.

    The format is chosen from the text after the last ``.`` in ``file.name``,
    compared case-insensitively. Any other extension yields no lines, so an
    empty string is forwarded to ``text_dump_to_lines``.

    Args:
        file: Object whose ``name`` attribute is the path of the file on disk.
        topic_num: Forwarded unchanged to ``text_dump_to_lines``.
        max_length: Forwarded unchanged to ``text_dump_to_lines``.

    Returns:
        The 5-tuple produced by ``text_dump_to_lines``:
        ``(keys_text, output_text, txt_path, docx_path, pdf_path)``.
    """
    lines = []
    # Lower-case the extension so "REPORT.PDF" is handled like "report.pdf".
    file_format = file.name.split(".")[-1].lower()

    if file_format == "txt":
        with open(file.name,  encoding='utf-8') as f:
            lines = _nonblank_lines(f.read())
    elif file_format == "docx":
        doc = docx.Document(file.name)
        lines = [par.text for par in doc.paragraphs]
    elif file_format == "pdf":
        pdf = fitz.open(file.name)
        try:
            for page in pdf:
                lines.extend(_nonblank_lines(page.get_text("text")))
        finally:
            # Release the underlying file handle even if extraction fails.
            pdf.close()

    text = "\n".join(lines)
    print(text)
    return text_dump_to_lines(text,topic_num,max_length)

def dump_to_txt(lines):
    """Write *lines* to ``temp.txt`` and return the file's absolute path.

    The lines are joined with newlines (no trailing newline is added) and
    written as UTF-8 into the current working directory, overwriting any
    existing ``temp.txt``.

    Args:
        lines: Iterable of strings to write.

    Returns:
        Absolute path of the written file.
    """
    target = 'temp.txt'
    with open(target,mode="w",encoding="utf-8") as handle:
        handle.write("\n".join(lines))
    return os.path.abspath(target)

def dump_to_docx(lines):
    """Write *lines* to ``temp.docx``, one paragraph per line.

    The document's ``Normal`` style is set to the 宋体 font (also for the
    East Asian font slot), 14 pt, black. A leading paragraph holding a single
    empty run styled with Cambria is added before the content paragraphs.

    Args:
        lines: Iterable of strings; each becomes its own paragraph.

    Returns:
        Absolute path of the saved ``temp.docx`` in the current working
        directory.
    """
    document = docx.Document()

    # Configure the default paragraph style used by the content paragraphs.
    normal = document.styles['Normal']
    normal.font.name = u'宋体'
    normal._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
    normal.font.size = Pt(14)
    normal.font.color.rgb = RGBColor(0,0,0)

    # NOTE(review): this run contains no text, so its Cambria settings only
    # affect the empty first paragraph - confirm whether that is intended.
    lead_run = document.add_paragraph().add_run()
    lead_run.font.name=u'Cambria'
    lead_run.font.color.rgb = RGBColor(0,0,0)
    lead_run._element.rPr.rFonts.set(qn('w:eastAsia'), u'Cambria')

    for content in lines:
        document.add_paragraph(content)

    document.save(r'temp.docx')
    return os.path.abspath('temp.docx')

def dump_to_pdf(lines):
    """Write *lines* to ``temp.pdf`` and return the file's absolute path.

    Each input line is cut into pieces of at most 45 characters and every
    piece is emitted as its own PDF cell followed by a line break. An empty
    input line produces no cell. Any exception raised while emitting cells is
    printed and the PDF is still written with whatever was added so far.

    Args:
        lines: Iterable of strings to render.

    Returns:
        Absolute path of the written ``temp.pdf`` in the current working
        directory.
    """
    chars_per_row = 45  # maximum number of characters placed on one PDF row

    pdf = FPDF()
    # Register the font file (the font must be readable as 'FZY3JW.TTF').
    pdf.add_font('FZY3JW', '', 'FZY3JW.TTF', True)
    pdf.add_page()
    # Select the registered font at size 12.
    pdf.set_font("FZY3JW", size=12)

    try:
        for line in lines:
            # Slicing clamps at the end of the string, so the last piece is
            # simply shorter than chars_per_row.
            for start in range(0, len(line), chars_per_row):
                pdf.cell(0, 5, line[start:start + chars_per_row], ln=1)
    except Exception as e:
        # Best effort: report the problem and still save the partial PDF.
        print(e)

    pdf.output("temp.pdf")
    return os.path.abspath('temp.pdf')

if __name__ == "__main__":
    # Manual run: feed the contents of test.txt through the pipeline,
    # passing 10 as topic_num and 50 as max_length.
    with open('test.txt', 'r', encoding='utf-8') as sample:
        text_dump_to_lines(sample.read(),10,50)