import run
import util
import docx
from docx.oxml.ns import qn
from docx.shared import Pt, RGBColor
import fitz
import os
from fpdf import FPDF
from BERT_inference import BertClassificationModel


def text_dump_to_lines(text, topic_num, max_length):
    """Run the summarisation pipeline on raw text and export the result.

    The text is segmented with ``util.seg``, cleaned with ``run.texClear``
    and handed to ``run.textToAb``; the returned ``output`` lines are then
    written to txt, docx and pdf files.

    Args:
        text: Input text to process.
        topic_num: Passed to ``run.textToAb`` after conversion with ``int()``.
        max_length: Passed to ``run.textToAb`` after conversion with ``int()``.

    Returns:
        A 5-tuple ``(keys_text, output_text, txt_path, docx_path, pdf_path)``:
        the keys and the output joined with newlines, followed by the values
        returned by ``dump_to_txt``, ``dump_to_docx`` and ``dump_to_pdf``.
    """
    lines = util.seg(text)
    sentences = run.texClear(lines)
    print(sentences)
    keys, output = run.textToAb(sentences, lines, int(topic_num), int(max_length))
    keys_text = "\n".join(keys)
    output_text = "\n".join(output)
    print(keys, output)
    return (
        keys_text,
        output_text,
        dump_to_txt(output),
        dump_to_docx(output),
        dump_to_pdf(output),
    )


def file_dump_to_lines(file, topic_num, max_length):
    """Read a txt/docx/pdf file and run ``text_dump_to_lines`` on its text.

    The format is chosen from the part of ``file.name`` after the last dot,
    compared case-insensitively. Any other extension yields empty text, which
    is still forwarded to ``text_dump_to_lines``.

    Args:
        file: Object whose ``name`` attribute is the path of the file to read.
            NOTE(review): presumably an uploaded temp-file wrapper; confirm
            against the caller.
        topic_num: Forwarded unchanged to ``text_dump_to_lines``.
        max_length: Forwarded unchanged to ``text_dump_to_lines``.

    Returns:
        The 5-tuple returned by ``text_dump_to_lines``.
    """
    lines = []
    # Lower-case the extension so "REPORT.PDF" is handled like "report.pdf".
    file_format = file.name.rsplit(".", 1)[-1].lower()

    if file_format == "txt":
        with open(file.name, encoding='utf-8') as f:
            content = f.read()
        lines = [x.strip() for x in content.split("\n") if x.strip()]
    elif file_format == "docx":
        # Paragraph texts are taken as-is (empty paragraphs are kept).
        lines = [par.text for par in docx.Document(file.name).paragraphs]
    elif file_format == "pdf":
        pdf = fitz.open(file.name)
        try:
            for page in pdf:
                page_text = page.get_text("text")
                lines.extend(x.strip() for x in page_text.split("\n") if x.strip())
        finally:
            # Release the document handle even if text extraction fails.
            pdf.close()

    text = "\n".join(lines)
    print(text)
    return text_dump_to_lines(text, topic_num, max_length)


def dump_to_txt(lines):
    """Write ``lines`` joined by newlines to ``temp.txt`` (UTF-8).

    Args:
        lines: Iterable of strings to write.

    Returns:
        Absolute path of the written ``temp.txt``.
    """
    text = "\n".join(lines)
    with open('temp.txt', mode="w", encoding="utf-8") as f:
        f.write(text)
    return os.path.abspath('temp.txt')


def dump_to_docx(lines):
    # NOTE(review): only the opening statements of this function sit on this
    # source line; its remaining statements follow and are left unchanged.
    document = docx.Document()
    # Set the 'Normal' style font name, including the East Asian font attribute.
    document.styles['Normal'].font.name = u'宋体'
    document.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
document.styles['Normal'].font.size = Pt(14) document.styles['Normal'].font.color.rgb = RGBColor(0,0,0) paragraph = document.add_paragraph() run = paragraph.add_run() #run.font.name = 'Times New Roman' run.font.name=u'Cambria' run.font.color.rgb = RGBColor(0,0,0) run._element.rPr.rFonts.set(qn('w:eastAsia'), u'Cambria') for line in lines: document.add_paragraph(line) document.save(r'temp.docx') path = os.path.abspath('temp.docx') return path def dump_to_pdf(lines): pdf = FPDF() #读取字体文件 pdf.add_font('FZY3JW', '', 'FZY3JW.TTF', True) pdf.add_page() #设置pdf字体大小 pdf.set_font("FZY3JW", size=12) #打开txt文本 try: #按行读取txt文本内容 for line in lines: str=line num=len(str) temp=45#判断标志,实现pdf文件每行最多村45个字符 for j in range(0,num,temp): if(j+temp