TSA / textInput.py
QINGCHE's picture
fix
7589954
import run
import util
import docx
from docx.oxml.ns import qn
from docx.shared import Pt,RGBColor
import fitz
import os
from fpdf import FPDF
import run
from BERT_inference import BertClassificationModel
def text_dump_to_lines(text, topic_num, max_length):
    """Summarize raw text and export the result as txt, docx and pdf files.

    The text is segmented with ``util.seg``, cleaned with ``run.texClear``
    and passed to ``run.textToAb`` together with the integer-converted
    ``topic_num`` and ``max_length``.

    Args:
        text: Raw input text to process.
        topic_num: Value convertible with ``int()``; forwarded to
            ``run.textToAb`` as its third argument.
        max_length: Value convertible with ``int()``; forwarded to
            ``run.textToAb`` as its fourth argument.

    Returns:
        A 5-tuple ``(keys_text, output_text, txt_path, docx_path, pdf_path)``
        where the first two items are the newline-joined results of
        ``run.textToAb`` and the last three are the absolute paths returned
        by the ``dump_to_*`` helpers.
    """
    segmented = util.seg(text)
    cleaned = run.texClear(segmented)
    print(cleaned)  # debug trace of the cleaned sentences

    keys, output = run.textToAb(
        cleaned,
        segmented,
        int(topic_num),
        int(max_length),
    )
    joined_keys = "\n".join(keys)
    joined_output = "\n".join(output)
    print(keys, output)  # debug trace of the raw results

    # Export in the same order as before: txt, then docx, then pdf.
    txt_path = dump_to_txt(output)
    docx_path = dump_to_docx(output)
    pdf_path = dump_to_pdf(output)
    return joined_keys, joined_output, txt_path, docx_path, pdf_path
def _read_source_lines(path):
    """Return the text lines of the txt, docx or pdf file at *path*.

    The format is chosen from the text after the last ``.`` in *path*,
    compared case-insensitively. Any other extension yields an empty list.
    """
    # Lower-case the extension so that e.g. "REPORT.PDF" is not silently
    # treated as an unsupported format.
    file_format = path.split(".")[-1].lower()

    if file_format == "txt":
        with open(path, encoding='utf-8') as f:
            content = f.read()
        return [x.strip() for x in content.split("\n") if x.strip() != '']

    if file_format == "docx":
        # Paragraph text is kept as-is (empty paragraphs included).
        return [par.text for par in docx.Document(path).paragraphs]

    if file_format == "pdf":
        lines = []
        pdf = fitz.open(path)
        try:
            for page in pdf:
                page_text = page.get_text("text")
                lines.extend(
                    x.strip() for x in page_text.split("\n") if x.strip() != ''
                )
        finally:
            # Release the document handle even if text extraction fails.
            pdf.close()
        return lines

    return []


def file_dump_to_lines(file, topic_num, max_length):
    """Summarize an uploaded txt/docx/pdf file and export the result.

    Args:
        file: Object exposing a ``name`` attribute that holds the path of
            the file to read (presumably an upload/tempfile wrapper —
            confirm against the caller).
        topic_num: Value convertible with ``int()``; forwarded to
            ``run.textToAb`` as its third argument.
        max_length: Value convertible with ``int()``; forwarded to
            ``run.textToAb`` as its fourth argument.

    Returns:
        A 5-tuple ``(keys_text, output_text, txt_path, docx_path, pdf_path)``
        where the first two items are the newline-joined results of
        ``run.textToAb`` and the last three are the absolute paths returned
        by the ``dump_to_*`` helpers.
    """
    lines = _read_source_lines(file.name)
    sentences = run.texClear(lines)
    keys, output = run.textToAb(sentences, lines, int(topic_num), int(max_length))
    keysText = "\n".join(keys)
    outputText = "\n".join(output)
    return keysText, outputText, dump_to_txt(output), dump_to_docx(output), dump_to_pdf(output)
def dump_to_txt(lines):
    """Write *lines* to ``temp.txt`` and return the file's absolute path.

    Args:
        lines: Iterable of strings; they are joined with ``"\\n"`` and
            written as UTF-8 (no trailing newline is added).

    Returns:
        Absolute path of ``temp.txt`` in the current working directory.
    """
    target = 'temp.txt'
    joined = "\n".join(lines)
    with open(target, mode="w", encoding="utf-8") as handle:
        handle.write(joined)
    return os.path.abspath(target)
def dump_to_docx(lines):
    """Write *lines* to ``temp.docx`` and return the file's absolute path.

    The document's ``Normal`` style is set to the font named below (also for
    East Asian text) at 14 pt in black. An initial paragraph holding one
    empty run configured with the Cambria font is added first, then each
    item of *lines* is added as its own paragraph.

    Args:
        lines: Iterable of strings, one paragraph per item.

    Returns:
        Absolute path of ``temp.docx`` in the current working directory.
    """
    black = RGBColor(0, 0, 0)
    east_asia_attr = qn('w:eastAsia')
    output_name = r'temp.docx'

    document = docx.Document()

    # Configure the default ("Normal") style.
    document.styles['Normal'].font.name = u'宋体'
    document.styles['Normal']._element.rPr.rFonts.set(east_asia_attr, u'宋体')
    document.styles['Normal'].font.size = Pt(14)
    document.styles['Normal'].font.color.rgb = black

    # Leading paragraph with a single empty run using Cambria.
    # Named `first_run` so the imported `run` module is not shadowed.
    first_run = document.add_paragraph().add_run()
    first_run.font.name = u'Cambria'
    first_run.font.color.rgb = black
    first_run._element.rPr.rFonts.set(east_asia_attr, u'Cambria')

    for text in lines:
        document.add_paragraph(text)

    document.save(output_name)
    return os.path.abspath('temp.docx')
def dump_to_pdf(lines):
    """Write *lines* to ``temp.pdf`` and return the file's absolute path.

    Each item of *lines* is split into chunks of at most 45 characters and
    every chunk is written as its own cell, followed by a line break. If
    anything raises while the cells are being written, the exception is
    printed, the remaining text is skipped, and the PDF is still saved.

    Args:
        lines: Iterable of strings to render.

    Returns:
        Absolute path of ``temp.pdf`` in the current working directory.
    """
    chars_per_row = 45  # maximum number of characters written per PDF row

    document = FPDF()
    # Register the font file, then select it at size 12.
    document.add_font('FZY3JW', '', 'FZY3JW.TTF', True)
    document.add_page()
    document.set_font("FZY3JW", size=12)

    try:
        for text in lines:
            # Slicing clamps at the end of the string, so the final chunk is
            # simply shorter; an empty string produces no rows at all.
            for start in range(0, len(text), chars_per_row):
                document.cell(0, 5, text[start:start + chars_per_row], ln=1)
    except Exception as e:
        print(e)

    document.output("temp.pdf")
    return os.path.abspath('temp.pdf')
if __name__ == "__main__":
    # Manual smoke run: process the contents of test.txt with
    # topic_num=10 and max_length=50.
    with open('test.txt', encoding='utf-8') as sample_file:
        sample_text = sample_file.read()
    text_dump_to_lines(sample_text, 10, 50)