|
import gradio as gr |
|
from setfit import SetFitModel |
|
|
|
import PyPDF2 |
|
import openpyxl |
|
import os |
|
import glob |
|
import re |
|
import itertools |
|
import platform |
|
from tempfile import TemporaryDirectory |
|
from pathlib import Path |
|
import pytesseract |
|
from pdf2image import convert_from_path |
|
from PIL import Image |
|
|
|
|
|
|
|
def ocrtotext(filename):
    """Extract the text of a PDF by running OCR on every page.

    Each page is rendered at 300 DPI, saved as a JPEG and read with
    Tesseract. Hyphen-plus-newline sequences ("-\\n") are removed so words
    split across lines are joined again. The resulting text is also appended
    to ``<filename>.txt`` next to the input file.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The concatenated OCR text of all pages, in page order.
    """
    text_file = f'{filename}.txt'

    pdf_pages = convert_from_path(filename, 300)

    page_texts = []
    # Page images live in a private temporary directory: fixed names such as
    # page_001.jpg in the working directory would be overwritten by any
    # concurrent call and would never be cleaned up.
    with TemporaryDirectory() as tempdir:
        for page_number, page in enumerate(pdf_pages, start=1):
            image_path = Path(tempdir) / f"page_{page_number:03}.jpg"
            page.save(image_path, "JPEG")

            # Close the image before the directory is removed (required on
            # platforms that refuse to delete open files).
            with Image.open(image_path) as image:
                text = str(pytesseract.image_to_string(image))

            page_texts.append(text.replace("-\n", ""))

    output = "".join(page_texts)

    # Append mode is kept on purpose so an existing transcript is never
    # truncated; the explicit encoding avoids locale-dependent write errors
    # on characters produced by the OCR engine.
    with open(text_file, "a", encoding="utf-8") as output_file:
        output_file.write(output)

    return output
|
|
|
def cortar_en_bloques(texto, longitud_bloque):
    """Split a text into blocks of at most ``longitud_bloque`` words.

    The text is tokenised on whitespace and each block is rebuilt by joining
    its words with single spaces. The last block may be shorter than the
    others. An empty or whitespace-only text yields an empty list.

    Args:
        texto: Text to split.
        longitud_bloque: Number of words per block.

    Returns:
        A list of strings, one per block, in the original word order.
    """
    def _word_groups():
        # Yield consecutive groups of words, closing a group as soon as it
        # reaches the requested size.
        group = []
        for word in texto.split():
            group.append(word)
            if len(group) == longitud_bloque:
                yield group
                group = []
        # Whatever is left over forms the final, possibly shorter, group.
        if group:
            yield group

    return [" ".join(group) for group in _word_groups()]
|
|
|
|
|
# Identifier of the pretrained SetFit classifier handed to from_pretrained().
MODEL_ID = "desarrolloasesoreslocales/SetFitPruebaRecorte"

# Loaded once at import time; predict() calls it on individual text chunks.
model = SetFitModel.from_pretrained(MODEL_ID)
|
|
|
|
|
|
|
|
|
def _select_chunks(chunks, margin_start, margin_end):
    """Return the indices of the chunks that survive one trimming pass.

    Only the first ``margin_start`` and the last ``margin_end`` chunks are
    candidates for removal; such a chunk is kept only when the classifier
    returns label 1 for it. All other chunks are kept without being
    classified. Progress is printed while the pass runs.
    """
    total = len(chunks)
    kept = []
    for i, chunk in enumerate(chunks):
        print('Recortando -', round((i / total) * 100), '%')
        # Decide by position rather than by text: a zero margin must select
        # nothing (chunks[-0:] would be the whole list), and a middle chunk
        # whose text happens to equal a margin chunk must not be affected.
        removable = i < margin_start or i >= total - margin_end
        if not removable or model.predict([chunk]).item() == 1:
            kept.append(i)
    print(100, '%')
    return kept


def predict(file):
    """OCR an uploaded PDF and trim its leading and trailing text.

    The text is trimmed in two passes. The first pass works on 150-word
    chunks and may drop chunks from the first and last 25% of the document.
    The second pass re-chunks the result into 80-word chunks and may drop
    chunks from the first and last 10%. In both passes a candidate chunk is
    kept only when the classifier labels it 1.

    Args:
        file: The uploaded file, either an object exposing the path as
            ``.name`` or the path itself as a string.

    Returns:
        The trimmed text, each kept chunk followed by a single space.
    """
    # Accept both a tempfile-like object and a plain path string.
    pdf_path = getattr(file, "name", file)
    ocr_text = ocrtotext(pdf_path)

    # First pass: coarse chunks, wide margins.
    chunks = cortar_en_bloques(ocr_text, 150)
    margin = int(len(chunks) * 0.25)
    kept = _select_chunks(chunks, margin, margin)

    pieces = [chunks[i] for i in kept]
    if kept and kept[0] > 0:
        # The chunk right before the first kept one is put back in front
        # (presumably because the wanted text can start inside it; verify
        # against the model's training setup). Joining with a space keeps
        # its last word from fusing with the next chunk's first word.
        pieces.insert(0, chunks[kept[0] - 1])
    recorte_general = " ".join(pieces)

    # Second pass: finer chunks, narrower margins.
    chunks2 = cortar_en_bloques(recorte_general, 80)
    margin2 = int(len(chunks2) * 0.1)
    kept2 = _select_chunks(chunks2, margin2, margin2)

    return "".join(chunks2[i] + " " for i in kept2)
|
|
|
|
|
# Gradio UI: one uploaded file in, the trimmed text out.
iface = gr.Interface(
    fn=predict,
    # gr.File is the current component API; the legacy gr.inputs namespace
    # is deprecated and absent from newer Gradio releases, and the output
    # below already uses the top-level component style.
    inputs=gr.File(),
    outputs=gr.Textbox(),
    # Run only when the user submits, not on every input change.
    live=False,
    title="Recortador de Texto",
)

# Start the web app as soon as the module is executed.
iface.launch()