# Hugging Face Spaces page banner captured with this file: "Spaces: Runtime error".
import os
import warnings
from pathlib import Path

# Installed before the third-party imports so FutureWarnings raised while
# importing them are filtered as well.
warnings.simplefilter(action='ignore', category=FutureWarning)

import gradio as gr
import PyPDF2
import torch
from huggingface_hub import login
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Access token for the Hugging Face Hub, read from the environment.
huggingface_token = os.getenv('HUGGINGFACE_TOKEN')

# Log in to Hugging Face only when the token is available.
if huggingface_token:
    login(token=huggingface_token)

# LLM configuration (used by summarize() and translate()).
# NOTE(review): temperature=0.5 is set alongside do_sample=False; confirm
# which decoding mode is actually intended.
# NOTE(review): return_full_text=True is requested; verify that the prompt is
# not echoed back in the text shown to the user.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    task="text-generation",
    max_new_tokens=4096,
    temperature=0.5,
    do_sample=False,
    return_full_text=True,
)
llm_engine_hf = ChatHuggingFace(llm=llm)

# Classification model configuration.
# NOTE(review): from_pretrained is called without num_labels / id2label, so
# the size of the classification head comes from the checkpoint's own config.
# Confirm it matches the five categories in id2label below and that the head
# has actually been fine-tuned for them.
tokenizer = AutoTokenizer.from_pretrained("mrm8488/legal-longformer-base-8192-spanish")
model = AutoModelForSequenceClassification.from_pretrained("mrm8488/legal-longformer-base-8192-spanish")

# Maps the predicted class index to the document category shown to the user.
id2label = {0: "multas", 1: "politicas_de_privacidad", 2: "contratos", 3: "denuncias", 4: "otros"}
def read_file(file):
    """Return the text content of an uploaded file.

    Args:
        file: Either a plain path (``str`` / ``os.PathLike``) or an object
            that exposes the path in its ``name`` attribute, which is what
            the original implementation required.

    Returns:
        The document text. Files whose name ends in ``.pdf`` (any letter
        case) are delegated to ``read_pdf``; everything else is read as
        UTF-8 text.

    Raises:
        ValueError: If ``file`` is ``None``.
        UnicodeDecodeError: If a non-PDF file is not valid UTF-8.
    """
    if file is None:
        raise ValueError("No file was provided")

    # Plain paths are used directly; for any other object the path lives in
    # ``.name``. (A pathlib.Path also has ``.name``, but that is only the
    # final component, so path-like objects must be handled first.)
    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = file.name

    if file_path.lower().endswith('.pdf'):
        return read_pdf(file_path)

    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.read()
def read_pdf(file_path):
    """Extract and concatenate the text of every page of a PDF.

    Args:
        file_path: Location of the PDF, passed straight to
            ``PyPDF2.PdfReader``.

    Returns:
        The page texts joined in page order with no separator. A page for
        which ``extract_text()`` yields nothing contributes an empty string.
    """
    pdf_reader = PyPDF2.PdfReader(file_path)
    # ``or ""`` guards against extract_text() returning None for a page,
    # which would otherwise break string concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
def summarize(text, summary_length):
    """Ask the chat model for a Spanish summary of ``text``.

    Args:
        text: Full document text to summarize.
        summary_length: ``'Corto'`` requests at most 100 words, ``'Medio'``
            at most 500; any other value (including ``'Largo'`` and
            ``None``) requests at most 1000.

    Returns:
        The ``content`` attribute of the response returned by
        ``llm_engine_hf.invoke``.
    """
    word_limits = {'Corto': 100, 'Medio': 500}
    max_words = 1000
    for choice, limit in word_limits.items():
        if summary_length == choice:
            max_words = limit
            break
    length_instruction = f"El resumen debe tener un máximo de {max_words} palabras."

    # ``{{TEXT}}`` is escaped so the f-string leaves a literal ``{TEXT}``
    # placeholder for PromptTemplate; only the length instruction is
    # interpolated here.
    template = f'''
Por favor, lea detenidamente el siguiente documento:
<document>
{{TEXT}}
</document>
Después de leer el documento, identifique los puntos clave y las ideas principales cubiertas en el texto. {length_instruction}
Su objetivo es ser exhaustivo en la captura del contenido central del documento, mientras que también es conciso en la expresión de cada punto del resumen. Omita los detalles menores y concéntrese en los temas centrales y hechos importantes.
'''
    prompt = PromptTemplate(
        template=template,
        input_variables=['TEXT'],
    )
    formatted_prompt = prompt.format(TEXT=text)
    output_summary = llm_engine_hf.invoke(formatted_prompt)
    return output_summary.content
def classify_text(text):
    """Classify a document into one of the categories in ``id2label``.

    Args:
        text: Document text to classify.

    Returns:
        A string of the form ``"Clasificación: <label>"``, where the label
        is the ``id2label`` entry for the highest-scoring logit.

    Raises:
        KeyError: If the predicted class index has no entry in ``id2label``.
    """
    # The input is truncated to 4096 tokens and padded up to that same
    # length, so every call feeds the model a 4096-token sequence.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=4096,
        truncation=True,
        padding="max_length",
    )
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_id = logits.argmax(dim=-1).item()
    # NOTE(review): this assumes the model's classification head emits one
    # logit per id2label entry -- confirm against how the model is loaded.
    predicted_label = id2label[predicted_class_id]
    return f"Clasificación: {predicted_label}"
def translate(text, target_language):
    """Ask the chat model to translate ``text`` into ``target_language``.

    Args:
        text: Document text to translate.
        target_language: Target language. The codes ``"en"``, ``"fr"`` and
            ``"de"`` are expanded to their Spanish names so the prompt
            reads naturally (e.g. "al inglés" rather than "al en"); any
            other value is inserted into the prompt unchanged.

    Returns:
        The ``content`` attribute of the response returned by
        ``llm_engine_hf.invoke``.
    """
    language_names = {"en": "inglés", "fr": "francés", "de": "alemán"}
    language = target_language
    for code, spanish_name in language_names.items():
        if target_language == code:
            language = spanish_name
            break

    template = '''
Por favor, traduzca el siguiente documento al {LANGUAGE}:
<document>
{TEXT}
</document>
Asegúrese de que la traducción sea precisa y conserve el significado original del documento.
'''
    prompt = PromptTemplate(
        template=template,
        input_variables=['TEXT', 'LANGUAGE'],
    )
    formatted_prompt = prompt.format(TEXT=text, LANGUAGE=language)
    translated_text = llm_engine_hf.invoke(formatted_prompt)
    return translated_text.content
def process_file(file, action, target_language=None, summary_length=None):
    """Run the selected action on an uploaded file and return the result text.

    Args:
        file: Uploaded file, forwarded to ``read_file``; ``None`` when
            nothing has been uploaded.
        action: ``"Resumen"``, ``"Clasificar"`` or ``"Traducir"``.
        target_language: Language forwarded to ``translate``; only used
            when ``action`` is ``"Traducir"``.
        summary_length: Length choice forwarded to ``summarize``; only used
            when ``action`` is ``"Resumen"``.

    Returns:
        The text produced by the chosen action, or a short Spanish message
        when the action is not recognised or no file was uploaded.
    """
    # Validate the cheap inputs first so an invalid request never triggers
    # file parsing or a model call.
    if action not in ("Resumen", "Clasificar", "Traducir"):
        return "Acción no válida"
    if file is None:
        return "No se ha subido ningún archivo"

    text = read_file(file)
    if action == "Resumen":
        return summarize(text, summary_length)
    if action == "Clasificar":
        return classify_text(text)
    return translate(text, target_language)
# Build the Gradio interface.
# NOTE(review): the original indentation was lost, so the nesting of the
# layout below is reconstructed -- confirm the intended placement of widgets.
with gr.Blocks() as demo:
    gr.Markdown("## LexAIcon: Traducción Resumen y Clasificación de textos legales.")
    with gr.Row():
        with gr.Column():
            file = gr.File(label="Subir un archivo")
            action = gr.Radio(label="Seleccione una acción", choices=["Resumen", "Clasificar", "Traducir"])
            # Both option widgets start hidden; update_ui reveals the one
            # that applies to the selected action.
            summary_length = gr.Radio(label="Seleccione la longitud del resumen", choices=["Corto", "Medio", "Largo"], visible=False)
            target_language = gr.Dropdown(label="Seleccionar idioma de traducción", choices=["en", "fr", "de"], visible=False)
        with gr.Column():
            output_text = gr.Textbox(label="Resultado", lines=20)

    def update_ui(action):
        """Return visibility updates for (summary_length, target_language).

        The summary-length selector is shown only for "Resumen" and the
        language selector only for "Traducir"; every other action hides both.
        """
        return (
            gr.update(visible=(action == "Resumen")),
            gr.update(visible=(action == "Traducir")),
        )

    action.change(update_ui, inputs=action, outputs=[summary_length, target_language])
    submit_button = gr.Button("Procesar")
    submit_button.click(process_file, inputs=[file, action, target_language, summary_length], outputs=output_text)

# Launch the Gradio application.
demo.launch(share=True)