Spaces:

manuelcozar55
/

LexAIcon_Mistral7B

Paused

App Files Files Community

LexAIcon_Mistral7B / app.py

manuelcozar55

Update app.py

99b5108 verified 8 months ago

raw

history blame

6.05 kB

	import warnings
	warnings.simplefilter(action='ignore', category=FutureWarning)

	import PyPDF2
	import gradio as gr
	from langchain.prompts import PromptTemplate
	from langchain.chains.summarize import load_summarize_chain
	from huggingface_hub import login
	from pathlib import Path
	from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
	from transformers import AutoTokenizer, AutoModelForSequenceClassification
	import torch
	import os

	# Read the Hugging Face access token from the environment (None when unset).
	huggingface_token = os.environ.get('HUGGINGFACE_TOKEN')

	# Log in to Hugging Face only when a token is available.
	if huggingface_token:
	    login(token=huggingface_token)

	# Summarization model configuration: a Mistral-7B-Instruct text-generation
	# endpoint, wrapped in a chat interface used by the summary and translation
	# functions below.
	# NOTE(review): `temperature` is set while `do_sample` is False; whether the
	# temperature has any effect depends on the serving backend -- confirm.
	llm = HuggingFaceEndpoint(
	    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
	    task="text-generation",
	    max_new_tokens=4096,
	    temperature=0.5,
	    do_sample=False,
	)
	llm_engine_hf = ChatHuggingFace(llm=llm)

	# Classification model configuration.
	# Maps the class ids produced by the classifier to document categories.
	id2label = {0: "multas", 1: "politicas_de_privacidad", 2: "contratos", 3: "denuncias", 4: "otros"}
	_label2id = {label: class_id for class_id, label in id2label.items()}

	_classifier_checkpoint = "mrm8488/legal-longformer-base-8192-spanish"
	tokenizer = AutoTokenizer.from_pretrained(_classifier_checkpoint)
	# Size the classification head to match `id2label`. Without `num_labels`
	# the library default of two labels applies, so ids 2-4 could never be
	# predicted.
	# NOTE(review): the checkpoint name suggests a base (not fine-tuned) model;
	# if so, the classification head is freshly initialised and its predictions
	# are not meaningful until it is fine-tuned -- confirm.
	model = AutoModelForSequenceClassification.from_pretrained(
	    _classifier_checkpoint,
	    num_labels=len(id2label),
	    id2label=id2label,
	    label2id=_label2id,
	)

	def read_pdf(file_path):
	    """Return the concatenated text of every page of a PDF file.

	    Args:
	        file_path: Path (or file-like object) handed to ``PyPDF2.PdfReader``.

	    Returns:
	        A single string with the extracted text of all pages, in page
	        order, with no separator inserted between pages.
	    """
	    pdf_reader = PyPDF2.PdfReader(file_path)
	    # Iterate the pages directly and join once instead of growing a string
	    # with `+=`. `or ""` guards against a page whose extraction yields
	    # nothing (e.g. None), which would otherwise raise a TypeError.
	    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

	def summarize(file):
	    """Summarize an uploaded document as a short bulleted list.

	    Args:
	        file: Uploaded file object exposing the on-disk path as ``.name``.
	            Paths ending in ``.pdf`` are read with ``read_pdf``; anything
	            else is read as UTF-8 text.

	    Returns:
	        The ``content`` of the chat model's reply to the summary prompt.
	    """
	    # Read the content of the uploaded file
	    source_path = file.name
	    if not source_path.endswith('.pdf'):
	        with open(source_path, 'r', encoding='utf-8') as handle:
	            document_text = handle.read()
	    else:
	        document_text = read_pdf(source_path)

	    template = '''
	Please carefully read the following document:
	<document>
	{TEXT}
	</document>
	After reading through the document, identify the key points and main ideas covered in the text. Organize these key points into a concise bulleted list that summarizes the essential information from the document. The summary should have a maximum of 10 bullet points.
	Your goal is to be comprehensive in capturing the core content of the document, while also being concise in how you express each summary point. Omit minor details and focus on the central themes and important facts.
	'''

	    summary_prompt = PromptTemplate(template=template, input_variables=['TEXT'])
	    prompt_text = summary_prompt.format(TEXT=document_text)

	    # The chat wrapper returns a message object; only its text is returned.
	    reply = llm_engine_hf.invoke(prompt_text)
	    return reply.content

	def classify_text(text):
	    """Classify a document's text into one of the ``id2label`` categories.

	    Args:
	        text: Raw document text. It is tokenized with truncation and
	            padding to 4096 tokens.

	    Returns:
	        The label string from ``id2label`` for the highest-scoring class.
	    """
	    encoded = tokenizer(
	        text,
	        return_tensors="pt",
	        max_length=4096,
	        truncation=True,
	        padding="max_length",
	    )
	    # Switch to inference mode and skip gradient tracking for the forward pass.
	    model.eval()
	    with torch.no_grad():
	        scores = model(**encoded).logits
	    best_class_id = scores.argmax(dim=-1).item()
	    return id2label[best_class_id]

	def translate(file, target_language):
	    """Translate an uploaded document into the requested language.

	    Args:
	        file: Uploaded file object exposing the on-disk path as ``.name``.
	            Paths ending in ``.pdf`` are read with ``read_pdf``; anything
	            else is read as UTF-8 text.
	        target_language: Language name or code inserted into the prompt.

	    Returns:
	        The translated text as a string (the ``content`` of the chat
	        model's reply), matching what ``summarize`` returns.
	    """
	    # Read the content of the uploaded file
	    file_path = file.name
	    if file_path.endswith('.pdf'):
	        text = read_pdf(file_path)
	    else:
	        with open(file_path, 'r', encoding='utf-8') as f:
	            text = f.read()

	    template = '''
	Please translate the following document to {LANGUAGE}:
	<document>
	{TEXT}
	</document>
	Ensure that the translation is accurate and preserves the original meaning of the document.
	'''

	    prompt = PromptTemplate(
	        template=template,
	        input_variables=['TEXT', 'LANGUAGE']
	    )

	    formatted_prompt = prompt.format(TEXT=text, LANGUAGE=target_language)
	    translated_text = llm_engine_hf.invoke(formatted_prompt)

	    # Return the message text rather than the message object itself, so the
	    # result can be shown in a text box and written to a file.
	    return translated_text.content

	def process_file(file, action, target_language=None):
	    """Run the selected action on the uploaded file and return its result.

	    Args:
	        file: Uploaded file object exposing the on-disk path as ``.name``,
	            or None when nothing has been uploaded.
	        action: One of "Resumen", "Clasificar" or "Traducir".
	        target_language: Target language, required only for "Traducir".

	    Returns:
	        The summary, predicted label or translation produced by the
	        matching handler, or a short message when the action is not
	        recognised, no file was uploaded, or no translation language was
	        chosen.
	    """
	    if action not in ("Resumen", "Clasificar", "Traducir"):
	        return "Acción no válida"

	    # Without this guard a missing upload fails with AttributeError on
	    # `file.name` instead of telling the user what is wrong.
	    if file is None:
	        return "No se ha subido ningún archivo"

	    if action == "Resumen":
	        return summarize(file)

	    if action == "Traducir":
	        # An empty selection would otherwise ask the model to translate
	        # "to None".
	        if not target_language:
	            return "Seleccione un idioma de traducción"
	        return translate(file, target_language)

	    # Remaining case: "Clasificar".
	    file_path = file.name
	    if file_path.endswith('.pdf'):
	        text = read_pdf(file_path)
	    else:
	        with open(file_path, 'r', encoding='utf-8') as f:
	            text = f.read()
	    return classify_text(text)

	def download_text(output_text, filename='output.txt'):
	    """Write ``output_text`` to ``filename`` as UTF-8 and return its path.

	    Args:
	        output_text: Text to save. Falsy values (None, empty string) are
	            not written.
	        filename: Destination file name or path.

	    Returns:
	        A ``Path`` to the written file, or None when there was no text.
	    """
	    if not output_text:
	        return None
	    destination = Path(filename)
	    destination.write_text(output_text, encoding='utf-8')
	    return destination

	def create_download_file(output_text, filename='output.txt'):
	    """Save ``output_text`` via ``download_text`` and return the path as a string.

	    Args:
	        output_text: Text to save.
	        filename: Destination file name or path.

	    Returns:
	        The saved file's path as ``str``, or None when ``download_text``
	        wrote nothing.
	    """
	    saved_path = download_text(output_text, filename)
	    if saved_path:
	        return str(saved_path)
	    return None

	# Build the Gradio interface
	with gr.Blocks() as demo:
	    gr.Markdown("## Document Processor")

	    with gr.Row():
	        # Left column: inputs.
	        with gr.Column():
	            file = gr.File(label="Subir un archivo")
	            action = gr.Radio(label="Seleccione una acción", choices=["Resumen", "Clasificar", "Traducir"])
	            target_language = gr.Dropdown(label="Seleccionar idioma de traducción", choices=["en", "fr", "de"], visible=False)

	        # Right column: result.
	        with gr.Column():
	            output_text = gr.Textbox(label="Resultado", lines=20)

	    def update_language_dropdown(selected_action):
	        """Show the language dropdown only while "Traducir" is selected."""
	        return gr.update(visible=(selected_action == "Traducir"))

	    action.change(update_language_dropdown, inputs=action, outputs=target_language)

	    submit_button = gr.Button("Procesar")
	    submit_button.click(process_file, inputs=[file, action, target_language], outputs=output_text)

	    def generate_file(result_text, selected_action):
	        """Write the current result to a file and return its path for download.

	        The current values of the result box and the action selector are
	        received as arguments: the components' `.value` attribute holds
	        only the value they were built with, not what the user sees.
	        """
	        filename = 'translation.txt' if selected_action == 'Traducir' else 'summary.txt'
	        # Hand the file component a plain string path (or None when there
	        # is nothing to download).
	        return create_download_file(result_text, filename)

	    download_button = gr.Button("Descargar Resultado")
	    download_button.click(
	        fn=generate_file,
	        # The handler's parameters must match the listed inputs one-to-one.
	        inputs=[output_text, action],
	        outputs=gr.File()
	    )

	# Launch the Gradio application
	demo.launch(share=True)