import streamlit as st from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, AutoConfig, AutoModelForSequenceClassification from langchain_community.llms import HuggingFacePipeline from langchain.prompts import PromptTemplate from langchain.chains import LLMChain from langchain_community.embeddings import HuggingFaceEmbeddings from PyPDF2 import PdfReader from docx import Document import csv import json import torch from langchain_community.vectorstores import FAISS from langchain.text_splitter import RecursiveCharacterTextSplitter from huggingface_hub import login # Autenticación en Hugging Face huggingface_token = st.secrets["HUGGINGFACE_TOKEN"] login(huggingface_token) # Configurar modelo y tokenizador model_name = 'Qwen/Qwen2-1.5B' model_config = AutoConfig.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) tokenizer.pad_token = tokenizer.eos_token tokenizer.padding_side = "right" text_generation_pipeline = pipeline( model=model_name, tokenizer=tokenizer, task="text-generation", temperature=0.2, repetition_penalty=1.1, return_full_text=True, max_new_tokens=1000, ) prompt_template = """ ### [INST] Instruction: Answer the question based on your knowledge. Here is context to help: {context} ### QUESTION: {question} [/INST] """ mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline) # Crear el prompt desde la plantilla de prompt prompt = PromptTemplate( input_variables=["context", "question"], template=prompt_template, ) # Crear la cadena LLM llm_chain = LLMChain(llm=mistral_llm, prompt=prompt) # Función para manejar archivos subidos def handle_uploaded_file(uploaded_file): try: if uploaded_file.name.endswith(".txt"): text = uploaded_file.read().decode("utf-8") elif uploaded_file.name.endswith(".pdf"): reader = PdfReader(uploaded_file) text = "" for page in range(len(reader.pages)): text += reader.pages[page].extract_text() elif uploaded_file.name.endswith(".docx"): doc = Document(uploaded_file) text = "\n".join([para.text for para in doc.paragraphs]) elif uploaded_file.name.endswith(".csv"): text = "" content = uploaded_file.read().decode("utf-8").splitlines() reader = csv.reader(content) text = " ".join([" ".join(row) for row in reader]) elif uploaded_file.name.endswith(".json"): data = json.load(uploaded_file) text = json.dumps(data, indent=4) else: text = "Tipo de archivo no soportado." return text except Exception as e: return str(e) # Función para traducir texto def translate(text, target_language): context = "" question = f"Por favor, traduzca el siguiente documento al {target_language}:\n{text}\nAsegúrese de que la traducción sea precisa y conserve el significado original del documento." response = llm_chain.run(context=context, question=question) return response # Función para resumir texto def summarize(text, length): context = "" question = f"Por favor, haga un resumen {length} del siguiente documento:\n{text}\nAsegúrese de que el resumen sea conciso y conserve el significado original del documento." response = llm_chain.run(context=context, question=question) return response # Configuración del modelo de clasificación @st.cache_resource def load_classification_model(): tokenizer_cls = AutoTokenizer.from_pretrained("mrm8488/legal-longformer-base-8192-spanish") model_cls = AutoModelForSequenceClassification.from_pretrained("mrm8488/legal-longformer-base-8192-spanish") return model_cls, tokenizer_cls classification_model, classification_tokenizer = load_classification_model() id2label = {0: "multas", 1: "politicas_de_privacidad", 2: "contratos", 3: "denuncias", 4: "otros"} def classify_text(text): inputs = classification_tokenizer(text, return_tensors="pt", max_length=4096, truncation=True, padding="max_length") classification_model.eval() with torch.no_grad(): outputs = classification_model(**inputs) logits = outputs.logits predicted_class_id = logits.argmax(dim=-1).item() predicted_label = id2label[predicted_class_id] return predicted_label # Función para cargar documentos JSON def load_json_documents(category): try: with open(f"./{category}.json", "r", encoding="utf-8") as f: data = json.load(f)["questions_and_answers"] documents = [entry["question"] + " " + entry["answer"] for entry in data] return documents except FileNotFoundError: return [] # Configuración de FAISS y embeddings @st.cache_resource def create_vector_store(docs): embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-l6-v2", model_kwargs={"device": "cpu"}) text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150) split_docs = text_splitter.split_text(docs) vector_store = FAISS.from_texts(split_docs, embeddings) return vector_store def explain_text(user_input, document_text): classification = classify_text(document_text) if classification in ["multas", "politicas_de_privacidad", "contratos", "denuncias"]: docs = load_json_documents(classification) if docs: vector_store = create_vector_store(docs) search_docs = vector_store.similarity_search(user_input) context = " ".join([doc.page_content for doc in search_docs]) else: context = "" else: context = "" question = user_input response = llm_chain.run(context=context, question=question) return response def main(): st.title("LexAIcon") st.write("Puedes conversar con este chatbot basado en Mistral-7B-Instruct y subir archivos para que el chatbot los procese.") with st.sidebar: st.caption("[Consigue un HuggingFace Token](https://huggingface.co/settings/tokens)") operation = st.radio("Selecciona una operación", ["Resumir", "Traducir", "Explicar"]) if operation == "Explicar": user_input = st.text_area("Introduce tu pregunta:", "") uploaded_file = st.file_uploader("Sube un archivo", type=["txt", "pdf", "docx", "csv", "json"]) if uploaded_file and user_input: document_text = handle_uploaded_file(uploaded_file) bot_response = explain_text(user_input, document_text) st.write(f"**Assistant:** {bot_response}") else: uploaded_file = st.file_uploader("Sube un archivo", type=["txt", "pdf", "docx", "csv", "json"]) if uploaded_file: document_text = handle_uploaded_file(uploaded_file) if operation == "Traducir": target_language = st.selectbox("Selecciona el idioma de traducción", ["español", "inglés", "francés", "alemán"]) if target_language: bot_response = translate(document_text, target_language) st.write(f"**Assistant:** {bot_response}") elif operation == "Resumir": summary_length = st.selectbox("Selecciona la longitud del resumen", ["corto", "medio", "largo"]) if summary_length: if summary_length == "corto": length = "de aproximadamente 50 palabras" elif summary_length == "medio": length = "de aproximadamente 100 palabras" elif summary_length == "largo": length = "de aproximadamente 500 palabras" bot_response = summarize(document_text, length) st.write(f"**Assistant:** {bot_response}") if __name__ == "__main__": main()