""" !pip install nomic !pip install --upgrade langchain ! nomic login ! nomic login nk-bqukmTuFJHW8tgXzXXBw1qDL062-pth-ACecKP7CkXs ! pip install -U langchain-nomic langchain_community tiktoken langchain-openai chromadb langchain """ import json from langchain_community.document_loaders import JSONLoader from langchain.docstore.document import Document # Define el JSONLoader para cargar y procesar cada mensaje del JSON class JSONLoader: def __init__(self, message): self.message = message def load(self): # Crear una instancia de Document con el contenido y metadata adecuada return Document( page_content=self.message['content'], metadata={ 'role': self.message['role'], 'conversation_id': self.message['conversation_id'], 'message_id': self.message['message_id'] } ) # Cargar el archivo JSON file_path = 'RAG_Datos.json' # Asegúrate de que esta ruta sea correcta with open(file_path, 'r') as file: data = json.load(file) # Procesar los mensajes y crear los documentos docs_list = [] for conversation in data: for message in conversation['messages']: docs_list.append(JSONLoader(message).load()) # Verificar el contenido (opcional) for doc in docs_list: print(doc.page_content, doc.metadata) """ ## Splitting Long context retrieval, Chunck_size -> tamaño de cada texto """ # Ahora puedes usar docs_list con text_splitter from langchain.text_splitter import CharacterTextSplitter text_splitter = CharacterTextSplitter( chunk_size=7500, chunk_overlap=100 ) doc_splits = text_splitter.split_documents(docs_list) # Verificar el contenido de los splits (opcional) for split in doc_splits: print(split.page_content, split.metadata) import tiktoken encoding = tiktoken.get_encoding("cl100k_base") encoding = tiktoken.encoding_for_model("gpt-3.5-turbo") for d in doc_splits: print("The document is %s tokens" % len(encoding.encode(d.page_content))) """## Index Nomic embeddings [here](https://docs.nomic.ai/reference/endpoints/nomic-embed-text). 
""" import os from langchain_community.vectorstores import Chroma from langchain_core.output_parsers import StrOutputParser from langchain_core.runnables import RunnableLambda, RunnablePassthrough from langchain_nomic import NomicEmbeddings from langchain_nomic.embeddings import NomicEmbeddings # Add to vectorDB vectorstore = Chroma.from_documents( documents=doc_splits, collection_name="rag-chroma", embedding=NomicEmbeddings(model="nomic-embed-text-v1"), ) retriever = vectorstore.as_retriever() # RAG Chain import os from sklearn.metrics import precision_score, recall_score, f1_score from nltk.translate.bleu_score import corpus_bleu from langchain_core.prompts import ChatPromptTemplate from langchain_openai import ChatOpenAI from langchain.chains import LLMChain # Configurar la clave de API como variable de entorno os.environ['OPENAI_API_KEY'] = 'XXXX' # Prompt template = """Answer the question based only on the following context: {context} Question: {question} """ prompt = ChatPromptTemplate.from_template(template) # LLM API model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo") # Placeholder para `retriever` class DummyRetriever: def __call__(self, *args, **kwargs): return {"context": "This is a test context"} retriever = DummyRetriever() # Crear una cadena LLM llm_chain = LLMChain( prompt=prompt, llm=model, ) # Datos de prueba test_data = [ {"context": "Write a Python function to sum all prime numbers up to 1000.", "question": "How to write a function to sum all prime numbers up to 1000?", "expected_answer": "def sum_primes(limit):\n def is_prime(n):\n if n <= 1:\n return False\n for i in range(2, int(n**0.5) + 1):\n if n % i == 0:\n return False\n return True\n return sum(x for x in range(limit) if is_prime(x))\n\nprint(sum_primes(1000))"}, {"context": "Write a Python function to calculate the factorial of a number.", "question": "How to write a function to calculate the factorial of a number?", "expected_answer": "def factorial(n):\n if n == 0:\n return 1\n else:\n return n * factorial(n-1)\n\nprint(factorial(5))"}, {"context": "Write a Python function to check if a number is palindrome.", "question": "How to write a function to check if a number is palindrome?", "expected_answer": "def is_palindrome(n):\n return str(n) == str(n)[::-1]\n\nprint(is_palindrome(121))"}, {"context": "Write a Python function to generate Fibonacci sequence up to n.", "question": "How to write a function to generate Fibonacci sequence up to n?", "expected_answer": "def fibonacci(n):\n fib_sequence = [0, 1]\n while len(fib_sequence) < n:\n fib_sequence.append(fib_sequence[-1] + fib_sequence[-2])\n return fib_sequence\n\nprint(fibonacci(10))"}, {"context": "Write a Python function to find the greatest common divisor (GCD) of two numbers.", "question": "How to write a function to find the greatest common divisor (GCD) of two numbers?", "expected_answer": "def gcd(a, b):\n while b:\n a, b = b, a % b\n return a\n\nprint(gcd(48, 18))"}, {"context": "Write a Python function to check if a string is an anagram of another string.", "question": "How to write a function to check if a string is an anagram of another string?", "expected_answer": "def is_anagram(str1, str2):\n return sorted(str1) == sorted(str2)\n\nprint(is_anagram('listen', 'silent'))"}, {"context": "Write a Python function to find the maximum element in a list.", "question": "How to write a function to find the maximum element in a list?", "expected_answer": "def find_max(lst):\n return max(lst)\n\nprint(find_max([3, 5, 7, 2, 8]))"}, {"context": "Write 
    {"context": "Write a Python function to reverse a string.", "question": "How to write a function to reverse a string?", "expected_answer": "def reverse_string(s):\n    return s[::-1]\n\nprint(reverse_string('hello'))"},
    {"context": "Write a Python function to merge two sorted lists.", "question": "How to write a function to merge two sorted lists?", "expected_answer": "def merge_sorted_lists(lst1, lst2):\n    return sorted(lst1 + lst2)\n\nprint(merge_sorted_lists([1, 3, 5], [2, 4, 6]))"},
    {"context": "Write a Python function to remove duplicates from a list.", "question": "How to write a function to remove duplicates from a list?", "expected_answer": "def remove_duplicates(lst):\n    return list(set(lst))\n\nprint(remove_duplicates([1, 2, 2, 3, 4, 4, 5]))"},
]

# Evaluate retrieval precision, recall, and F1-score
retrieved_contexts = [retriever()["context"] for _ in test_data]
expected_contexts = [item["context"] for item in test_data]

precision = precision_score(expected_contexts, retrieved_contexts, average='macro', zero_division=1)
recall = recall_score(expected_contexts, retrieved_contexts, average='macro', zero_division=1)
f1 = f1_score(expected_contexts, retrieved_contexts, average='macro')

print(f"Retrieval Precision: {precision}")
print(f"Retrieval Recall: {recall}")
print(f"Retrieval F1 Score: {f1}")

# Evaluate answer generation
generated_answers = []
for item in test_data:
    output = llm_chain.run({"context": item["context"], "question": item["question"]})
    generated_answers.append(output)

# BLEU Score
reference_answers = [[item["expected_answer"].split()] for item in test_data]
generated_answers_tokens = [answer.split() for answer in generated_answers]
bleu_score = corpus_bleu(reference_answers, generated_answers_tokens)
print(f"BLEU Score: {bleu_score}")
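# The "RAG Chain" section above swaps in DummyRetriever for the metric tests, so the
# Chroma retriever is never wired into a complete chain. A minimal sketch of how that
# could look with the already-imported LCEL pieces (RunnablePassthrough, StrOutputParser);
# the format_docs helper is illustrative, not part of the original notebook.

def format_docs(docs):
    # Join the retrieved Documents into a single context string for the prompt
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": vectorstore.as_retriever() | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

# Example query against the indexed conversation data
print(rag_chain.invoke("How to write a function to reverse a string?"))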