"""
!pip install nomic
!pip install --upgrade langchain
! nomic login
! nomic login nk-bqukmTuFJHW8tgXzXXBw1qDL062-pth-ACecKP7CkXs
! pip install -U langchain-nomic langchain_community tiktoken langchain-openai chromadb langchain
"""
import json
from langchain.docstore.document import Document

# Custom loader that wraps each JSON message as a LangChain Document.
# (The langchain_community JSONLoader import is omitted here because this
# class would shadow it.)
class JSONLoader:
    def __init__(self, message):
        self.message = message

    def load(self):
        # Build a Document with the message content and the relevant metadata
        return Document(
            page_content=self.message['content'],
            metadata={
                'role': self.message['role'],
                'conversation_id': self.message['conversation_id'],
                'message_id': self.message['message_id']
            }
        )

# Load the JSON file
file_path = 'RAG_Datos.json'  # Make sure this path is correct
with open(file_path, 'r') as file:
    data = json.load(file)

# Process the messages and create the documents
docs_list = []
for conversation in data:
    for message in conversation['messages']:
        docs_list.append(JSONLoader(message).load())

# Inspect the content (optional)
for doc in docs_list:
    print(doc.page_content, doc.metadata)
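# Note: the same loading could also be done with the library's own
# langchain_community.document_loaders.JSONLoader (requires the `jq` package).
# Sketch only: the jq_schema below assumes the layout used above
# (a list of conversations, each with a 'messages' list) and may need adjusting.
#
# from langchain_community.document_loaders import JSONLoader as CommunityJSONLoader
# loader = CommunityJSONLoader(
#     file_path=file_path,
#     jq_schema='.[].messages[]',   # iterate over every message of every conversation
#     content_key='content',        # use the 'content' field as page_content
# )
# docs_from_loader = loader.load()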
"""
## Splitting
Long context retrieval,
Chunck_size -> tamaño de cada texto
"""
# docs_list can now be used with a text splitter
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    chunk_size=7500, chunk_overlap=100
)
doc_splits = text_splitter.split_documents(docs_list)

# Inspect the splits (optional)
for split in doc_splits:
    print(split.page_content, split.metadata)
import tiktoken

# Use the tokenizer that matches the target model (gpt-3.5-turbo -> cl100k_base)
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
for d in doc_splits:
    print("The document is %s tokens" % len(encoding.encode(d.page_content)))
"""## Index
Nomic embeddings [here](https://docs.nomic.ai/reference/endpoints/nomic-embed-text).
"""
import os
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_nomic.embeddings import NomicEmbeddings

# Add the splits to the vector DB
vectorstore = Chroma.from_documents(
    documents=doc_splits,
    collection_name="rag-chroma",
    embedding=NomicEmbeddings(model="nomic-embed-text-v1"),
)
retriever = vectorstore.as_retriever()
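# Quick sanity check on the retriever (optional). Depending on the installed
# LangChain version, `retriever.invoke(query)` may be preferred; the query
# string here is just an illustrative example.
sample_docs = retriever.get_relevant_documents("How to write a function to reverse a string?")
for d in sample_docs:
    print(d.metadata, d.page_content[:100])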
# RAG chain and evaluation setup
from sklearn.metrics import precision_score, recall_score, f1_score
from nltk.translate.bleu_score import corpus_bleu
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain

# Set the API key as an environment variable
os.environ['OPENAI_API_KEY'] = 'XXXX'
# Prompt
template = """Answer the question based only on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
# LLM API
model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")
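# Sketch of how the Chroma retriever, prompt and model could be wired into a
# full RAG chain with the runnables imported above. It is not used by the
# evaluation below, which swaps in a DummyRetriever and an LLMChain instead.
def format_docs(docs):
    # Join retrieved documents into a single context string
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)
# Example invocation (requires a valid OpenAI key):
# print(rag_chain.invoke("How to write a function to reverse a string?"))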
# Placeholder for `retriever`: always returns the same canned context, so the
# retrieval metrics below only exercise the evaluation code, not real retrieval.
class DummyRetriever:
    def __call__(self, *args, **kwargs):
        return {"context": "This is a test context"}

retriever = DummyRetriever()
# Build an LLM chain (prompt -> model)
llm_chain = LLMChain(
    prompt=prompt,
    llm=model,
)
# Test data: context/question pairs with reference answers
test_data = [
{"context": "Write a Python function to sum all prime numbers up to 1000.", "question": "How to write a function to sum all prime numbers up to 1000?", "expected_answer": "def sum_primes(limit):\n def is_prime(n):\n if n <= 1:\n return False\n for i in range(2, int(n**0.5) + 1):\n if n % i == 0:\n return False\n return True\n return sum(x for x in range(limit) if is_prime(x))\n\nprint(sum_primes(1000))"},
{"context": "Write a Python function to calculate the factorial of a number.", "question": "How to write a function to calculate the factorial of a number?", "expected_answer": "def factorial(n):\n if n == 0:\n return 1\n else:\n return n * factorial(n-1)\n\nprint(factorial(5))"},
{"context": "Write a Python function to check if a number is palindrome.", "question": "How to write a function to check if a number is palindrome?", "expected_answer": "def is_palindrome(n):\n return str(n) == str(n)[::-1]\n\nprint(is_palindrome(121))"},
{"context": "Write a Python function to generate Fibonacci sequence up to n.", "question": "How to write a function to generate Fibonacci sequence up to n?", "expected_answer": "def fibonacci(n):\n fib_sequence = [0, 1]\n while len(fib_sequence) < n:\n fib_sequence.append(fib_sequence[-1] + fib_sequence[-2])\n return fib_sequence\n\nprint(fibonacci(10))"},
{"context": "Write a Python function to find the greatest common divisor (GCD) of two numbers.", "question": "How to write a function to find the greatest common divisor (GCD) of two numbers?", "expected_answer": "def gcd(a, b):\n while b:\n a, b = b, a % b\n return a\n\nprint(gcd(48, 18))"},
{"context": "Write a Python function to check if a string is an anagram of another string.", "question": "How to write a function to check if a string is an anagram of another string?", "expected_answer": "def is_anagram(str1, str2):\n return sorted(str1) == sorted(str2)\n\nprint(is_anagram('listen', 'silent'))"},
{"context": "Write a Python function to find the maximum element in a list.", "question": "How to write a function to find the maximum element in a list?", "expected_answer": "def find_max(lst):\n return max(lst)\n\nprint(find_max([3, 5, 7, 2, 8]))"},
{"context": "Write a Python function to reverse a string.", "question": "How to write a function to reverse a string?", "expected_answer": "def reverse_string(s):\n return s[::-1]\n\nprint(reverse_string('hello'))"},
{"context": "Write a Python function to merge two sorted lists.", "question": "How to write a function to merge two sorted lists?", "expected_answer": "def merge_sorted_lists(lst1, lst2):\n return sorted(lst1 + lst2)\n\nprint(merge_sorted_lists([1, 3, 5], [2, 4, 6]))"},
{"context": "Write a Python function to remove duplicates from a list.", "question": "How to write a function to remove duplicates from a list?", "expected_answer": "def remove_duplicates(lst):\n return list(set(lst))\n\nprint(remove_duplicates([1, 2, 2, 3, 4, 4, 5]))"},
]
# Evaluate retrieval precision, recall and F1-score
# (with the DummyRetriever above, these scores only exercise the pipeline, not real retrieval quality)
retrieved_contexts = [retriever()["context"] for _ in test_data]
expected_contexts = [item["context"] for item in test_data]
precision = precision_score(expected_contexts, retrieved_contexts, average='macro', zero_division=1)
recall = recall_score(expected_contexts, retrieved_contexts, average='macro', zero_division=1)
f1 = f1_score(expected_contexts, retrieved_contexts, average='macro')
print(f"Retrieval Precision: {precision}")
print(f"Retrieval Recall: {recall}")
print(f"Retrieval F1 Score: {f1}")
# Evaluate answer generation
generated_answers = []
for item in test_data:
    output = llm_chain.run({"context": item["context"], "question": item["question"]})
    generated_answers.append(output)
# BLEU Score
reference_answers = [[item["expected_answer"].split()] for item in test_data]
generated_answers_tokens = [answer.split() for answer in generated_answers]
bleu_score = corpus_bleu(reference_answers, generated_answers_tokens)
print(f"BLEU Score: {bleu_score}") |