"""
!pip install nomic
!pip install --upgrade langchain

! nomic login

! nomic login nk-bqukmTuFJHW8tgXzXXBw1qDL062-pth-ACecKP7CkXs

! pip install -U langchain-nomic langchain_community tiktoken langchain-openai chromadb langchain
"""


import json
from langchain.docstore.document import Document

# Custom loader: wraps each JSON message as a LangChain Document
# (defined locally, so it is not the langchain_community JSONLoader)
class JSONLoader:
    def __init__(self, message):
        self.message = message

    def load(self):
        # Build a Document carrying the message text plus its metadata
        return Document(
            page_content=self.message['content'],
            metadata={
                'role': self.message['role'],
                'conversation_id': self.message['conversation_id'],
                'message_id': self.message['message_id']
            }
        )

# Load the JSON file
file_path = 'RAG_Datos.json'  # Make sure this path is correct

with open(file_path, 'r') as file:
    data = json.load(file)

# Process the messages and build the documents
docs_list = []
for conversation in data:
    for message in conversation['messages']:
        docs_list.append(JSONLoader(message).load())

# Inspect the content (optional)
for doc in docs_list:
    print(doc.page_content, doc.metadata)
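
# For reference, the loader above implies RAG_Datos.json is shaped roughly
# like this hypothetical sample (field names inferred from the keys the
# loader reads; the real file may carry extra fields):
example_data = [
    {
        "messages": [
            {"conversation_id": "c1", "message_id": "m1",
             "role": "user", "content": "How do I reverse a string?"},
            {"conversation_id": "c1", "message_id": "m2",
             "role": "assistant", "content": "Use slicing: s[::-1]"},
        ]
    }
]
example_docs = [JSONLoader(m).load() for c in example_data for m in c["messages"]]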


"""
## Splitting

Long context retrieval,
Chunck_size -> tamaño de cada texto
"""

# docs_list can now be passed to the text splitter
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    chunk_size=7500, chunk_overlap=100
)
doc_splits = text_splitter.split_documents(docs_list)

# Inspect the splits (optional)
for split in doc_splits:
    print(split.page_content, split.metadata)


import tiktoken

# gpt-3.5-turbo uses the cl100k_base encoding
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
for d in doc_splits:
    print("The document is %s tokens" % len(encoding.encode(d.page_content)))

"""## Index

Nomic embeddings [here](https://docs.nomic.ai/reference/endpoints/nomic-embed-text).
"""

import os

from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_nomic import NomicEmbeddings

# Add to vectorDB
vectorstore = Chroma.from_documents(
    documents=doc_splits,
    collection_name="rag-chroma",
    embedding=NomicEmbeddings(model="nomic-embed-text-v1"),
)
retriever = vectorstore.as_retriever()
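
# Optional sanity check: pull the most similar chunks for a sample query
# (hypothetical question; get_relevant_documents is the standard retriever
# call in this LangChain version)
sample_hits = retriever.get_relevant_documents("How do I reverse a string in Python?")
for hit in sample_hits:
    print(hit.metadata, hit.page_content[:100])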

# RAG chain and evaluation


import os
from sklearn.metrics import precision_score, recall_score, f1_score
from nltk.translate.bleu_score import corpus_bleu
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain

# Set the API key as an environment variable
os.environ['OPENAI_API_KEY'] = 'XXXX'

# Prompt
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# LLM API
model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

# Placeholder for `retriever` (note: this overrides the Chroma retriever
# built above; swap it back in to evaluate real retrieval)
class DummyRetriever:
    def __call__(self, *args, **kwargs):
        return {"context": "This is a test context"}

retriever = DummyRetriever()

# Build an LLM chain
llm_chain = LLMChain(
    prompt=prompt,
    llm=model,
)
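
# The StrOutputParser / RunnableLambda / RunnablePassthrough imports above
# are the pieces of a full LCEL RAG chain; a minimal sketch wiring the
# Chroma retriever built earlier (not the DummyRetriever):
format_docs = RunnableLambda(lambda docs: "\n\n".join(d.page_content for d in docs))
rag_chain = (
    {"context": vectorstore.as_retriever() | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)
# e.g. rag_chain.invoke("How do I reverse a string in Python?")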

# Test data
test_data = [
    {"context": "Write a Python function to sum all prime numbers up to 1000.", "question": "How to write a function to sum all prime numbers up to 1000?", "expected_answer": "def sum_primes(limit):\n    def is_prime(n):\n        if n <= 1:\n            return False\n        for i in range(2, int(n**0.5) + 1):\n            if n % i == 0:\n                return False\n        return True\n    return sum(x for x in range(limit) if is_prime(x))\n\nprint(sum_primes(1000))"},
    {"context": "Write a Python function to calculate the factorial of a number.", "question": "How to write a function to calculate the factorial of a number?", "expected_answer": "def factorial(n):\n    if n == 0:\n        return 1\n    else:\n        return n * factorial(n-1)\n\nprint(factorial(5))"},
    {"context": "Write a Python function to check if a number is palindrome.", "question": "How to write a function to check if a number is palindrome?", "expected_answer": "def is_palindrome(n):\n    return str(n) == str(n)[::-1]\n\nprint(is_palindrome(121))"},
    {"context": "Write a Python function to generate Fibonacci sequence up to n.", "question": "How to write a function to generate Fibonacci sequence up to n?", "expected_answer": "def fibonacci(n):\n    fib_sequence = [0, 1]\n    while len(fib_sequence) < n:\n        fib_sequence.append(fib_sequence[-1] + fib_sequence[-2])\n    return fib_sequence\n\nprint(fibonacci(10))"},
    {"context": "Write a Python function to find the greatest common divisor (GCD) of two numbers.", "question": "How to write a function to find the greatest common divisor (GCD) of two numbers?", "expected_answer": "def gcd(a, b):\n    while b:\n        a, b = b, a % b\n    return a\n\nprint(gcd(48, 18))"},
    {"context": "Write a Python function to check if a string is an anagram of another string.", "question": "How to write a function to check if a string is an anagram of another string?", "expected_answer": "def is_anagram(str1, str2):\n    return sorted(str1) == sorted(str2)\n\nprint(is_anagram('listen', 'silent'))"},
    {"context": "Write a Python function to find the maximum element in a list.", "question": "How to write a function to find the maximum element in a list?", "expected_answer": "def find_max(lst):\n    return max(lst)\n\nprint(find_max([3, 5, 7, 2, 8]))"},
    {"context": "Write a Python function to reverse a string.", "question": "How to write a function to reverse a string?", "expected_answer": "def reverse_string(s):\n    return s[::-1]\n\nprint(reverse_string('hello'))"},
    {"context": "Write a Python function to merge two sorted lists.", "question": "How to write a function to merge two sorted lists?", "expected_answer": "def merge_sorted_lists(lst1, lst2):\n    return sorted(lst1 + lst2)\n\nprint(merge_sorted_lists([1, 3, 5], [2, 4, 6]))"},
    {"context": "Write a Python function to remove duplicates from a list.", "question": "How to write a function to remove duplicates from a list?", "expected_answer": "def remove_duplicates(lst):\n    return list(set(lst))\n\nprint(remove_duplicates([1, 2, 2, 3, 4, 4, 5]))"},
]

# Evaluate retrieval precision, recall, and F1-score (sklearn treats each
# context string as a class label, i.e. this is an exact-match comparison)
retrieved_contexts = [retriever()["context"] for _ in test_data]
expected_contexts = [item["context"] for item in test_data]
precision = precision_score(expected_contexts, retrieved_contexts, average='macro', zero_division=1)
recall = recall_score(expected_contexts, retrieved_contexts, average='macro', zero_division=1)
f1 = f1_score(expected_contexts, retrieved_contexts, average='macro', zero_division=1)

print(f"Retrieval Precision: {precision}")
print(f"Retrieval Recall: {recall}")
print(f"Retrieval F1 Score: {f1}")

# Evaluate answer generation
generated_answers = []
for item in test_data:
    output = llm_chain.run({"context": item["context"], "question": item["question"]})
    generated_answers.append(output)

# BLEU Score
reference_answers = [[item["expected_answer"].split()] for item in test_data]
generated_answers_tokens = [answer.split() for answer in generated_answers]
bleu_score = corpus_bleu(reference_answers, generated_answers_tokens)

print(f"BLEU Score: {bleu_score}")