File size: 31,544 Bytes
a018353 8f67d6f a018353 8f67d6f a018353 8f67d6f a018353 8f67d6f a018353 8f67d6f a018353 8f67d6f a018353 8f67d6f 6f89cef a018353 8f67d6f a018353 8f67d6f a018353 8f67d6f 6f89cef a018353 6f89cef a018353 8f67d6f a018353 8f67d6f a018353 6f89cef a018353 8f67d6f a018353 8f67d6f a018353 6f89cef 8f67d6f 6f89cef 8f67d6f 6f89cef a018353 8f67d6f a018353 8f67d6f 6f89cef 8f67d6f 6f89cef 8f67d6f 6f89cef 8f67d6f 6f89cef 8f67d6f 6f89cef 8f67d6f a018353 8f67d6f 6f89cef 8f67d6f 6f89cef 340946f b991cef 340946f a018353 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "c95ab233",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Adding package root to sys.path: /home/mafzaal/source/lets-talk/py-src\n",
"Current notebook directory: /home/mafzaal/source/lets-talk/py-src/notebooks\n",
"Project root: /home/mafzaal/source/lets-talk\n"
]
}
],
"source": [
"import sys\n",
"import os\n",
"\n",
"# Remember the notebook directory the first time this cell runs. The cell ends\n",
"# with os.chdir(), so calling os.getcwd() again on a re-run would resolve the\n",
"# paths below against the project root instead of the notebook folder.\n",
"if \"notebook_dir\" not in globals():\n",
"    notebook_dir = os.getcwd()\n",
"\n",
"# Add the package root (parent of the notebook folder) to the Python path\n",
"package_root = os.path.abspath(os.path.join(notebook_dir, \"../\"))\n",
"print(f\"Adding package root to sys.path: {package_root}\")\n",
"if package_root not in sys.path:\n",
"    sys.path.append(package_root)\n",
"\n",
"\n",
"print(f\"Current notebook directory: {notebook_dir}\")\n",
"# Change the working directory to the root of the project\n",
"project_root = os.path.abspath(os.path.join(notebook_dir, \"../../\"))\n",
"print(f\"Project root: {project_root}\")\n",
"os.chdir(project_root)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "15e97530",
"metadata": {},
"outputs": [],
"source": [
"import nest_asyncio\n",
"nest_asyncio.apply()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "b4f2ddc0",
"metadata": {},
"outputs": [],
"source": [
"import lets_talk.utils.blog as blog\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "123779af",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 14/14 [00:00<00:00, 3317.53it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded 14 documents from data/\n",
"Split 14 documents into 162 chunks\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"docs = blog.load_blog_posts()\n",
"docs = blog.update_document_metadata(docs)\n",
"split_docs = blog.split_documents(docs)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0b742838",
"metadata": {},
"outputs": [],
"source": [
"from langchain_core.prompts import ChatPromptTemplate\n",
"from langchain_openai import ChatOpenAI\n",
"\n",
"qa_chat_model = ChatOpenAI(\n",
" model=\"gpt-4.1\",\n",
" temperature=0,\n",
")\n",
"\n",
"qa_prompt = \"\"\"\\\n",
"Given the following context, you must generate questions based on only the provided context.\n",
"You are to generate {n_questions} questions which should be provided in the following format:\n",
"\n",
"1. QUESTION #1\n",
"2. QUESTION #2\n",
"...\n",
"\n",
"Context:\n",
"{context}\n",
"\"\"\"\n",
"\n",
"qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)\n",
"\n",
"question_generation_chain = qa_prompt_template | qa_chat_model"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "5488c3d3",
"metadata": {},
"outputs": [],
"source": [
"import asyncio\n",
"import re\n",
"\n",
"import tqdm\n",
"\n",
"\n",
"# A number, then \".\" or \")\", not followed by another digit, so a line that\n",
"# starts with a decimal such as \"1.5 million\" is not mistaken for item 1.\n",
"_NUMBERED_LINE = re.compile(r\"(\\d+)[.)](?!\\d)\\s*(.*)\")\n",
"\n",
"\n",
"def extract_questions(response_text, n_questions):\n",
"    \"\"\"Pull the numbered questions out of a model response.\n",
"\n",
"    Args:\n",
"        response_text: Raw model output; questions are expected one per line,\n",
"            prefixed with their number ('1. QUESTION', '2. QUESTION', ...).\n",
"        n_questions: Highest question number to accept. Lines numbered\n",
"            outside 1..n_questions are ignored.\n",
"\n",
"    Returns:\n",
"        List of question strings, in order of appearance, with the numeric\n",
"        prefix and surrounding whitespace removed.\n",
"    \"\"\"\n",
"    extracted_questions = []\n",
"    for line in response_text.strip().split('\\n'):\n",
"        match = _NUMBERED_LINE.match(line.strip())\n",
"        if match is None:\n",
"            continue\n",
"\n",
"        number = int(match.group(1))\n",
"        question = match.group(2).strip()\n",
"        # Skip out-of-range numbers and bare prefixes that carry no question text\n",
"        if 1 <= number <= n_questions and question:\n",
"            extracted_questions.append(question)\n",
"\n",
"    return extracted_questions\n",
"\n",
"\n",
"def create_questions(documents, n_questions, chain):\n",
"    \"\"\"Generate question/context pairs for every document.\n",
"\n",
"    Args:\n",
"        documents: Iterable of objects exposing a `page_content` attribute.\n",
"        n_questions: Number of questions requested from the chain per document.\n",
"        chain: Object whose `invoke` accepts a dict with the keys 'context' and\n",
"            'n_questions' and returns an object with a `content` attribute.\n",
"\n",
"    Returns:\n",
"        List of dicts with the keys 'question' and 'context'; every question is\n",
"        paired with the full text of the document it was generated from.\n",
"    \"\"\"\n",
"    question_set = []\n",
"\n",
"    for doc in tqdm.tqdm(documents):\n",
"        context = doc.page_content\n",
"\n",
"        # Generate questions using the question generation chain\n",
"        response = chain.invoke({\n",
"            \"context\": context,\n",
"            \"n_questions\": n_questions\n",
"        })\n",
"\n",
"        question_set.extend(\n",
"            {\"question\": question, \"context\": context}\n",
"            for question in extract_questions(response.content, n_questions)\n",
"        )\n",
"\n",
"    return question_set"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "b1ece53b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 162/162 [07:23<00:00, 2.74s/it]\n"
]
}
],
"source": [
"ds = create_questions(documents=split_docs, n_questions=2, chain=question_generation_chain)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "965cf609",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.microsoft.datawrangler.viewer.v0+json": {
"columns": [
{
"name": "index",
"rawType": "int64",
"type": "integer"
},
{
"name": "question",
"rawType": "object",
"type": "string"
},
{
"name": "context",
"rawType": "object",
"type": "string"
}
],
"conversionMethod": "pd.DataFrame",
"ref": "f0615b27-42e5-4774-a436-51ec88bb4498",
"rows": [
[
"0",
"What role does Ragas play in evaluating the performance of applications that use Large Language Models (LLMs)?",
"---\ntitle: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\"\ndate: 2025-04-26T18:00:00-06:00\nlayout: blog\ndescription: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\"\ncategories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"]\ncoverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\"\nreadingTime: 7\npublished: true\n---\n\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you're building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\n\n## What is Ragas?"
],
[
"1",
"Why is it important to have reliable metrics when building systems like question-answering tools or conversational agents with LLMs?",
"---\ntitle: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\"\ndate: 2025-04-26T18:00:00-06:00\nlayout: blog\ndescription: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\"\ncategories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"]\ncoverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\"\nreadingTime: 7\npublished: true\n---\n\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you're building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\n\n## What is Ragas?"
],
[
"2",
"What are some of the key questions that Ragas helps answer when evaluating LLM applications?",
"## What is Ragas?\n\n[Ragas](https://docs.ragas.io/en/stable/) is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\n\nAt its core, Ragas helps answer crucial questions:\n- Is my application retrieving the right information?\n- Are the responses factually accurate and consistent with the retrieved context?\n- Does the system appropriately address the user's query?\n- How well does my application handle multi-turn conversations?\n\n## Why Evaluate LLM Applications?\n\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable."
],
[
"3",
"Why is proper evaluation especially important for LLM applications in fields like healthcare, finance, or education?",
"## What is Ragas?\n\n[Ragas](https://docs.ragas.io/en/stable/) is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\n\nAt its core, Ragas helps answer crucial questions:\n- Is my application retrieving the right information?\n- Are the responses factually accurate and consistent with the retrieved context?\n- Does the system appropriately address the user's query?\n- How well does my application handle multi-turn conversations?\n\n## Why Evaluate LLM Applications?\n\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable."
],
[
"4",
"What are the main purposes of evaluation as described in the context?",
"Evaluation serves several key purposes:\n- **Quality assurance**: Identify and fix issues before they reach users\n- **Performance tracking**: Monitor how changes impact system performance\n- **Benchmarking**: Compare different approaches objectively\n- **Continuous improvement**: Build feedback loops to enhance your application\n\n## Key Features of Ragas\n\n### 🎯 Specialized Metrics\nRagas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications:\n\n- **Faithfulness**: Measures if the response is factually consistent with the retrieved context\n- **Context Relevancy**: Evaluates if the retrieved information is relevant to the query\n- **Answer Relevancy**: Assesses if the response addresses the user's question\n- **Topic Adherence**: Gauges how well multi-turn conversations stay on topic"
],
[
"5",
"Which specialized metrics does Ragas provide for evaluating LLM applications, and what does each metric measure?",
"Evaluation serves several key purposes:\n- **Quality assurance**: Identify and fix issues before they reach users\n- **Performance tracking**: Monitor how changes impact system performance\n- **Benchmarking**: Compare different approaches objectively\n- **Continuous improvement**: Build feedback loops to enhance your application\n\n## Key Features of Ragas\n\n### 🎯 Specialized Metrics\nRagas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications:\n\n- **Faithfulness**: Measures if the response is factually consistent with the retrieved context\n- **Context Relevancy**: Evaluates if the retrieved information is relevant to the query\n- **Answer Relevancy**: Assesses if the response addresses the user's question\n- **Topic Adherence**: Gauges how well multi-turn conversations stay on topic"
],
[
"6",
"How does Ragas assist in the process of test data generation for evaluation?",
"### 🧪 Test Data Generation\nCreating high-quality test data is often a bottleneck in evaluation. Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage.\n\n### 🔗 Seamless Integrations\nRagas works with popular LLM frameworks and tools:\n- [LangChain](https://www.langchain.com/)\n- [LlamaIndex](https://www.llamaindex.ai/)\n- [Haystack](https://haystack.deepset.ai/)\n- [OpenAI](https://openai.com/)\n\nObservability platforms \n- [Phoenix](https://phoenix.arize.com/)\n- [LangSmith](https://python.langchain.com/docs/introduction/)\n- [Langfuse](https://www.langfuse.com/)\n\n### 📊 Comprehensive Analysis\nBeyond simple scores, Ragas provides detailed insights into your application's strengths and weaknesses, enabling targeted improvements.\n\n## Getting Started with Ragas\n\nInstalling Ragas is straightforward:\n\n```bash\nuv init && uv add ragas\n```\n\nHere's a simple example of evaluating a response using Ragas:"
],
[
"7",
"Which popular LLM frameworks and observability platforms does Ragas integrate with?",
"### 🧪 Test Data Generation\nCreating high-quality test data is often a bottleneck in evaluation. Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage.\n\n### 🔗 Seamless Integrations\nRagas works with popular LLM frameworks and tools:\n- [LangChain](https://www.langchain.com/)\n- [LlamaIndex](https://www.llamaindex.ai/)\n- [Haystack](https://haystack.deepset.ai/)\n- [OpenAI](https://openai.com/)\n\nObservability platforms \n- [Phoenix](https://phoenix.arize.com/)\n- [LangSmith](https://python.langchain.com/docs/introduction/)\n- [Langfuse](https://www.langfuse.com/)\n\n### 📊 Comprehensive Analysis\nBeyond simple scores, Ragas provides detailed insights into your application's strengths and weaknesses, enabling targeted improvements.\n\n## Getting Started with Ragas\n\nInstalling Ragas is straightforward:\n\n```bash\nuv init && uv add ragas\n```\n\nHere's a simple example of evaluating a response using Ragas:"
],
[
"8",
"What command is used to install Ragas according to the provided context?",
"## Getting Started with Ragas\n\nInstalling Ragas is straightforward:\n\n```bash\nuv init && uv add ragas\n```\n\nHere's a simple example of evaluating a response using Ragas:\n\n```python\nfrom ragas.metrics import Faithfulness\nfrom ragas.evaluation import EvaluationDataset\nfrom ragas.dataset_schema import SingleTurnSample\nfrom langchain_openai import ChatOpenAI\nfrom ragas.llms import LangchainLLMWrapper\nfrom langchain_openai import ChatOpenAI\n\n# Initialize the LLM, you are going to new OPENAI API key\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) \n\n# Your evaluation data\ntest_data = {\n \"user_input\": \"What is the capital of France?\",\n \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"],\n \"response\": \"The capital of France is Paris.\"\n}\n\n# Create a sample\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor"
],
[
"9",
"In the example, which class is used to wrap the ChatOpenAI model for evaluation purposes?",
"## Getting Started with Ragas\n\nInstalling Ragas is straightforward:\n\n```bash\nuv init && uv add ragas\n```\n\nHere's a simple example of evaluating a response using Ragas:\n\n```python\nfrom ragas.metrics import Faithfulness\nfrom ragas.evaluation import EvaluationDataset\nfrom ragas.dataset_schema import SingleTurnSample\nfrom langchain_openai import ChatOpenAI\nfrom ragas.llms import LangchainLLMWrapper\nfrom langchain_openai import ChatOpenAI\n\n# Initialize the LLM, you are going to new OPENAI API key\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) \n\n# Your evaluation data\ntest_data = {\n \"user_input\": \"What is the capital of France?\",\n \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"],\n \"response\": \"The capital of France is Paris.\"\n}\n\n# Create a sample\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor"
]
],
"shape": {
"columns": 2,
"rows": 10
}
},
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>question</th>\n",
" <th>context</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>What role does Ragas play in evaluating the pe...</td>\n",
" <td>---\\ntitle: \"Part 1: Introduction to Ragas: Th...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Why is it important to have reliable metrics w...</td>\n",
" <td>---\\ntitle: \"Part 1: Introduction to Ragas: Th...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What are some of the key questions that Ragas ...</td>\n",
" <td>## What is Ragas?\\n\\n[Ragas](https://docs.raga...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Why is proper evaluation especially important ...</td>\n",
" <td>## What is Ragas?\\n\\n[Ragas](https://docs.raga...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>What are the main purposes of evaluation as de...</td>\n",
" <td>Evaluation serves several key purposes:\\n- **Q...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Which specialized metrics does Ragas provide f...</td>\n",
" <td>Evaluation serves several key purposes:\\n- **Q...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>How does Ragas assist in the process of test d...</td>\n",
" <td>### 🧪 Test Data Generation\\nCreating high-qual...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Which popular LLM frameworks and observability...</td>\n",
" <td>### 🧪 Test Data Generation\\nCreating high-qual...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>What command is used to install Ragas accordin...</td>\n",
" <td>## Getting Started with Ragas\\n\\nInstalling Ra...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>In the example, which class is used to wrap th...</td>\n",
" <td>## Getting Started with Ragas\\n\\nInstalling Ra...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" question \\\n",
"0 What role does Ragas play in evaluating the pe... \n",
"1 Why is it important to have reliable metrics w... \n",
"2 What are some of the key questions that Ragas ... \n",
"3 Why is proper evaluation especially important ... \n",
"4 What are the main purposes of evaluation as de... \n",
"5 Which specialized metrics does Ragas provide f... \n",
"6 How does Ragas assist in the process of test d... \n",
"7 Which popular LLM frameworks and observability... \n",
"8 What command is used to install Ragas accordin... \n",
"9 In the example, which class is used to wrap th... \n",
"\n",
" context \n",
"0 ---\\ntitle: \"Part 1: Introduction to Ragas: Th... \n",
"1 ---\\ntitle: \"Part 1: Introduction to Ragas: Th... \n",
"2 ## What is Ragas?\\n\\n[Ragas](https://docs.raga... \n",
"3 ## What is Ragas?\\n\\n[Ragas](https://docs.raga... \n",
"4 Evaluation serves several key purposes:\\n- **Q... \n",
"5 Evaluation serves several key purposes:\\n- **Q... \n",
"6 ### 🧪 Test Data Generation\\nCreating high-qual... \n",
"7 ### 🧪 Test Data Generation\\nCreating high-qual... \n",
"8 ## Getting Started with Ragas\\n\\nInstalling Ra... \n",
"9 ## Getting Started with Ragas\\n\\nInstalling Ra... "
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"df = pd.DataFrame(ds)\n",
"df.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "b8c025fa",
"metadata": {},
"outputs": [],
"source": [
"# DataFrame.to_csv does not create missing parent directories\n",
"os.makedirs(\"evals\", exist_ok=True)\n",
"df.to_csv(\"evals/ft_questions.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eca8e3c8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset has 324 examples\n",
"Dataset features: {'question': Value(dtype='string', id=None), 'context': Value(dtype='string', id=None)}\n",
"\n",
"Sample examples:\n"
]
},
{
"data": {
"application/vnd.microsoft.datawrangler.viewer.v0+json": {
"columns": [
{
"name": "index",
"rawType": "int64",
"type": "integer"
},
{
"name": "question",
"rawType": "object",
"type": "string"
},
{
"name": "context",
"rawType": "object",
"type": "string"
}
],
"conversionMethod": "pd.DataFrame",
"ref": "c895ca53-ec90-48b0-a20e-13957de25913",
"rows": [
[
"0",
"What role does Ragas play in evaluating the performance of applications that use Large Language Models (LLMs)?",
"---\ntitle: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\"\ndate: 2025-04-26T18:00:00-06:00\nlayout: blog\ndescription: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\"\ncategories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"]\ncoverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\"\nreadingTime: 7\npublished: true\n---\n\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you're building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\n\n## What is Ragas?"
],
[
"1",
"Why is it important to have reliable metrics when building systems like question-answering tools or conversational agents with LLMs?",
"---\ntitle: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\"\ndate: 2025-04-26T18:00:00-06:00\nlayout: blog\ndescription: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\"\ncategories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"]\ncoverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\"\nreadingTime: 7\npublished: true\n---\n\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you're building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\n\n## What is Ragas?"
],
[
"2",
"What are some of the key questions that Ragas helps answer when evaluating LLM applications?",
"## What is Ragas?\n\n[Ragas](https://docs.ragas.io/en/stable/) is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\n\nAt its core, Ragas helps answer crucial questions:\n- Is my application retrieving the right information?\n- Are the responses factually accurate and consistent with the retrieved context?\n- Does the system appropriately address the user's query?\n- How well does my application handle multi-turn conversations?\n\n## Why Evaluate LLM Applications?\n\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable."
]
],
"shape": {
"columns": 2,
"rows": 3
}
},
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>question</th>\n",
" <th>context</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>What role does Ragas play in evaluating the pe...</td>\n",
" <td>---\\ntitle: \"Part 1: Introduction to Ragas: Th...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Why is it important to have reliable metrics w...</td>\n",
" <td>---\\ntitle: \"Part 1: Introduction to Ragas: Th...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What are some of the key questions that Ragas ...</td>\n",
" <td>## What is Ragas?\\n\\n[Ragas](https://docs.raga...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" question \\\n",
"0 What role does Ragas play in evaluating the pe... \n",
"1 Why is it important to have reliable metrics w... \n",
"2 What are some of the key questions that Ragas ... \n",
"\n",
" context \n",
"0 ---\\ntitle: \"Part 1: Introduction to Ragas: Th... \n",
"1 ---\\ntitle: \"Part 1: Introduction to Ragas: Th... \n",
"2 ## What is Ragas?\\n\\n[Ragas](https://docs.raga... "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "dddd54d430094f5a906d9483abf892e4",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Saving the dataset (0/1 shards): 0%| | 0/324 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from datasets import Dataset\n",
"\n",
"# Convert pandas DataFrame to Huggingface Dataset\n",
"hf_dataset = Dataset.from_pandas(df)\n",
"\n",
"# Display some basic information about the dataset\n",
"print(f\"Dataset has {len(hf_dataset)} examples\")\n",
"print(f\"Dataset features: {hf_dataset.features}\")\n",
"\n",
"# Show a few examples\n",
"print(\"\\nSample examples:\")\n",
"display(hf_dataset.select(range(3)).to_pandas())\n",
"\n",
"# Save the dataset to disk (optional)\n",
"#hf_dataset.save_to_disk(\"ragas_qa_dataset\")\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "f9e533d5",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "082ffb813f344f0ca18a9cb936c97dc1",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Uploading the dataset shards: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5e914dddebb74c8b9ac6e311ecbf0716",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Creating parquet from Arrow format: 0%| | 0/1 [00:00<?, ?ba/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"CommitInfo(commit_url='https://huggingface.co/datasets/mafzaal/thedataguy_embed_ft/commit/963348381fcb929a7367ff8933b62812a0e9ceb7', commit_message='Upload dataset', commit_description='', oid='963348381fcb929a7367ff8933b62812a0e9ceb7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mafzaal/thedataguy_embed_ft', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mafzaal/thedataguy_embed_ft'), pr_revision=None, pr_num=None)"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hf_dataset.push_to_hub(\n",
" repo_id=\"mafzaal/thedataguy_embed_ft\" )"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
|