Created using Colab
Browse files- notebooks/06-Evaluate_RAG.ipynb +20 -12
notebooks/06-Evaluate_RAG.ipynb
CHANGED
@@ -21,7 +21,7 @@
|
|
21 |
},
|
22 |
{
|
23 |
"cell_type": "code",
|
24 |
-
"execution_count":
|
25 |
"metadata": {
|
26 |
"id": "QPJzr-I9XQ7l"
|
27 |
},
|
@@ -32,7 +32,7 @@
|
|
32 |
},
|
33 |
{
|
34 |
"cell_type": "code",
|
35 |
-
"execution_count":
|
36 |
"metadata": {
|
37 |
"id": "riuXwpSPcvWC"
|
38 |
},
|
@@ -46,7 +46,7 @@
|
|
46 |
},
|
47 |
{
|
48 |
"cell_type": "code",
|
49 |
-
"execution_count":
|
50 |
"metadata": {
|
51 |
"id": "km-KQOrgr3VB"
|
52 |
},
|
@@ -70,7 +70,7 @@
|
|
70 |
},
|
71 |
{
|
72 |
"cell_type": "code",
|
73 |
-
"execution_count":
|
74 |
"metadata": {
|
75 |
"id": "SQP87lHczHKc"
|
76 |
},
|
@@ -504,7 +504,7 @@
|
|
504 |
},
|
505 |
{
|
506 |
"cell_type": "code",
|
507 |
-
"execution_count":
|
508 |
"metadata": {
|
509 |
"id": "mNDd5i921Hww"
|
510 |
},
|
@@ -632,7 +632,7 @@
|
|
632 |
},
|
633 |
{
|
634 |
"cell_type": "code",
|
635 |
-
"execution_count":
|
636 |
"metadata": {
|
637 |
"colab": {
|
638 |
"base_uri": "https://localhost:8080/"
|
@@ -662,27 +662,35 @@
|
|
662 |
"from llama_index.core.evaluation import RelevancyEvaluator, FaithfulnessEvaluator, BatchEvalRunner\n",
|
663 |
"from llama_index.llms.openai import OpenAI\n",
|
664 |
"\n",
|
|
|
665 |
"llm_gpt4 = OpenAI(temperature=0, model=\"gpt-4o\")\n",
|
666 |
"\n",
|
|
|
667 |
"faithfulness_evaluator = FaithfulnessEvaluator(llm=llm_gpt4)\n",
|
668 |
"relevancy_evaluator = RelevancyEvaluator(llm=llm_gpt4)\n",
|
669 |
"\n",
|
670 |
-
"#
|
671 |
"queries = list(rag_eval_dataset.queries.values())\n",
|
|
|
672 |
"batch_eval_queries = queries[:20]\n",
|
673 |
"\n",
|
|
|
674 |
"runner = BatchEvalRunner(\n",
|
675 |
"{\"faithfulness\": faithfulness_evaluator, \"relevancy\": relevancy_evaluator},\n",
|
676 |
"workers=32,\n",
|
677 |
")\n",
|
678 |
"\n",
|
|
|
679 |
"for i in [2, 4, 6, 8, 10]:\n",
|
680 |
-
" # Set
|
681 |
" query_engine = index.as_query_engine(similarity_top_k=i)\n",
|
682 |
"\n",
|
|
|
683 |
" eval_results = await runner.aevaluate_queries(\n",
|
684 |
" query_engine, queries=batch_eval_queries\n",
|
685 |
" )\n",
|
|
|
|
|
686 |
" faithfulness_score = sum(result.passing for result in eval_results['faithfulness']) / len(eval_results['faithfulness'])\n",
|
687 |
" print(f\"top_{i} faithfulness_score: {faithfulness_score}\")\n",
|
688 |
"\n",
|
@@ -701,7 +709,7 @@
|
|
701 |
},
|
702 |
{
|
703 |
"cell_type": "code",
|
704 |
-
"execution_count":
|
705 |
"metadata": {
|
706 |
"id": "aUulxzuh1Hwx"
|
707 |
},
|
@@ -743,7 +751,7 @@
|
|
743 |
"metadata": {
|
744 |
"id": "CYIjkAP74bly"
|
745 |
},
|
746 |
-
"execution_count":
|
747 |
"outputs": []
|
748 |
},
|
749 |
{
|
@@ -758,7 +766,7 @@
|
|
758 |
"id": "-3b-bgvA4dAz",
|
759 |
"outputId": "7ced2102-6372-4794-82ad-1c7e60438088"
|
760 |
},
|
761 |
-
"execution_count":
|
762 |
"outputs": [
|
763 |
{
|
764 |
"output_type": "execute_result",
|
@@ -785,7 +793,7 @@
|
|
785 |
"id": "KNEhRQAo4dT0",
|
786 |
"outputId": "4a5d7db9-b399-49ea-c90e-b1e076640a92"
|
787 |
},
|
788 |
-
"execution_count":
|
789 |
"outputs": [
|
790 |
{
|
791 |
"output_type": "execute_result",
|
|
|
21 |
},
|
22 |
{
|
23 |
"cell_type": "code",
|
24 |
+
"execution_count": null,
|
25 |
"metadata": {
|
26 |
"id": "QPJzr-I9XQ7l"
|
27 |
},
|
|
|
32 |
},
|
33 |
{
|
34 |
"cell_type": "code",
|
35 |
+
"execution_count": null,
|
36 |
"metadata": {
|
37 |
"id": "riuXwpSPcvWC"
|
38 |
},
|
|
|
46 |
},
|
47 |
{
|
48 |
"cell_type": "code",
|
49 |
+
"execution_count": null,
|
50 |
"metadata": {
|
51 |
"id": "km-KQOrgr3VB"
|
52 |
},
|
|
|
70 |
},
|
71 |
{
|
72 |
"cell_type": "code",
|
73 |
+
"execution_count": null,
|
74 |
"metadata": {
|
75 |
"id": "SQP87lHczHKc"
|
76 |
},
|
|
|
504 |
},
|
505 |
{
|
506 |
"cell_type": "code",
|
507 |
+
"execution_count": null,
|
508 |
"metadata": {
|
509 |
"id": "mNDd5i921Hww"
|
510 |
},
|
|
|
632 |
},
|
633 |
{
|
634 |
"cell_type": "code",
|
635 |
+
"execution_count": null,
|
636 |
"metadata": {
|
637 |
"colab": {
|
638 |
"base_uri": "https://localhost:8080/"
|
|
|
662 |
"from llama_index.core.evaluation import RelevancyEvaluator, FaithfulnessEvaluator, BatchEvalRunner\n",
|
663 |
"from llama_index.llms.openai import OpenAI\n",
|
664 |
"\n",
|
665 |
+
"# Define an LLM as a judge\n",
|
666 |
"llm_gpt4 = OpenAI(temperature=0, model=\"gpt-4o\")\n",
|
667 |
"\n",
|
668 |
+
"# Instantiate the faithfulness and relevancy evaluator objects\n",
|
669 |
"faithfulness_evaluator = FaithfulnessEvaluator(llm=llm_gpt4)\n",
|
670 |
"relevancy_evaluator = RelevancyEvaluator(llm=llm_gpt4)\n",
|
671 |
"\n",
|
672 |
+
"# Extract the questions from the dataset\n",
|
673 |
"queries = list(rag_eval_dataset.queries.values())\n",
|
674 |
+
"# Limit to the first 20 questions to save cost (!!remove this line in production!!)\n",
|
675 |
"batch_eval_queries = queries[:20]\n",
|
676 |
"\n",
|
677 |
+
"# The batch evaluator runs the evaluation in batches\n",
|
678 |
"runner = BatchEvalRunner(\n",
|
679 |
"{\"faithfulness\": faithfulness_evaluator, \"relevancy\": relevancy_evaluator},\n",
|
680 |
"workers=32,\n",
|
681 |
")\n",
|
682 |
"\n",
|
683 |
+
"# Define a for-loop to try different `similarity_top_k` values\n",
|
684 |
"for i in [2, 4, 6, 8, 10]:\n",
|
685 |
+
" # Set query engine with different number of returned chunks\n",
|
686 |
" query_engine = index.as_query_engine(similarity_top_k=i)\n",
|
687 |
"\n",
|
688 |
+
" # Run the evaluation\n",
|
689 |
" eval_results = await runner.aevaluate_queries(\n",
|
690 |
" query_engine, queries=batch_eval_queries\n",
|
691 |
" )\n",
|
692 |
+
"\n",
|
693 |
+
" # Printing the results\n",
|
694 |
" faithfulness_score = sum(result.passing for result in eval_results['faithfulness']) / len(eval_results['faithfulness'])\n",
|
695 |
" print(f\"top_{i} faithfulness_score: {faithfulness_score}\")\n",
|
696 |
"\n",
|
|
|
709 |
},
|
710 |
{
|
711 |
"cell_type": "code",
|
712 |
+
"execution_count": null,
|
713 |
"metadata": {
|
714 |
"id": "aUulxzuh1Hwx"
|
715 |
},
|
|
|
751 |
"metadata": {
|
752 |
"id": "CYIjkAP74bly"
|
753 |
},
|
754 |
+
"execution_count": null,
|
755 |
"outputs": []
|
756 |
},
|
757 |
{
|
|
|
766 |
"id": "-3b-bgvA4dAz",
|
767 |
"outputId": "7ced2102-6372-4794-82ad-1c7e60438088"
|
768 |
},
|
769 |
+
"execution_count": null,
|
770 |
"outputs": [
|
771 |
{
|
772 |
"output_type": "execute_result",
|
|
|
793 |
"id": "KNEhRQAo4dT0",
|
794 |
"outputId": "4a5d7db9-b399-49ea-c90e-b1e076640a92"
|
795 |
},
|
796 |
+
"execution_count": null,
|
797 |
"outputs": [
|
798 |
{
|
799 |
"output_type": "execute_result",
|