AlaFalaki commited on
Commit
e68e63d
·
1 Parent(s): f798896

Created using Colab

Browse files
Files changed (1) hide show
  1. notebooks/06-Evaluate_RAG.ipynb +20 -12
notebooks/06-Evaluate_RAG.ipynb CHANGED
@@ -21,7 +21,7 @@
21
  },
22
  {
23
  "cell_type": "code",
24
- "execution_count": 17,
25
  "metadata": {
26
  "id": "QPJzr-I9XQ7l"
27
  },
@@ -32,7 +32,7 @@
32
  },
33
  {
34
  "cell_type": "code",
35
- "execution_count": 18,
36
  "metadata": {
37
  "id": "riuXwpSPcvWC"
38
  },
@@ -46,7 +46,7 @@
46
  },
47
  {
48
  "cell_type": "code",
49
- "execution_count": 19,
50
  "metadata": {
51
  "id": "km-KQOrgr3VB"
52
  },
@@ -70,7 +70,7 @@
70
  },
71
  {
72
  "cell_type": "code",
73
- "execution_count": 1,
74
  "metadata": {
75
  "id": "SQP87lHczHKc"
76
  },
@@ -504,7 +504,7 @@
504
  },
505
  {
506
  "cell_type": "code",
507
- "execution_count": 21,
508
  "metadata": {
509
  "id": "mNDd5i921Hww"
510
  },
@@ -632,7 +632,7 @@
632
  },
633
  {
634
  "cell_type": "code",
635
- "execution_count": 22,
636
  "metadata": {
637
  "colab": {
638
  "base_uri": "https://localhost:8080/"
@@ -662,27 +662,35 @@
662
  "from llama_index.core.evaluation import RelevancyEvaluator, FaithfulnessEvaluator, BatchEvalRunner\n",
663
  "from llama_index.llms.openai import OpenAI\n",
664
  "\n",
 
665
  "llm_gpt4 = OpenAI(temperature=0, model=\"gpt-4o\")\n",
666
  "\n",
 
667
  "faithfulness_evaluator = FaithfulnessEvaluator(llm=llm_gpt4)\n",
668
  "relevancy_evaluator = RelevancyEvaluator(llm=llm_gpt4)\n",
669
  "\n",
670
- "# Run evaluation\n",
671
  "queries = list(rag_eval_dataset.queries.values())\n",
 
672
  "batch_eval_queries = queries[:20]\n",
673
  "\n",
 
674
  "runner = BatchEvalRunner(\n",
675
  "{\"faithfulness\": faithfulness_evaluator, \"relevancy\": relevancy_evaluator},\n",
676
  "workers=32,\n",
677
  ")\n",
678
  "\n",
 
679
  "for i in [2, 4, 6, 8, 10]:\n",
680
- " # Set Faithfulness and Relevancy evaluators\n",
681
  " query_engine = index.as_query_engine(similarity_top_k=i)\n",
682
  "\n",
 
683
  " eval_results = await runner.aevaluate_queries(\n",
684
  " query_engine, queries=batch_eval_queries\n",
685
  " )\n",
 
 
686
  " faithfulness_score = sum(result.passing for result in eval_results['faithfulness']) / len(eval_results['faithfulness'])\n",
687
  " print(f\"top_{i} faithfulness_score: {faithfulness_score}\")\n",
688
  "\n",
@@ -701,7 +709,7 @@
701
  },
702
  {
703
  "cell_type": "code",
704
- "execution_count": 32,
705
  "metadata": {
706
  "id": "aUulxzuh1Hwx"
707
  },
@@ -743,7 +751,7 @@
743
  "metadata": {
744
  "id": "CYIjkAP74bly"
745
  },
746
- "execution_count": 33,
747
  "outputs": []
748
  },
749
  {
@@ -758,7 +766,7 @@
758
  "id": "-3b-bgvA4dAz",
759
  "outputId": "7ced2102-6372-4794-82ad-1c7e60438088"
760
  },
761
- "execution_count": 34,
762
  "outputs": [
763
  {
764
  "output_type": "execute_result",
@@ -785,7 +793,7 @@
785
  "id": "KNEhRQAo4dT0",
786
  "outputId": "4a5d7db9-b399-49ea-c90e-b1e076640a92"
787
  },
788
- "execution_count": 35,
789
  "outputs": [
790
  {
791
  "output_type": "execute_result",
 
21
  },
22
  {
23
  "cell_type": "code",
24
+ "execution_count": null,
25
  "metadata": {
26
  "id": "QPJzr-I9XQ7l"
27
  },
 
32
  },
33
  {
34
  "cell_type": "code",
35
+ "execution_count": null,
36
  "metadata": {
37
  "id": "riuXwpSPcvWC"
38
  },
 
46
  },
47
  {
48
  "cell_type": "code",
49
+ "execution_count": null,
50
  "metadata": {
51
  "id": "km-KQOrgr3VB"
52
  },
 
70
  },
71
  {
72
  "cell_type": "code",
73
+ "execution_count": null,
74
  "metadata": {
75
  "id": "SQP87lHczHKc"
76
  },
 
504
  },
505
  {
506
  "cell_type": "code",
507
+ "execution_count": null,
508
  "metadata": {
509
  "id": "mNDd5i921Hww"
510
  },
 
632
  },
633
  {
634
  "cell_type": "code",
635
+ "execution_count": null,
636
  "metadata": {
637
  "colab": {
638
  "base_uri": "https://localhost:8080/"
 
662
  "from llama_index.core.evaluation import RelevancyEvaluator, FaithfulnessEvaluator, BatchEvalRunner\n",
663
  "from llama_index.llms.openai import OpenAI\n",
664
  "\n",
665
+ "# Define an LLM as a judge\n",
666
  "llm_gpt4 = OpenAI(temperature=0, model=\"gpt-4o\")\n",
667
  "\n",
668
+ "# Initiate the faithfulness and relevancy evaluator objects\n",
669
  "faithfulness_evaluator = FaithfulnessEvaluator(llm=llm_gpt4)\n",
670
  "relevancy_evaluator = RelevancyEvaluator(llm=llm_gpt4)\n",
671
  "\n",
672
+ "# Extract the questions from the dataset\n",
673
  "queries = list(rag_eval_dataset.queries.values())\n",
674
+ "# Limit to first 20 questions to save cost (!!remove this line in production!!)\n",
675
  "batch_eval_queries = queries[:20]\n",
676
  "\n",
677
+ "# The batch evaluator runs the evaluation in batches\n",
678
  "runner = BatchEvalRunner(\n",
679
  "{\"faithfulness\": faithfulness_evaluator, \"relevancy\": relevancy_evaluator},\n",
680
  "workers=32,\n",
681
  ")\n",
682
  "\n",
683
+ "# Define a for-loop to try different `similarity_top_k` values\n",
684
  "for i in [2, 4, 6, 8, 10]:\n",
685
+ " # Set query engine with different number of returned chunks\n",
686
  " query_engine = index.as_query_engine(similarity_top_k=i)\n",
687
  "\n",
688
+ " # Run the evaluation\n",
689
  " eval_results = await runner.aevaluate_queries(\n",
690
  " query_engine, queries=batch_eval_queries\n",
691
  " )\n",
692
+ "\n",
693
+ " # Printing the results\n",
694
  " faithfulness_score = sum(result.passing for result in eval_results['faithfulness']) / len(eval_results['faithfulness'])\n",
695
  " print(f\"top_{i} faithfulness_score: {faithfulness_score}\")\n",
696
  "\n",
 
709
  },
710
  {
711
  "cell_type": "code",
712
+ "execution_count": null,
713
  "metadata": {
714
  "id": "aUulxzuh1Hwx"
715
  },
 
751
  "metadata": {
752
  "id": "CYIjkAP74bly"
753
  },
754
+ "execution_count": null,
755
  "outputs": []
756
  },
757
  {
 
766
  "id": "-3b-bgvA4dAz",
767
  "outputId": "7ced2102-6372-4794-82ad-1c7e60438088"
768
  },
769
+ "execution_count": null,
770
  "outputs": [
771
  {
772
  "output_type": "execute_result",
 
793
  "id": "KNEhRQAo4dT0",
794
  "outputId": "4a5d7db9-b399-49ea-c90e-b1e076640a92"
795
  },
796
+ "execution_count": null,
797
  "outputs": [
798
  {
799
  "output_type": "execute_result",