{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "collapsed_sections": [ "IlpbWxQoTPuM", "gpIQKC5mTPq4", "4cRtp0oHxhZ9", "FzsRbuujpEAp" ] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "3e15c168161d425882e0a4beeaac668f": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_268b92af58f6402ba74f3ca7f0fe52c4", "IPY_MODEL_426870965d2f4e4f8bc35ab55c88a601", "IPY_MODEL_78614aec5ee24d9aaff3d265427b015b" ], "layout": "IPY_MODEL_5d2e72bd064048abba71d8cd5c92b97a" } }, "268b92af58f6402ba74f3ca7f0fe52c4": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b179e6bb1c974a2e95133d090eeee528", "placeholder": "​", "style": "IPY_MODEL_3c1ac86c64ba4d419e461416139e10ab", "value": "tokenizer_config.json: 100%" } }, "426870965d2f4e4f8bc35ab55c88a601": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": 
"1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_55dac8906d4d423081ecadac8366a7cc", "max": 443, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_3ad35b1044f2447f8fa2bbdc631f5c9e", "value": 443 } }, "78614aec5ee24d9aaff3d265427b015b": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_891f203e5e41499bb16b1be85e21e206", "placeholder": "​", "style": "IPY_MODEL_253314cb48da4f41819365934fdf13a0", "value": " 443/443 [00:00<00:00, 22.0kB/s]" } }, "5d2e72bd064048abba71d8cd5c92b97a": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, 
"right": null, "top": null, "visibility": null, "width": null } }, "b179e6bb1c974a2e95133d090eeee528": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "3c1ac86c64ba4d419e461416139e10ab": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "55dac8906d4d423081ecadac8366a7cc": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", 
"_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "3ad35b1044f2447f8fa2bbdc631f5c9e": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "891f203e5e41499bb16b1be85e21e206": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": 
null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "253314cb48da4f41819365934fdf13a0": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "4d29001743a64fbebfd60aa841ab1df9": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_6341cf1844f64790a7323779b305a97a", "IPY_MODEL_13b193574fe147baaab6768d1a902012", "IPY_MODEL_ecfc17da497a45929007f70089b515c1" ], "layout": "IPY_MODEL_d63a37cff91949fbb299c2a0585cfe29" } }, "6341cf1844f64790a7323779b305a97a": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_dba1a149fb884216a8531f023deda8f7", "placeholder": "​", "style": 
"IPY_MODEL_f6f694c2dbf848af9bf1eab5bacaf87b", "value": "sentencepiece.bpe.model: 100%" } }, "13b193574fe147baaab6768d1a902012": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_66c42342907e423499a87ebec8a89126", "max": 5069051, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_f89f483057dd4ed8a47a3a442dd4a352", "value": 5069051 } }, "ecfc17da497a45929007f70089b515c1": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_d7b2b9a9ac264e479e58aef5b4bd7ff1", "placeholder": "​", "style": "IPY_MODEL_0b5c1768a89a4435b66166694f542353", "value": " 5.07M/5.07M [00:00<00:00, 84.4MB/s]" } }, "d63a37cff91949fbb299c2a0585cfe29": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, 
"grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "dba1a149fb884216a8531f023deda8f7": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "f6f694c2dbf848af9bf1eab5bacaf87b": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", 
"_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "66c42342907e423499a87ebec8a89126": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "f89f483057dd4ed8a47a3a442dd4a352": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "d7b2b9a9ac264e479e58aef5b4bd7ff1": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", 
"_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "0b5c1768a89a4435b66166694f542353": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "71b38e326aca42c58378dbe57130ce95": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_59f099c6e335467482d5b09c52d18e98", "IPY_MODEL_f011642cf97d4b9a868a9fdd6d205b05", "IPY_MODEL_9846e30942cb4441938144dbc2e27dfa" ], "layout": "IPY_MODEL_6b49a5e8051f4aa8a4e0f4c74fee2694" } }, "59f099c6e335467482d5b09c52d18e98": { 
"model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_d2355c7621a84e3da840546f30450e65", "placeholder": "​", "style": "IPY_MODEL_e4aeae4887c2493b901bbf5f23a5d01a", "value": "tokenizer.json: 100%" } }, "f011642cf97d4b9a868a9fdd6d205b05": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_aa5b043eb38d4a5db20887e6404d9ee0", "max": 17098107, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_b3bc42f4c4284e82b13b5e5f4ab1bb25", "value": 17098107 } }, "9846e30942cb4441938144dbc2e27dfa": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b03e452a939544f6bc58f1e2a2eca207", "placeholder": "​", "style": "IPY_MODEL_4a76f60c35e54ce8888086cd0204d47f", "value": " 17.1M/17.1M [00:00<00:00, 163MB/s]" } }, "6b49a5e8051f4aa8a4e0f4c74fee2694": { "model_module": "@jupyter-widgets/base", 
"model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "d2355c7621a84e3da840546f30450e65": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, 
"object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "e4aeae4887c2493b901bbf5f23a5d01a": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "aa5b043eb38d4a5db20887e6404d9ee0": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b3bc42f4c4284e82b13b5e5f4ab1bb25": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", 
"_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "b03e452a939544f6bc58f1e2a2eca207": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "4a76f60c35e54ce8888086cd0204d47f": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "d23e04171b39497eb6489517e5aad244": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], 
"_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_d6cfa575047d42bebf17d20cd2b650a7", "IPY_MODEL_53c7c1594de3495daf81c7e9f036e54d", "IPY_MODEL_2b50bc6f4c884d7ea93c811f5a91b6ed" ], "layout": "IPY_MODEL_f58c8396009441189b461246aa615a54" } }, "d6cfa575047d42bebf17d20cd2b650a7": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_0b8814761c014f2f96e9a6f1b8f5b10a", "placeholder": "​", "style": "IPY_MODEL_be51a336a39a4ef3b8125d230d35205d", "value": "special_tokens_map.json: 100%" } }, "53c7c1594de3495daf81c7e9f036e54d": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_2feb20f2e8d440718cb5dc3426294a81", "max": 279, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_59bf397dd37f441db63314b6748fc1c1", "value": 279 } }, "2b50bc6f4c884d7ea93c811f5a91b6ed": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", 
"_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_4b29621ed2a242dc99daa1ebc983dc8f", "placeholder": "​", "style": "IPY_MODEL_dd39ca117b5c491595673c4bed83744a", "value": " 279/279 [00:00<00:00, 13.0kB/s]" } }, "f58c8396009441189b461246aa615a54": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "0b8814761c014f2f96e9a6f1b8f5b10a": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, 
"border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "be51a336a39a4ef3b8125d230d35205d": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "2feb20f2e8d440718cb5dc3426294a81": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, 
"max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "59bf397dd37f441db63314b6748fc1c1": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "4b29621ed2a242dc99daa1ebc983dc8f": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "dd39ca117b5c491595673c4bed83744a": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", 
"model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "3e7be8aa507441b5ab7bf4d23d3afb41": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_ae6b8258503c455c8ed1ac95b36ac0a6", "IPY_MODEL_8ad7f38f61684a8185d959090e13609c", "IPY_MODEL_abfa07cf95f64cbd973c7438996089b6" ], "layout": "IPY_MODEL_de66a31a420a44aca3bc98240978787e" } }, "ae6b8258503c455c8ed1ac95b36ac0a6": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_ec2611b26d344c7b93a47bfb2335c1d7", "placeholder": "​", "style": "IPY_MODEL_9c3aad95488a4edbaf880b20588225d6", "value": "config.json: 100%" } }, "8ad7f38f61684a8185d959090e13609c": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": 
"ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_055cf9c25db64977a3d92255074f6510", "max": 801, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_d93361ce3040400e8288f2fc52dfad38", "value": 801 } }, "abfa07cf95f64cbd973c7438996089b6": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_e53db6a60f5745c1ac10590b869525ad", "placeholder": "​", "style": "IPY_MODEL_c7765939991c4f259e83453f6cbc25ae", "value": " 801/801 [00:00<00:00, 30.4kB/s]" } }, "de66a31a420a44aca3bc98240978787e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": 
null, "visibility": null, "width": null } }, "ec2611b26d344c7b93a47bfb2335c1d7": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "9c3aad95488a4edbaf880b20588225d6": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "055cf9c25db64977a3d92255074f6510": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": 
"LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "d93361ce3040400e8288f2fc52dfad38": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "e53db6a60f5745c1ac10590b869525ad": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": 
null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "c7765939991c4f259e83453f6cbc25ae": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "ef92065e42924a929b6e68692828b150": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_7bf32013df954c038c2d332b2786b906", "IPY_MODEL_37e93cd73ae54ac9ae099143a7a63cf9", "IPY_MODEL_a32f0269bb0446afa3c07c3230fcc5a6" ], "layout": "IPY_MODEL_a26de4d1fb6245eca41e4ba975216963" } }, "7bf32013df954c038c2d332b2786b906": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_e0490758fecf45a8bfb6388e4cc88ed1", "placeholder": "​", "style": 
"IPY_MODEL_3ccf552947874aba978c096885e46e1a", "value": "model.safetensors: 100%" } }, "37e93cd73ae54ac9ae099143a7a63cf9": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_250c3b5c71074baea3f7d8f5d3035bd0", "max": 2239618772, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_10d41298d032446c82ce00a4ac9a1c3d", "value": 2239618772 } }, "a32f0269bb0446afa3c07c3230fcc5a6": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_ff2ce9f4ac01478fbc7c4189b6534cf9", "placeholder": "​", "style": "IPY_MODEL_a2e43fcdc48d448fbaf8fbcab6a2dcdc", "value": " 2.24G/2.24G [00:28<00:00, 33.3MB/s]" } }, "a26de4d1fb6245eca41e4ba975216963": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, 
"grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "e0490758fecf45a8bfb6388e4cc88ed1": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "3ccf552947874aba978c096885e46e1a": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", 
"_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "250c3b5c71074baea3f7d8f5d3035bd0": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "10d41298d032446c82ce00a4ac9a1c3d": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "ff2ce9f4ac01478fbc7c4189b6534cf9": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", 
"_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "a2e43fcdc48d448fbaf8fbcab6a2dcdc": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } } } }, "cells": [ { "cell_type": "markdown", "source": [ "# Midterm Certification Challenge: Building and Deploying a RAG Application\n", "DUE DATE: Before 4:00 PM PT on May 2 (before next Thursday's class!)\n", "\n", "You are to record the total time it takes you to complete the challenge.\n", "\n", "You have access to all boiler-plate code from the course, and we highly encourage you to leverage it!\n", "\n", "**Deliverables:**\n", "\n", "**Build 🏗️**\n", "\n", "* Data: Meta 10-k Filings\n", "* LLM: OpenAI GPT-3.5-turbo\n", "* Embedding Model: text-embedding-3-small\n", "* Infrastructure: LangChain or LlamaIndex (you choose)\n", "* Vector Store: Qdrant\n", "* Deployment: Chainlit, 
Hugging Face\n", "**Ship 🚢**\n", "\n", "* Evaluate your answers to the following questions\n", "\"What was the total value of 'Cash and cash equivalents' as of December 31, 2023?\"\n", "\"Who are Meta's 'Directors' (i.e., members of the Board of Directors)?\"\n", "* Record <10 min loom video walkthrough\n", "$$ Extra Credit: Baseline retrieval performance w/ RAGAS, change something about your RAG system to improve it, then show the improvement quantitatively!\n", "\n", "**Share 🚀**\n", "* Share lessons not yet learned in #aie2-general" ], "metadata": { "id": "uDsowVwcRyZ8" } }, { "cell_type": "markdown", "source": [ "## Install Dependencies" ], "metadata": { "id": "QojysEo6Soqb" } }, { "cell_type": "code", "source": [ "import nest_asyncio\n", "\n", "nest_asyncio.apply()" ], "metadata": { "id": "hizlCdZeh1i7" }, "execution_count": 1, "outputs": [] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "BXSRaRN2RixC", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "741f1a21-e5f1-41eb-c064-bb91d7d19962" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.4/15.4 MB\u001b[0m \u001b[31m67.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m51.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m312.9/312.9 kB\u001b[0m \u001b[31m27.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.6/75.6 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m141.9/141.9 kB\u001b[0m \u001b[31m11.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m73.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m290.4/290.4 kB\u001b[0m \u001b[31m32.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m11.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.3/49.3 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h" ] } ], "source": [ "!pip install llama-parse llama_index -qU" ] }, { "cell_type": "markdown", "source": [ "## Set Environment Variables" ], "metadata": { "id": "8tpilHaaSoSg" } }, { "cell_type": "code", "source": [ "import os\n", "from getpass import getpass\n", "\n", "# set openai key\n", "os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API Key:\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "AfvigvZfTXjX", "outputId": "d6e11044-c229-4281-f8f2-320b93515875" }, "execution_count": 3, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "OpenAI API Key:··········\n" ] } ] }, { "cell_type": "code", "source": [ "# set llama cloud key\n", "os.environ[\"LLAMA_CLOUD_API_KEY\"] = getpass(\"Llama Cloud API Key:\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4WVa_areeqb4", "outputId": "1aa743aa-e178-483b-fd45-4028a2ed2591" }, "execution_count": 4, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Llama Cloud API Key:··········\n" ] } ] }, { "cell_type": "markdown", "source": [ "## Download the Data" ], "metadata": { "id": "p0BhZDfbk2KT" } }, { "cell_type": 
"code", "source": [ "# download the data\n", "!mkdir 'data'\n", "!wget 'https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf' -O 'data/Meta_10k.pdf'" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "XJbBJ17zieee", "outputId": "65bd41b0-fffe-4987-f02c-3af985ee9670" }, "execution_count": 5, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "--2024-05-02 08:59:39-- https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf\n", "Resolving d18rn0p25nwr6d.cloudfront.net (d18rn0p25nwr6d.cloudfront.net)... 108.138.113.53, 108.138.113.114, 108.138.113.63, ...\n", "Connecting to d18rn0p25nwr6d.cloudfront.net (d18rn0p25nwr6d.cloudfront.net)|108.138.113.53|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 2481466 (2.4M) [application/pdf]\n", "Saving to: ‘data/Meta_10k.pdf’\n", "\n", "\rdata/Meta_10k.pdf 0%[ ] 0 --.-KB/s \rdata/Meta_10k.pdf 100%[===================>] 2.37M --.-KB/s in 0.05s \n", "\n", "2024-05-02 08:59:40 (46.0 MB/s) - ‘data/Meta_10k.pdf’ saved [2481466/2481466]\n", "\n" ] } ] }, { "cell_type": "markdown", "source": [ "## Llama Index Vanilla RAG" ], "metadata": { "id": "IlpbWxQoTPuM" } }, { "cell_type": "code", "source": [ "# import dependencies\n", "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n", "\n", "# load the document\n", "documents = SimpleDirectoryReader('data').load_data()\n", "\n", "# create an index from the documents\n", "index = VectorStoreIndex.from_documents(documents)\n", "\n", "# create a query engine for the index\n", "query_engine = index.as_query_engine()" ], "metadata": { "id": "DkSbBHkLTZCG" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# check how many documents were loaded\n", "len(documents)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wGJgx_BWjGI1", "outputId": "1e257a1f-d788-4052-ce7f-40d3b4025cfd" 
}, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "147" ] }, "metadata": {}, "execution_count": 15 } ] }, { "cell_type": "code", "source": [ "# query the engine\n", "query1 = \"What was the total value of 'Cash and cash equivalents' as of December 31, 2023?\"\n", "response = query_engine.query(query1)\n", "print(response)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "INwqOrGajLLz", "outputId": "94cd81ea-b011-4396-f735-130771e5934a" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "The total value of 'Cash and cash equivalents' as of December 31, 2023 was $14.681 billion.\n" ] } ] }, { "cell_type": "code", "source": [ "# query the engine\n", "query2 = \"Who are Meta's 'Directors' (i.e., members of the Board of Directors)?\"\n", "response = query_engine.query(query2)\n", "print(response)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "1413b41c-d5b6-497d-cfa8-022769d6e841", "id": "tjuGkUUZkQHS" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Meta's 'Directors' or members of the Board of Directors include individuals who have the authority to call stockholders, such as a majority of the board of directors, the chairman of the board of directors, the chief executive officer, or the president.\n" ] } ] }, { "cell_type": "markdown", "source": [ "## RAG with Llama Parse" ], "metadata": { "id": "gpIQKC5mTPq4" } }, { "cell_type": "code", "source": [ "# import dependencies\n", "from llama_parse import LlamaParse\n", "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n", "\n", "# setup parser\n", "parser = LlamaParse(\n", " result_type=\"markdown\"\n", ")\n", "\n", "# load and parse the document\n", "file_extractor = {\".pdf\": parser}\n", "documents = SimpleDirectoryReader(\n", " input_files=['data/Meta_10k.pdf'],\n", " 
file_extractor=file_extractor\n", ").load_data()\n", "\n", "# create an index from the parsed markdown\n", "index = VectorStoreIndex.from_documents(documents)\n", "\n", "# create a new query engine for the index\n", "new_query_engine = index.as_query_engine()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4pkkZjz1kt4J", "outputId": "dc110ed9-b54c-4d09-bbdb-dbc87407f360" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Started parsing the file under job_id 192255e6-0a6d-4ad7-9d19-b0b8d2c67cad\n" ] } ] }, { "cell_type": "code", "source": [ "len(documents)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "td1IQrSlmo5u", "outputId": "3846af7a-7aeb-42d0-e18a-0f3736195e87" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "1" ] }, "metadata": {}, "execution_count": 7 } ] }, { "cell_type": "code", "source": [ "documents[0].text[:1000]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 140 }, "id": "PfqaJ7ovWqL7", "outputId": "c2df90ab-6d0b-4dcc-b1ad-4ebc5263a9f9" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "\"## UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549 FORM 10-K\\n\\n(Mark One) ☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the fiscal year ended December 31, 2023\\n\\n☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 or For the transition period from to Commission File Number: 001-35551\\n\\nMeta Platforms, Inc. Meta (Exact name of registrant as specified in its charter) Delaware 20-1665019 (State or other jurisdiction of incorporation or organization) 1 Meta Way, Menlo Park, California 94025 (I.R.S. 
Employer Identification Number) (Address of principal executive offices and Zip Code) (650) 543-4800 (Registrant's telephone number, including area code) Securities registered pursuant to Section 12(b) of the Act:\\n\\n|Title of each class|Trading symbol(s)|Name of each exchange on which registered|\\n|---|---|---|\\n|Class A Common Stock, $0.000006 par value|META|The Nasdaq Stock Market LLC|\\n\\nSecuri\"" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 11 } ] }, { "cell_type": "code", "source": [ "# query the engine\n", "query1 = \"What was the total value of 'Cash and cash equivalents' as of December 31, 2023?\"\n", "response = new_query_engine.query(query1)\n", "print(response)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "f647baea-e0c9-4b21-d392-e5c43fbf4ae4", "id": "aIUIGg9Tm3x0" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "$65.40 billion\n" ] } ] }, { "cell_type": "code", "source": [ "# query the engine\n", "query2 = \"Who are Meta's 'Directors' (i.e., members of the Board of Directors)?\"\n", "response = new_query_engine.query(query2)\n", "print(response)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "51b490f1-3447-47f2-86d5-0ac875043e9c", "id": "rs5vUFJmm3x0" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "The members of Meta's Board of Directors are referred to as the Compensation, Nominating & Governance Committee of the Board.\n" ] } ] }, { "cell_type": "markdown", "source": [ "## RAG with Llama Parse + Recursive Chunking" ], "metadata": { "id": "4cRtp0oHxhZ9" } }, { "cell_type": "code", "source": [ "# import dependencies\n", "from llama_parse import LlamaParse\n", "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n", "from llama_index.core.node_parser import MarkdownElementNodeParser\n", "\n", 
"parsing_instruction = \"\"\"The provided document is a annual report filed by Meta Platforms,\n", " Inc. with the Securities and Exchange Commission (SEC).\n", " This form provides detailed financial information about the company's performance for a specific year.\n", " It includes unaudited financial statements, management discussion and analysis, and other relevant disclosures required by the SEC.\n", " It contains many tables.\n", " Try to be precise while answering the questions\"\"\"\n", "\n", "# setup parser\n", "parser = LlamaParse(\n", " result_type=\"markdown\",\n", " parsing_instruction=parsing_instruction\n", "\n", ")\n", "\n", "# load and parse the document\n", "file_extractor = {\".pdf\": parser}\n", "documents = SimpleDirectoryReader(\n", " input_files=['data/Meta_10k.pdf'],\n", " file_extractor=file_extractor\n", ").load_data()\n", "\n", "# setup markdown node parser\n", "node_parser = MarkdownElementNodeParser()\n", "\n", "# parse the markdown document\n", "nodes = node_parser.get_nodes_from_documents(documents)\n", "base_nodes, objects = node_parser.get_nodes_and_objects(nodes)\n", "\n", "# create an index from the parsed markdown\n", "index = VectorStoreIndex(nodes=base_nodes+objects)\n", "\n", "# create a new query engine for the index\n", "recursive_query_engine = index.as_query_engine()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "c740f9b7-0246-46ee-dd71-83fb0c5cd189", "id": "vrpkJFXCxhZ-" }, "execution_count": 22, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Started parsing the file under job_id e22c88c7-4e1b-46b0-b5fb-c02f8ed1fb41\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "70it [00:00, 16995.73it/s]\n", " 6%|▌ | 4/70 [00:03<00:48, 1.37it/s]WARNING:llama_index.core.response_synthesizers.refine:Validation error on structured response: 1 validation error for TableOutput\n", "columns\n", " field required (type=value_error.missing)\n", "Traceback (most recent call 
last):\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py\", line 482, in _agive_response_single\n", " structured_response = await program.acall(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py\", line 92, in acall\n", " answer = await self._llm.astructured_predict(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\", line 307, in async_wrapper\n", " result = await func(*args, **kwargs)\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/llms/llm.py\", line 391, in astructured_predict\n", " result = await program.acall(**prompt_args)\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/program/openai/base.py\", line 223, in acall\n", " return _parse_tool_calls(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/program/openai/base.py\", line 67, in _parse_tool_calls\n", " output = output_cls.parse_raw(function_call.arguments)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 549, in parse_raw\n", " return cls.parse_obj(obj)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 526, in parse_obj\n", " return cls(**obj)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 341, in __init__\n", " raise validation_error\n", "pydantic.v1.error_wrappers.ValidationError: 1 validation error for TableOutput\n", "columns\n", " field required (type=value_error.missing)\n", " 7%|▋ | 5/70 [00:03<00:42, 1.51it/s]WARNING:llama_index.core.response_synthesizers.refine:Validation error on structured response: 1 validation error for TableOutput\n", "columns\n", " field required (type=value_error.missing)\n", "Traceback (most recent call last):\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py\", line 482, in _agive_response_single\n", " 
structured_response = await program.acall(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py\", line 92, in acall\n", " answer = await self._llm.astructured_predict(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\", line 307, in async_wrapper\n", " result = await func(*args, **kwargs)\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/llms/llm.py\", line 391, in astructured_predict\n", " result = await program.acall(**prompt_args)\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/program/openai/base.py\", line 223, in acall\n", " return _parse_tool_calls(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/program/openai/base.py\", line 67, in _parse_tool_calls\n", " output = output_cls.parse_raw(function_call.arguments)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 549, in parse_raw\n", " return cls.parse_obj(obj)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 526, in parse_obj\n", " return cls(**obj)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 341, in __init__\n", " raise validation_error\n", "pydantic.v1.error_wrappers.ValidationError: 1 validation error for TableOutput\n", "columns\n", " field required (type=value_error.missing)\n", " 33%|███▎ | 23/70 [00:15<00:27, 1.70it/s]WARNING:llama_index.core.response_synthesizers.refine:Validation error on structured response: 1 validation error for TableOutput\n", "columns\n", " field required (type=value_error.missing)\n", "Traceback (most recent call last):\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py\", line 482, in _agive_response_single\n", " structured_response = await program.acall(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py\", line 92, in 
acall\n", " answer = await self._llm.astructured_predict(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\", line 307, in async_wrapper\n", " result = await func(*args, **kwargs)\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/llms/llm.py\", line 391, in astructured_predict\n", " result = await program.acall(**prompt_args)\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/program/openai/base.py\", line 223, in acall\n", " return _parse_tool_calls(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/program/openai/base.py\", line 67, in _parse_tool_calls\n", " output = output_cls.parse_raw(function_call.arguments)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 549, in parse_raw\n", " return cls.parse_obj(obj)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 526, in parse_obj\n", " return cls(**obj)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 341, in __init__\n", " raise validation_error\n", "pydantic.v1.error_wrappers.ValidationError: 1 validation error for TableOutput\n", "columns\n", " field required (type=value_error.missing)\n", " 41%|████▏ | 29/70 [00:19<00:23, 1.73it/s]WARNING:llama_index.core.response_synthesizers.refine:Validation error on structured response: 1 validation error for TableOutput\n", "columns\n", " field required (type=value_error.missing)\n", "Traceback (most recent call last):\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py\", line 482, in _agive_response_single\n", " structured_response = await program.acall(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py\", line 92, in acall\n", " answer = await self._llm.astructured_predict(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\", line 
307, in async_wrapper\n", " result = await func(*args, **kwargs)\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/llms/llm.py\", line 391, in astructured_predict\n", " result = await program.acall(**prompt_args)\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/program/openai/base.py\", line 223, in acall\n", " return _parse_tool_calls(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/program/openai/base.py\", line 67, in _parse_tool_calls\n", " output = output_cls.parse_raw(function_call.arguments)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 549, in parse_raw\n", " return cls.parse_obj(obj)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 526, in parse_obj\n", " return cls(**obj)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 341, in __init__\n", " raise validation_error\n", "pydantic.v1.error_wrappers.ValidationError: 1 validation error for TableOutput\n", "columns\n", " field required (type=value_error.missing)\n", " 46%|████▌ | 32/70 [00:23<00:30, 1.26it/s]WARNING:llama_index.core.response_synthesizers.refine:Validation error on structured response: 1 validation error for TableOutput\n", "columns\n", " field required (type=value_error.missing)\n", "Traceback (most recent call last):\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py\", line 482, in _agive_response_single\n", " structured_response = await program.acall(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py\", line 92, in acall\n", " answer = await self._llm.astructured_predict(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\", line 307, in async_wrapper\n", " result = await func(*args, **kwargs)\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/llms/llm.py\", line 391, in 
astructured_predict\n", " result = await program.acall(**prompt_args)\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/program/openai/base.py\", line 223, in acall\n", " return _parse_tool_calls(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/program/openai/base.py\", line 67, in _parse_tool_calls\n", " output = output_cls.parse_raw(function_call.arguments)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 549, in parse_raw\n", " return cls.parse_obj(obj)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 526, in parse_obj\n", " return cls(**obj)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 341, in __init__\n", " raise validation_error\n", "pydantic.v1.error_wrappers.ValidationError: 1 validation error for TableOutput\n", "columns\n", " field required (type=value_error.missing)\n", "WARNING:llama_index.core.response_synthesizers.refine:Validation error on structured response: 1 validation error for TableOutput\n", "columns\n", " field required (type=value_error.missing)\n", "Traceback (most recent call last):\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py\", line 482, in _agive_response_single\n", " structured_response = await program.acall(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py\", line 92, in acall\n", " answer = await self._llm.astructured_predict(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\", line 307, in async_wrapper\n", " result = await func(*args, **kwargs)\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/llms/llm.py\", line 391, in astructured_predict\n", " result = await program.acall(**prompt_args)\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/program/openai/base.py\", line 223, in acall\n", " return 
_parse_tool_calls(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/program/openai/base.py\", line 67, in _parse_tool_calls\n", " output = output_cls.parse_raw(function_call.arguments)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 549, in parse_raw\n", " return cls.parse_obj(obj)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 526, in parse_obj\n", " return cls(**obj)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 341, in __init__\n", " raise validation_error\n", "pydantic.v1.error_wrappers.ValidationError: 1 validation error for TableOutput\n", "columns\n", " field required (type=value_error.missing)\n", " 50%|█████ | 35/70 [00:25<00:24, 1.43it/s]WARNING:llama_index.core.response_synthesizers.refine:Validation error on structured response: 1 validation error for TableOutput\n", "columns\n", " field required (type=value_error.missing)\n", "Traceback (most recent call last):\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py\", line 482, in _agive_response_single\n", " structured_response = await program.acall(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py\", line 92, in acall\n", " answer = await self._llm.astructured_predict(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\", line 307, in async_wrapper\n", " result = await func(*args, **kwargs)\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/llms/llm.py\", line 391, in astructured_predict\n", " result = await program.acall(**prompt_args)\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/program/openai/base.py\", line 223, in acall\n", " return _parse_tool_calls(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/program/openai/base.py\", line 67, in _parse_tool_calls\n", " output = 
output_cls.parse_raw(function_call.arguments)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 549, in parse_raw\n", " return cls.parse_obj(obj)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 526, in parse_obj\n", " return cls(**obj)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 341, in __init__\n", " raise validation_error\n", "pydantic.v1.error_wrappers.ValidationError: 1 validation error for TableOutput\n", "columns\n", " field required (type=value_error.missing)\n", " 54%|█████▍ | 38/70 [00:27<00:17, 1.85it/s]WARNING:llama_index.core.response_synthesizers.refine:Validation error on structured response: 1 validation error for TableOutput\n", "columns\n", " field required (type=value_error.missing)\n", "Traceback (most recent call last):\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py\", line 482, in _agive_response_single\n", " structured_response = await program.acall(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py\", line 92, in acall\n", " answer = await self._llm.astructured_predict(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\", line 307, in async_wrapper\n", " result = await func(*args, **kwargs)\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/llms/llm.py\", line 391, in astructured_predict\n", " result = await program.acall(**prompt_args)\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/program/openai/base.py\", line 223, in acall\n", " return _parse_tool_calls(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/program/openai/base.py\", line 67, in _parse_tool_calls\n", " output = output_cls.parse_raw(function_call.arguments)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 549, in parse_raw\n", " return 
cls.parse_obj(obj)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 526, in parse_obj\n", " return cls(**obj)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 341, in __init__\n", " raise validation_error\n", "pydantic.v1.error_wrappers.ValidationError: 1 validation error for TableOutput\n", "columns\n", " field required (type=value_error.missing)\n", " 93%|█████████▎| 65/70 [00:44<00:03, 1.47it/s]WARNING:llama_index.core.response_synthesizers.refine:Validation error on structured response: 1 validation error for TableOutput\n", "columns\n", " field required (type=value_error.missing)\n", "Traceback (most recent call last):\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py\", line 482, in _agive_response_single\n", " structured_response = await program.acall(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py\", line 92, in acall\n", " answer = await self._llm.astructured_predict(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/instrumentation/dispatcher.py\", line 307, in async_wrapper\n", " result = await func(*args, **kwargs)\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/core/llms/llm.py\", line 391, in astructured_predict\n", " result = await program.acall(**prompt_args)\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/program/openai/base.py\", line 223, in acall\n", " return _parse_tool_calls(\n", " File \"/usr/local/lib/python3.10/dist-packages/llama_index/program/openai/base.py\", line 67, in _parse_tool_calls\n", " output = output_cls.parse_raw(function_call.arguments)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 549, in parse_raw\n", " return cls.parse_obj(obj)\n", " File \"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 526, in parse_obj\n", " return cls(**obj)\n", " File 
\"/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py\", line 341, in __init__\n", " raise validation_error\n", "pydantic.v1.error_wrappers.ValidationError: 1 validation error for TableOutput\n", "columns\n", " field required (type=value_error.missing)\n", "100%|██████████| 70/70 [00:48<00:00, 1.44it/s]\n" ] } ] }, { "cell_type": "code", "source": [ "len(nodes)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "3acdb382-693c-434e-885e-381656d46cb9", "id": "DvNAzF1NxhZ-" }, "execution_count": 23, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "269" ] }, "metadata": {}, "execution_count": 23 } ] }, { "cell_type": "code", "source": [ "len(base_nodes)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "d6e5ee64-d441-4888-fa60-0f74d590c704", "id": "ydp5ZTz7zZ0c" }, "execution_count": 24, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "129" ] }, "metadata": {}, "execution_count": 24 } ] }, { "cell_type": "code", "source": [ "# query the engine\n", "query1 = \"What was the total value of 'Cash and cash equivalents' as of December 31, 2023?\"\n", "response = recursive_query_engine.query(query1)\n", "print(response)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "90475e20-942f-49a5-c03d-d1c7176c76b5", "id": "Cv2WYsIexhZ-" }, "execution_count": 25, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "The total value of 'Cash and cash equivalents' as of December 31, 2023, was $41,862.\n" ] } ] }, { "cell_type": "code", "source": [ "# query the engine\n", "query2 = \"Who are Meta's 'Directors' (i.e., members of the Board of Directors)?\"\n", "response = recursive_query_engine.query(query2)\n", "print(response)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "1da178cb-bc97-4026-ad0d-b8fb648658b1", "id": "PhjQjIg9xhZ-" }, "execution_count": 26, "outputs": [ { 
"output_type": "stream", "name": "stdout", "text": [ "Mark Zuckerberg is the Board Chair and Chief Executive Officer of Meta Platforms, Inc. Susan Li is the Chief Financial Officer of Meta Platforms, Inc.\n" ] } ] },
{ "cell_type": "markdown", "source": [ "## RAG with Llama Parse + LangChain RecursiveCharacterTextSplitter" ], "metadata": { "id": "j9I3akvG2AjJ" } },
{ "cell_type": "code", "source": [ "%pip install -qU langchain langchain-core langchain-community langchain-openai unstructured" ], "metadata": { "id": "uZJ6AIUs3c4j" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [ "%pip install -qU qdrant-client" ], "metadata": { "id": "5quBcn6K39hF" }, "execution_count": null, "outputs": [] },
{ "cell_type": "markdown", "source": [ "Let's first save the llama_parse markdown document" ], "metadata": { "id": "QiAk6mbE6E4L" } },
{ "cell_type": "code", "source": [
"# import dependencies\n",
"from llama_parse import LlamaParse\n",
"from llama_index.core import SimpleDirectoryReader\n",
"\n",
"# Define the instruction BEFORE constructing the parser: building the parser first\n",
"# would hand it the stale `parsing_instruction` left in the kernel by an earlier cell,\n",
"# so the signature-table instruction below would never reach LlamaParse.\n",
"parsing_instruction = \"\"\"The provided document is an annual report filed by Meta Platforms, Inc. with the Securities and Exchange Commission (SEC).\n",
"This form provides detailed financial information about the company's performance for a specific year.\n",
"It includes financial statements, management discussion and analysis, and other relevant disclosures required by the SEC.\n",
"It contains many tables and some signature pages.\n",
"\n",
"Extract the signatures as a table.\n",
"\"\"\"\n",
"\n",
"# set up the parser with the instruction defined above\n",
"parser = LlamaParse(\n",
"    result_type=\"markdown\",\n",
"    parsing_instruction=parsing_instruction\n",
")\n",
"\n",
"# load and parse the document\n",
"file_extractor = {\".pdf\": parser}\n",
"llama_parse_documents = SimpleDirectoryReader(\n",
"    input_files=['data/Meta_10k.pdf'],\n",
"    file_extractor=file_extractor\n",
").load_data()\n",
"\n",
"# save the markdown file; mode \"w\" overwrites any previous copy so that re-running\n",
"# this cell does not append a second copy of the parsed report to output.md\n",
"data_file = \"./data/output.md\"\n",
"with open(data_file, \"w\", encoding=\"utf-8\") as f:\n",
"    for doc in llama_parse_documents:\n",
"        f.write(doc.text + '\\n')"
], "metadata": { "id": "Q9LXiA4U12aM" }, "execution_count": null, "outputs": [] },
{ "cell_type": "markdown", "source": [ "Now we'll set up the LangChain RAG pipeline with Qdrant" ], "metadata": { "id": "JlwmRXZi6PYu" } },
{ "cell_type": "code", "source": [
"# import dependencies\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"from langchain_community.vectorstores import Qdrant\n",
"from langchain_community.document_loaders import DirectoryLoader\n",
"from langchain_openai.embeddings import OpenAIEmbeddings\n",
"\n",
"# load the markdown document(s) found under data/\n",
"loader = DirectoryLoader(path='data/', glob=\"**/*.md\", show_progress=True)\n",
"documents = loader.load()\n",
"\n",
"# split the document into chunks\n",
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)\n",
"docs = text_splitter.split_documents(documents)\n",
"\n",
"# instantiate embeddings\n",
"embeddings = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n",
"\n",
"# create the vectorstore (in-memory Qdrant collection)\n",
"qdrant_vector_store = Qdrant.from_documents(\n",
"    documents=docs,\n",
"    embedding=embeddings,\n",
"    location=\":memory:\",\n",
"    collection_name=\"meta_10k\"\n",
")\n",
"\n",
"# set up our retriever\n",
"qdrant_retriever = qdrant_vector_store.as_retriever()"
], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Mkq3v2XX5oY0", "outputId": "9c5e5da6-2546-4274-a83f-43f28b560c2e" }, "execution_count": 144, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "100%|██████████| 1/1 [00:04<00:00, 4.01s/it]\n" ] } ] },
{ "cell_type": "markdown", "source": [ "Set up the RAG prompt" ], "metadata": { "id": "n5l5oZwq_1zq" } },
{ "cell_type": "code", "source": [
"from langchain_core.prompts import ChatPromptTemplate\n",
"\n",
"RAG_PROMPT = \"\"\"\n",
"CONTEXT:\n",
"{context}\n",
"\n",
"QUERY:\n",
"{question}\n",
"\n",
"The provided context is an annual report filed by Meta Platforms, Inc. with the Securities and Exchange Commission (SEC).\n",
"This form provides detailed financial information about the company's performance for a specific year.\n",
"It includes financial statements, management discussion and analysis, and other relevant disclosures required by the SEC.\n",
"It contains many tables and some signature pages. All members of the board need to sign the document.\n",
"\n",
"Answer the query above only using the context provided. If you don't know the answer, simply say 'I don't know'.\n",
"\"\"\"\n",
"\n",
"rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)"
], "metadata": { "id": "MFvFEItd_4nq" }, "execution_count": 188, "outputs": [] },
{ "cell_type": "markdown", "source": [ "Finally, we create our chain..."
], "metadata": { "id": "SlmhFU4DACib" } }, { "cell_type": "code", "source": [ "from operator import itemgetter\n", "from langchain_core.runnables import RunnablePassthrough\n", "from langchain_core.output_parsers import StrOutputParser\n", "from langchain_openai import ChatOpenAI\n", "\n", "chat_model = ChatOpenAI(model=\"gpt-3.5-turbo\")\n", "\n", "rag_chain = (\n", " {\"question\": itemgetter(\"question\"), \"context\": itemgetter(\"question\") | qdrant_retriever}\n", " | RunnablePassthrough().assign(context=itemgetter(\"context\"))\n", " | {\"response\":rag_prompt | chat_model | StrOutputParser(), \"context\": itemgetter(\"context\")}\n", ")" ], "metadata": { "id": "ptOBMYQiAHKG" }, "execution_count": 189, "outputs": [] }, { "cell_type": "code", "source": [ "# query the rag_chain\n", "query1 = \"What was the total value of 'Cash and cash equivalents' as of December 31, 2023?\"\n", "response = rag_chain.invoke({\"question\": query1})\n", "print(response['response'])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "727522fb-f00f-4070-c526-5c9c34a32745", "id": "tO61pDX012aN" }, "execution_count": 190, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "The total value of 'Cash and cash equivalents' as of December 31, 2023, was $41,862 million.\n" ] } ] }, { "cell_type": "code", "source": [ "for context in response['context']:\n", " print(context)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "v03JzLwwjIBm", "outputId": "4b19b810-42e5-4774-c1e8-4aff729c22a7" }, "execution_count": 191, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "page_content='Due within one year $ December 31, 2023 7,120 Due after one year to five years 16,421 Total 23,541\\n\\nInstruments Measured at Fair Value on Non-recurring Basis\\n\\nOur non-marketable equity securities accounted for using the measurement alternative are measured at fair value on a non-recurring basis and are classified 
within Level 3 of the fair value hierarchy because we use significant unobservable inputs to estimate their fair value. Assets remeasured at fair value on a non-recurring basis within Level 3 during the years ended December 31, 2023 and 2022 were $53 million and $198 million, respectively. For additional information, see Note 6 — Non-marketable Equity Securities.\\n\\nNote 6. Non-marketable Equity Securities\\n\\nOur non-marketable equity securities are investments in privately-held companies without readily determinable fair values. The following table summarizes our non-marketable equity securities that were measured using measurement alternative and equity method (in millions):\\n\\n2023 December 31 2022 Non-marketable equity securities under measurement alternative Initial cost $ 6,389 $ 6,388 Cumulative upward adjustments 293 293 Cumulative impairment/downward adjustments (599) (497) Carrying value 6,083 6,184 Non-marketable equity securities under equity method 58 17 Total $ 6,141 $ 6,201\\n\\nDuring the years ended December 31, 2023, 2022 and 2021, impairment and downward adjustments recorded for our non-marketable equity securities that were measured using measurement alternative was $101 million, $447 million, and immaterial, respectively.\\n\\nSignatures\\n\\nSignature Date John Doe January 15, 2024 Jane Smith January 16, 2024 --- # Meta Platforms, Inc. - Annual Report\\n\\nTable of Contents\\n\\nNote 7. Property and Equipment' metadata={'source': 'data/output.md', '_id': '1243fe46517745f19824a921970b7315', '_collection_name': 'meta_10k'}\n", "page_content='Cash Provided by Operating Activities\\n\\nCash provided by operating activities during 2023 mostly consisted of $39.10 billion net income adjusted for certain non-cash items, such as $14.03 billion of share-based compensation expense and $11.18 billion of depreciation and amortization expense, as well as $3.29 billion of favorable changes in working capital. 
The increase in cash flows from operating activities during 2023 compared to 2022 was mostly due to an increase in cash collection from our customers driven by the increase in revenue, and a decrease in payments to our vendors.\\n\\nCash Used in Investing Activities\\n\\nCash used in investing activities during 2023 mostly consisted of $27.05 billion of net purchases of property and equipment as we continued to invest in data centers, servers, and network infrastructure, partially offset by $3.20 billion net proceeds from maturities and sales of marketable debt securities. The decrease in cash used in investing activities during 2023 compared to 2022 was mostly due to a decrease in purchases of property and equipment.\\n\\nWe anticipate making capital expenditures of approximately $30 billion to $37 billion in 2024.\\n\\nCash Used in Financing Activities\\n\\nCash used in financing activities during 2023 mostly consisted of $19.77 billion for repurchases of our Class A common stock and $7.01 billion of taxes paid related to net share settlement of RSUs, partially offset by $8.46 billion proceeds from the issuance of the Notes in May 2023. The decrease in cash used in financing activities during 2023 compared to 2022 was mainly due to a decrease in cash paid for repurchases of our Class A common stock, partially offset by an increase in taxes paid related to net share settlement of employee RSU awards and a decrease in net proceeds from our debt offerings.\\n\\nFree Cash Flow\\n\\nIn addition to other financial measures presented in accordance with U.S.\\n\\nMeta Platforms, Inc. 
Annual Report\\n\\nTable of Contents' metadata={'source': 'data/output.md', '_id': 'd7253455de1f416aa1828818a128613e', '_collection_name': 'meta_10k'}\n", "page_content='Year Operating Leases Finance Leases 2024 $2,219 $111 2025 $2,330 $64 2026 $2,264 $64 2027 $2,233 $60 2028 $2,112 $60 Thereafter $12,491 $492 Total undiscounted cash flows $23,649 $851 Less: Imputed interest ($4,800) ($161) Present value of lease liabilities (1) $18,849 $690\\n\\nLease liabilities, current: $1,623 (2023) and $90 (December 31, 2022)\\n\\nLease liabilities, non-current: $17,226 (2023) and $600 (December 31, 2022)\\n\\nPresent value of lease liabilities (1): $18,849 (2023) and $690 (December 31, 2022)\\n\\n(1) Lease liabilities include operating leases under restructuring as a part of our facilities consolidation efforts. For additional information, see Note 3 — Restructuring.\\n\\nThe table above does not include lease payments that were not fixed at commencement or lease modification. As of December 31, 2023, we have additional operating and finance leases, that have not yet commenced, with lease obligations of approximately $7.07 billion and $1.37 billion, respectively, mostly for data centers, colocations, and network infrastructure. These operating and finance leases will commence between 2024 and 2028 with lease terms of greater than one year to 30 years.\\n\\nSignature Table:\\n\\nName Title Date [Signature 1] [Title 1] [Date 1] [Signature 2] [Title 2] [Date 2] --- # Meta Platforms, Inc. 
- Annual Report\\n\\nTable of Contents\\n\\nYear Ended December 31 2023 2022 2021 Cash paid for amounts included in the measurement of lease liabilities: Operating cash flows for operating leases (1) $2,233 $1,654 $1,406 Operating cash flows for finance leases $20 $16 $15 Financing cash flows for finance leases $1,058 $850 $677 Lease liabilities arising from obtaining right-of-use assets: Operating leases $4,370 $4,366 $4,466 Finance leases $588 $223 $160\\n\\n(1) Cash flows for operating leases during the year ended December 31, 2023 include cash paid for terminations of certain operating leases.\\n\\nNote 9. Acquisitions, Goodwill, and Intangible Assets' metadata={'source': 'data/output.md', '_id': '2c39dbf8b4a742ee9e1a2890783e50bf', '_collection_name': 'meta_10k'}\n", "page_content='106\\n\\nMeta Platforms, Inc. - Financial Instruments\\n\\nFinancial Instruments\\n\\nWe classify our cash equivalents and marketable debt securities within Level 1 or Level 2 because we use quoted market prices or alternative pricing sources and models utilizing market observable inputs to determine their fair value. Certain other assets are classified within Level 3 because factors used to develop the estimated fair value are unobservable inputs that are not supported by market activity.\\n\\nAssets Measured at Fair Value\\n\\nThe following tables summarize our assets measured at fair value on a recurring basis and the classification by level of input within the fair value hierarchy (in millions):\\n\\nDescription December 31, 2023 Quoted Prices in Active Markets for Identical Assets (Level 1) Significant Observable Inputs (Level 2) Significant Unobservable Inputs (Level 3) Cash $6,265 Cash equivalents: Money market funds $32,910 $32,910 Cash equivalents: U.S. government and agency securities $2,206 $2,206 Cash equivalents: Time deposits $261 $261 Cash equivalents: Corporate debt securities $220 $220 Total cash and cash equivalents $41,862 $35,116 $481 Marketable securities: U.S. 
government securities $8,439 $8,439 Marketable securities: U.S. government agency securities $3,498 $3,498 Marketable securities: Corporate debt securities $11,604 $11,604 Total marketable securities $23,541 $11,937 $11,604 Restricted cash equivalents $857 $857 Other assets $101 $101 Total $66,361 $47,910 $12,085 $101\\n\\n107\\n\\nMeta Platforms, Inc. - Annual Report\\n\\nTable of Contents\\n\\nSignatures\\n\\nName Title Date [Signature Name 1] [Title 1] [Date 1] [Signature Name 2] [Title 2] [Date 2] --- # Meta Platforms, Inc. - Annual Report\\n\\nTable of Contents\\n\\nContractual Maturities\\n\\nThe following table classifies our marketable debt securities by contractual maturities (in millions):\\n\\nDue within one year $ December 31, 2023 7,120 Due after one year to five years 16,421 Total 23,541' metadata={'source': 'data/output.md', '_id': 'b9799a5c490d4ce0892929f832bd58ba', '_collection_name': 'meta_10k'}\n" ] } ] }, { "cell_type": "code", "source": [ "# query the rag_chain\n", "query2 = \"Who are Meta's 'Directors' (i.e., members of the Board of Directors)?\"\n", "response = rag_chain.invoke({\"question\": query2})\n", "print(response['response'])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "5406199a-66e8-469f-a3ce-b2b417f4215d", "id": "l5KLh5Nh12aN" }, "execution_count": 193, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "The members of Meta's Board of Directors are listed as follows in the context provided:\n", "- Mark Zuckerberg\n", "- Susan Li\n", "- Aaron Anderson\n", "- Peggy Alford\n", "- Marc L. Andreessen\n", "- Andrew W. Houston\n", "- Nancy Killefer\n", "- Robert M. Kimmitt\n", "- Sheryl K. Sandberg\n", "- Tracey T. 
Travis\n", "- Tony Xu\n" ] } ] }, { "cell_type": "code", "source": [ "for context in response['context']:\n", " print(context)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "66754109-9ac7-43c8-d065-60b34173ed50", "id": "yzrKRr3tjpJi" }, "execution_count": 194, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "page_content='Sheryl Sandberg\\nChief Operating Officer\\nMarch 31, 2022\\n\\nMeta Platforms, Inc. - Annual Report\\n\\nTable of Contents\\n\\nItem 1B. Unresolved Staff Comments\\n\\nNone.\\n\\nItem 1C. Cybersecurity\\n\\nAt Meta, cybersecurity risk management is an important part of our overall risk management efforts. Our industry is prone to cybersecurity threats and attacks, and we regularly experience cybersecurity incidents of varying degrees. We believe we are a particularly attractive target as a result of our prominence and scale, the types and volume of personal data and content on our systems, and the evolving nature of our products and services. Our products and services reach billions of users and involve the collection, storage, processing, and transmission of a large amount of data. In addition, our business and operations span numerous geographies around the world, involve thousands of employees, contractors, vendors, developers, partners, and other third parties, and rely on software and hardware that is highly technical and complex. We maintain an information security program that is comprised of policies and controls designed to mitigate cybersecurity risk. However, at any given time, we face known and unknown cybersecurity risks and threats that are not fully mitigated, and we discover vulnerabilities in our program. We continuously work to enhance our information security program and risk management efforts.' 
metadata={'source': 'data/output.md', '_id': '166a3a79512b4077889df8dc10085852', '_collection_name': 'meta_10k'}\n", "page_content='Stockholders do not have the ability to cumulate votes for the election of directors. Our amended and restated certificate of incorporation and amended and\\nrestated bylaws provide for a classified board of directors consisting of three classes of approximately equal size, each serving staggered three-year terms,\\nwhen the outstanding shares of our Class B common stock represent less than a majority of the combined voting power of common stock.\\n\\nNo Preemptive or Similar Rights: Our common stock is not entitled to preemptive rights and is not subject to conversion, redemption or sinking fund provisions.\\n\\nSignatures\\n|Name|Title|Date|\\n|---|---|---|\\n|[Name]|[Title]|[Date]|\\n|[Name]|[Title]|[Date]|\\n\\nMeta Platforms, Inc. - Annual Report\\n\\nSignatures\\n\\nName\\nTitle\\nDate\\n\\nMark Zuckerberg\\nCEO\\nMarch 1, 2022\\n\\nSheryl Sandberg\\nCOO\\nMarch 1, 2022\\n\\nMeta Platforms, Inc. 
- Amended and Restated Certificate of Incorporation and Bylaw Provisions\\n\\nAmended and Restated Certificate of Incorporation and Bylaw Provisions\\n\\nOur amended and restated certificate of incorporation and our amended and restated bylaws include a number of provisions that may have the effect of deterring hostile takeovers or delaying or preventing changes in control of our company, even after such time as the shares of our Class B common stock no longer represent a majority of the combined voting power of our common stock.\\n\\nProvisions:\\n\\nSeparate Class B Vote for Certain Transactions.\\n\\nDual Class Stock.\\n\\nSupermajority Approvals.\\n\\nBoard of Directors Vacancies.\\n\\nClassified Board.\\n\\nStockholder Action; Special Meeting of Stockholders.\\n\\nProvision\\nDescription' metadata={'source': 'data/output.md', '_id': '61342de247d44c70a74ead81e8c3e262', '_collection_name': 'meta_10k'}\n", "page_content='Meta Platforms, Inc. - Signatures\\n\\nSignatures\\n\\nExhibit Number\\nExhibit Description\\nForm\\nFiled\\n\\n32.2#\\nCertification of Susan Li, Chief Financial Officer, pursuant to 18 U.S.C. Section 1350, as adopted pursuant to Section 906 of the Sarbanes-Oxley Act of 2002.\\n\\n97.1\\nCompensation Recoupment Policy.\\n\\n101.INS\\nInline XBRL Instance Document (the instance document does not appear in the Interactive Data File because its XBRL tags are embedded within the Inline XBRL document).\\n\\n101.SCH\\nInline XBRL Taxonomy Extension Schema Document.\\n\\n101.CAL\\nInline XBRL Taxonomy Extension Calculation Linkbase Document.\\n\\n101.DEF\\nInline XBRL Taxonomy Extension Definition Linkbase Document.\\n\\n101.LAB\\nInline XBRL Taxonomy Extension Labels Linkbase Document.\\n\\n101.PRE\\nInline XBRL Taxonomy Extension Presentation Linkbase Document.\\n\\n104\\nCover Page Interactive Data File (formatted as inline XBRL and contained in Exhibit 101).\\n\\nDate: February 1, 2024 META PLATFORMS, INC. 
/s/ Susan Li Susan Li Chief Financial Officer --- Signature Title --- --- /s/ Mark Zuckerberg Board Chair and Chief Executive Officer /s/ Susan Li Chief Financial Officer /S/ Aaron Anderson Chief Accounting Officer /s/ Peggy Alford Director /s/ Marc L. Andreessen Director /s/ Andrew W. Houston Director /s/ Nancy Killefer Director /s/ Robert M. Kimmitt Director /s/ Sheryl K. Sandberg Director /s/ Tracey T. Travis Director /s/ Tony Xu Director ---\\n\\nMeta Platforms, Inc. - Description of Capital Stock\\n\\nDescription of Capital Stock\\n\\nThe following description of capital stock of Meta Platforms, Inc. (the “company,” “we,” “us” and “our”) summarizes certain provisions of our amended\\nand restated certificate of incorporation and our amended and restated bylaws. The description is intended as a summary, and is qualified in its entirety by\\nreference to our amended and restated certificate of incorporation and our amended and restated bylaws, copies of which have been filed as exhibits to this\\nAnnual Report on Form 10-K.' metadata={'source': 'data/output.md', '_id': 'e2a55784464c45019fe5d8962beff567', '_collection_name': 'meta_10k'}\n", "page_content=\"Exhibits\\n|Exhibit Number|Exhibit Description|Form|File No.|Exhibit|Filing Date|Herewith|\\n|---|---|---|---|---|---|---|\\n|3.1|Amended and Restated Certificate of Incorporation.|8-K|001-35551|3.1|October 28, 2021| |\\n|3.2|Amended and Restated Bylaws.|8-K|001-35551|3.2|October 28, 2021| |\\n|4.1|Form of Class A Common Stock Certificate.|10-K|001-35551|4.1|February 3, 2022| |\\n|4.2|Form of Class B Common Stock Certificate.|10-K|001-35551|4.2|February 3, 2022| |\\n|4.3|Indenture, dated as of August 9, 2022, between Meta Platforms, Inc. and U.S. Bank Trust Company, National Association, as trustee.|8-K|001-35551|4.1|August 9, 2022| |\\n|4.4|First Supplemental Indenture, dated as of August 9, 2022, between Meta Platforms, Inc. and U.S. 
Bank Trust Company, National Association, as trustee.|8-K|001-35551|4.2|August 9, 2022| |\\n|4.5|Second Supplemental Indenture, dated as of May 3, 2023, by and between Meta Platforms, Inc. and U.S. Bank Trust Company, National Association, as trustee.|8-K|001-35551|4.1|May 3, 2023| |\\n|4.6|Description of Registrant's Capital Stock.| | |X| | |\\n|10.1+|Form of Indemnification Agreement.|8-K|001-35551|10.1|April 15, 2019| |\\n|10.2(A)+|2012 Equity Incentive Plan, as amended.|10-K|001-35551|10.2(A)|February 2, 2023| |\\n|10.2(B)+|Third Amendment to the 2012 Equity Incentive Plan.|10-K|001-35551|10.2(B)|February 2, 2023| |\\n|10.2(C)+|2012 Equity Incentive Plan forms of award agreements.|10-Q|001-35551|10.2|July 31, 2012| |\\n|10.2(D)+|2012 Equity Incentive Plan forms of award agreements (Additional Forms).|10-Q|001-35551|10.1|May 4, 2017| |\\n\\nMeta Platforms, Inc. - Signatures\\n\\nSignatures\\n\\nSigner\\nTitle\\nDate\\n\\nMark Zuckerberg\\nChief Executive Officer\\nNot specified\\n\\nSusan Li\\nChief Financial Officer\\nNot specified\\n\\nMark Zuckerberg\\nChief Executive Officer\\nNot specified\\n\\nMeta Platforms, Inc. 
- Signatures\\n\\nSignatures\\n\\nExhibit Number\\nExhibit Description\\nForm\\nFiled\" metadata={'source': 'data/output.md', '_id': 'a4b504a1de064c60a88cb4d9719f1339', '_collection_name': 'meta_10k'}\n" ] } ] }, { "cell_type": "markdown", "source": [ "## RAG with Llama Parse + Recursive Query Engine + Cohere Reranking" ], "metadata": { "id": "K9ocj7YxcyVm" } }, { "cell_type": "code", "source": [ "!pip install -qU llama-index-postprocessor-cohere-rerank" ], "metadata": { "id": "B0nq65C3c7qW", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "02da173a-486d-43a2-aae2-8f3f180106b1" }, "execution_count": 6, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m151.2/151.2 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m59.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h" ] } ] }, { "cell_type": "code", "source": [ "import os\n", "from getpass import getpass\n", "\n", "# set cohere api key\n", "os.environ['COHERE_API_KEY'] = getpass('Cohere API Key:')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Of-OSTMMdzvi", "outputId": "cd2f62f6-1bc0-4713-bc59-7490e6bb4d4f" }, "execution_count": 7, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Cohere API Key:··········\n" ] } ] }, { "cell_type": "code", "source": [ "# setup default llm and embedding model for the rag pipeline\n", "from llama_index.llms.openai import OpenAI\n", "from llama_index.embeddings.openai import OpenAIEmbedding\n", "from llama_index.core import Settings\n", "\n", "Settings.llm = OpenAI(model=\"gpt-3.5-turbo\")\n", "Settings.embed_model = OpenAIEmbedding(model=\"text-embedding-3-small\")" ], "metadata": { "id": "BCeaLJBHxpn1" }, "execution_count": 8, "outputs": [] }, { "cell_type": "code", "source": 
[ "# import dependencies\n", "from llama_parse import LlamaParse\n", "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n", "from llama_index.core.node_parser import MarkdownElementNodeParser\n", "from llama_index.postprocessor.cohere_rerank import CohereRerank\n", "\n", "# parsing_instruction = \"\"\"The provided document is an annual report filed by Meta Platforms, Inc. with the Securities and Exchange Commission (SEC).\n", "# This form provides detailed financial information about the company's performance for a specific year.\n", "# It includes financial statements, management discussion and analysis, and other relevant disclosures required by the SEC.\n", "# It contains many tables.\n", "\n", "# In the Power of Attorney section of this document, there is a signature table. Make sure to extract all the data from that signature table. Remove all of the '/s/' and '/s' signature prefixes and create a markdown table including the headers 'Signature', 'Title' and 'Date'.\n", "# \"\"\"\n", "\n", "parsing_instruction = \"\"\"The provided document is an annual report filed by Meta Platforms, Inc. with the Securities and Exchange Commission (SEC).\n", "This form provides detailed financial information about the company's performance for a specific year.\n", "It includes financial statements, management discussion and analysis, and other relevant disclosures required by the SEC.\n", "It contains many tables and some signature pages.\n", "\n", "Extract the signatures as a table.\n", "\"\"\"\n", "\n", "# parsing_instruction = \"\"\"The provided document is an annual report filed by Meta Platforms, Inc. 
with the Securities and Exchange Commission (SEC).\n", "# This form provides detailed financial information about the company's performance for a specific year.\n", "# It includes financial statements, management discussion and analysis, and other relevant disclosures required by the SEC.\n", "# It contains many tables.\n", "\n", "# There is a signature page inside this document.\n", "# Remove all of the '/s/' and '/s' signature prefixes and create a table from that page, with the headers 'Signature', 'Title' and 'Date'.\n", "# \"\"\"\n", "\n", "# setup parser\n", "parser = LlamaParse(\n", " result_type=\"markdown\",\n", " parsing_instruction=parsing_instruction\n", ")\n", "\n", "# load and parse the document\n", "file_extractor = {\".pdf\": parser}\n", "documents = SimpleDirectoryReader(\n", " input_files=['data/Meta_10k.pdf'],\n", " file_extractor=file_extractor\n", ").load_data()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "c34ab441-4791-4c32-fe72-4610f5d78fd3", "id": "7JoCZGK3cyVn" }, "execution_count": 96, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Started parsing the file under job_id 2c38731c-6e9c-41aa-8fb8-007d54d2d2d8\n", "...." 
] } ] }, { "cell_type": "code", "source": [ "# save the parsed markdown document\n", "# open in write mode (not append) so re-running this cell replaces output.md\n", "# instead of stacking another copy of the parsed report onto the previous one\n", "data_file = \"./data/output.md\"\n", "with open(data_file, \"w\", encoding=\"utf-8\") as f:\n", " for doc in documents:\n", " f.write(doc.text + '\\n')" ], "metadata": { "id": "wRfod7Gg3DFT" }, "execution_count": 97, "outputs": [] }, { "cell_type": "code", "source": [ "target_page = 132\n", "print(documents[0].text.split(\"\\n---\\n\")[target_page])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5oEEUY45pEA2", "outputId": "b41e659d-5990-40c1-f7a7-fbab32eee7da" }, "execution_count": 98, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "|Signature|Title|Date|\n", "|---|---|---|\n", "|/s/ Mark Zuckerberg|Board Chair and Chief Executive Officer|February 1, 2024|\n", "|/s/ Susan Li|Chief Financial Officer|February 1, 2024|\n", "|/S/ Aaron Anderson|Chief Accounting Officer|February 1, 2024|\n", "|/s/ Peggy Alford|Director|February 1, 2024|\n", "|/s/ Marc L. Andreessen|Director|February 1, 2024|\n", "|/s/ Andrew W. Houston|Director|February 1, 2024|\n", "|/s/ Nancy Killefer|Director|February 1, 2024|\n", "|/s/ Robert M. Kimmitt|Director|February 1, 2024|\n", "|/s/ Sheryl K. Sandberg|Director|February 1, 2024|\n", "|/s/ Tracey T. 
Travis|Director|February 1, 2024|\n", "|/s/ Tony Xu|Director|February 1, 2024|\n" ] } ] }, { "cell_type": "code", "source": [ "# setup markdown node parser\n", "node_parser = MarkdownElementNodeParser(llm=OpenAI(model=\"gpt-3.5-turbo\"), num_workers=8)\n", "\n", "# parse the markdown document\n", "nodes = node_parser.get_nodes_from_documents(documents)\n", "base_nodes, objects = node_parser.get_nodes_and_objects(nodes)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wn96-5zb21BV", "outputId": "537e44a1-1904-4561-e85a-8c8337d10883" }, "execution_count": 99, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "140it [00:00, 14999.94it/s]\n", " 0%| | 0/140 [00:00: RelatedNodeInfo(node_id='c048b20b-fd07-44df-b07b-462f0c623390', node_type=, metadata={'file_path': 'data/Meta_10k.pdf', 'file_name': 'Meta_10k.pdf', 'file_type': 'application/pdf', 'file_size': 2481466, 'creation_date': '2024-05-02', 'last_modified_date': '2024-02-02'}, hash='c3c23ef0027388046e3d5392d01653102bc476918c76dc19aa33c0b3b483a654'), : RelatedNodeInfo(node_id='4c310d89-5adc-4067-b4db-6d1ad0b9df13', node_type=, metadata={'table_df': \"{'Depreciation and amortization': {0: 'Right-of-use assets', 1: 'Total deferred tax liabilities', 2: 'Net deferred tax assets'}, '($8,320)': {0: '($2,708)', 1: '($11,028)', 2: '$4,864'}, '($6,296)': {0: '($2,555)', 1: '($8,851)', 2: '$4,946'}}\", 'table_summary': 'This table presents information on depreciation and amortization, right-of-use assets, total deferred tax liabilities, and net deferred tax assets.,\\nwith the following table title:\\nDepreciation and Amortization and Deferred Tax Assets Table,\\nwith the following columns:\\n- Depreciation and amortization: Values related to depreciation and amortization\\n- Right-of-use assets: Values related to right-of-use assets\\n- Total deferred tax liabilities: Values related to total deferred tax liabilities\\n- Net deferred tax assets: Values related to net deferred tax 
assets\\n', 'file_path': 'data/Meta_10k.pdf', 'file_name': 'Meta_10k.pdf', 'file_type': 'application/pdf', 'file_size': 2481466, 'creation_date': '2024-05-02', 'last_modified_date': '2024-02-02'}, hash='6ac748a27e957c3d882a4a9d07e8358891ba77874bbb5a1f4c0a10b25fc297fe'), : RelatedNodeInfo(node_id='ebc054f8-837f-4054-86f2-309fbf0802f0', node_type=, metadata={'col_schema': 'Column: Year Ended December 31\\nType: Numeric\\nSummary: The amounts of gross unrecognized tax benefits at the beginning and end of each year, along with increases and decreases related to prior and current year tax positions as well as settlements of prior year tax positions.', 'file_path': 'data/Meta_10k.pdf', 'file_name': 'Meta_10k.pdf', 'file_type': 'application/pdf', 'file_size': 2481466, 'creation_date': '2024-05-02', 'last_modified_date': '2024-02-02'}, hash='ae0f2e3cf0e4d13e3f030868d57328b8d952399b506069a3e34a6504ee834255')}, text='The valuation allowance was approximately $2.88 billion and $2.49 billion as of December 31, 2023 and 2022, respectively, primarily related to U.S. state tax credit carryforwards, U.S. foreign tax credits, unrealized losses in marketable securities, and certain foreign tax attributes for which we do not believe a tax benefit is more likely than not to be realized.\\n\\nAs of December 31, 2023, the U.S. federal and state net operating loss carryforwards were $200 million and $2.78 billion, which will begin to expire in 2035 and 2031, respectively, if not utilized. We have federal tax credit carryforwards of $490 million, which will begin to expire in 2029, if not utilized, and state tax credit carryforwards of $4.08 billion, most of which do not expire.\\n\\nUtilization of our net operating loss and tax credit carryforwards may be subject to substantial annual limitations due to the ownership change limitations provided by the Internal Revenue Code and similar state provisions. 
Such annual limitations could result in the expiration of the net operating loss and tax credit carryforwards before their utilization. The events that may cause ownership changes include, but are not limited to, a cumulative stock ownership change of greater than 50% over a three-year period.\\n\\nThe following table reflects changes in the gross unrecognized tax benefits (in millions):', start_char_idx=237138, end_char_idx=238508, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n')" ] }, "metadata": {}, "execution_count": 120 } ] }, { "cell_type": "code", "source": [ "base_nodes[128].text" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 122 }, "id": "cYPPBcsOrzqV", "outputId": "17b0b28c-9348-434b-b6fd-5ac9495f6fca" }, "execution_count": 121, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'The valuation allowance was approximately $2.88 billion and $2.49 billion as of December 31, 2023 and 2022, respectively, primarily related to U.S. state tax credit carryforwards, U.S. foreign tax credits, unrealized losses in marketable securities, and certain foreign tax attributes for which we do not believe a tax benefit is more likely than not to be realized.\\n\\nAs of December 31, 2023, the U.S. federal and state net operating loss carryforwards were $200 million and $2.78 billion, which will begin to expire in 2035 and 2031, respectively, if not utilized. We have federal tax credit carryforwards of $490 million, which will begin to expire in 2029, if not utilized, and state tax credit carryforwards of $4.08 billion, most of which do not expire.\\n\\nUtilization of our net operating loss and tax credit carryforwards may be subject to substantial annual limitations due to the ownership change limitations provided by the Internal Revenue Code and similar state provisions. 
Such annual limitations could result in the expiration of the net operating loss and tax credit carryforwards before their utilization. The events that may cause ownership changes include, but are not limited to, a cumulative stock ownership change of greater than 50% over a three-year period.\\n\\nThe following table reflects changes in the gross unrecognized tax benefits (in millions):'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 121 } ] }, { "cell_type": "code", "source": [ "len(objects)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "qLwTUZbxgSeo", "outputId": "9e077c07-a336-4c6f-ee48-e53362624a04" }, "execution_count": 122, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "140" ] }, "metadata": {}, "execution_count": 122 } ] }, { "cell_type": "code", "source": [ "objects[126]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "csa8rxbbgmvM", "outputId": "bf943ef7-69e4-4a69-f2e2-f870f7671798" }, "execution_count": 134, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "IndexNode(id_='c61cab3e-87f1-44c8-a80e-2abc77f79c27', embedding=None, metadata={'col_schema': 'Column: Depreciation and amortization\\nType: Numeric\\nSummary: Values related to depreciation and amortization\\n\\nColumn: Right-of-use assets\\nType: Numeric\\nSummary: Values related to right-of-use assets\\n\\nColumn: Total deferred tax liabilities\\nType: Numeric\\nSummary: Values related to total deferred tax liabilities\\n\\nColumn: Net deferred tax assets\\nType: Numeric\\nSummary: Values related to net deferred tax assets', 'file_path': 'data/Meta_10k.pdf', 'file_name': 'Meta_10k.pdf', 'file_type': 'application/pdf', 'file_size': 2481466, 'creation_date': '2024-05-02', 'last_modified_date': '2024-02-02'}, excluded_embed_metadata_keys=['col_schema'], excluded_llm_metadata_keys=[], relationships={: 
RelatedNodeInfo(node_id='c048b20b-fd07-44df-b07b-462f0c623390', node_type=, metadata={'file_path': 'data/Meta_10k.pdf', 'file_name': 'Meta_10k.pdf', 'file_type': 'application/pdf', 'file_size': 2481466, 'creation_date': '2024-05-02', 'last_modified_date': '2024-02-02'}, hash='c3c23ef0027388046e3d5392d01653102bc476918c76dc19aa33c0b3b483a654'), : RelatedNodeInfo(node_id='8b957ced-7a2b-4ecd-a286-1da9a1a6715e', node_type=, metadata={'file_path': 'data/Meta_10k.pdf', 'file_name': 'Meta_10k.pdf', 'file_type': 'application/pdf', 'file_size': 2481466, 'creation_date': '2024-05-02', 'last_modified_date': '2024-02-02'}, hash='f477f66812b416c834b1e402ec7525afcd744138eedca29eff7a25b960599b31'), : RelatedNodeInfo(node_id='4c310d89-5adc-4067-b4db-6d1ad0b9df13', node_type=, metadata={'table_df': \"{'Depreciation and amortization': {0: 'Right-of-use assets', 1: 'Total deferred tax liabilities', 2: 'Net deferred tax assets'}, '($8,320)': {0: '($2,708)', 1: '($11,028)', 2: '$4,864'}, '($6,296)': {0: '($2,555)', 1: '($8,851)', 2: '$4,946'}}\", 'table_summary': 'This table presents information on depreciation and amortization, right-of-use assets, total deferred tax liabilities, and net deferred tax assets.,\\nwith the following table title:\\nDepreciation and Amortization and Deferred Tax Assets Table,\\nwith the following columns:\\n- Depreciation and amortization: Values related to depreciation and amortization\\n- Right-of-use assets: Values related to right-of-use assets\\n- Total deferred tax liabilities: Values related to total deferred tax liabilities\\n- Net deferred tax assets: Values related to net deferred tax assets\\n', 'file_path': 'data/Meta_10k.pdf', 'file_name': 'Meta_10k.pdf', 'file_type': 'application/pdf', 'file_size': 2481466, 'creation_date': '2024-05-02', 'last_modified_date': '2024-02-02'}, hash='6ac748a27e957c3d882a4a9d07e8358891ba77874bbb5a1f4c0a10b25fc297fe')}, text='This table presents information on depreciation and amortization, right-of-use assets, 
total deferred tax liabilities, and net deferred tax assets.,\\nwith the following table title:\\nDepreciation and Amortization and Deferred Tax Assets Table,\\nwith the following columns:\\n- Depreciation and amortization: Values related to depreciation and amortization\\n- Right-of-use assets: Values related to right-of-use assets\\n- Total deferred tax liabilities: Values related to total deferred tax liabilities\\n- Net deferred tax assets: Values related to net deferred tax assets\\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n', index_id='4c310d89-5adc-4067-b4db-6d1ad0b9df13', obj=TextNode(id_='4c310d89-5adc-4067-b4db-6d1ad0b9df13', embedding=None, metadata={'table_df': \"{'Depreciation and amortization': {0: 'Right-of-use assets', 1: 'Total deferred tax liabilities', 2: 'Net deferred tax assets'}, '($8,320)': {0: '($2,708)', 1: '($11,028)', 2: '$4,864'}, '($6,296)': {0: '($2,555)', 1: '($8,851)', 2: '$4,946'}}\", 'table_summary': 'This table presents information on depreciation and amortization, right-of-use assets, total deferred tax liabilities, and net deferred tax assets.,\\nwith the following table title:\\nDepreciation and Amortization and Deferred Tax Assets Table,\\nwith the following columns:\\n- Depreciation and amortization: Values related to depreciation and amortization\\n- Right-of-use assets: Values related to right-of-use assets\\n- Total deferred tax liabilities: Values related to total deferred tax liabilities\\n- Net deferred tax assets: Values related to net deferred tax assets\\n', 'file_path': 'data/Meta_10k.pdf', 'file_name': 'Meta_10k.pdf', 'file_type': 'application/pdf', 'file_size': 2481466, 'creation_date': '2024-05-02', 'last_modified_date': '2024-02-02'}, excluded_embed_metadata_keys=['table_df', 'table_summary'], excluded_llm_metadata_keys=['table_df', 'table_summary'], relationships={: 
RelatedNodeInfo(node_id='c048b20b-fd07-44df-b07b-462f0c623390', node_type=, metadata={'file_path': 'data/Meta_10k.pdf', 'file_name': 'Meta_10k.pdf', 'file_type': 'application/pdf', 'file_size': 2481466, 'creation_date': '2024-05-02', 'last_modified_date': '2024-02-02'}, hash='c3c23ef0027388046e3d5392d01653102bc476918c76dc19aa33c0b3b483a654'), : RelatedNodeInfo(node_id='c61cab3e-87f1-44c8-a80e-2abc77f79c27', node_type=, metadata={'col_schema': 'Column: Depreciation and amortization\\nType: Numeric\\nSummary: Values related to depreciation and amortization\\n\\nColumn: Right-of-use assets\\nType: Numeric\\nSummary: Values related to right-of-use assets\\n\\nColumn: Total deferred tax liabilities\\nType: Numeric\\nSummary: Values related to total deferred tax liabilities\\n\\nColumn: Net deferred tax assets\\nType: Numeric\\nSummary: Values related to net deferred tax assets', 'file_path': 'data/Meta_10k.pdf', 'file_name': 'Meta_10k.pdf', 'file_type': 'application/pdf', 'file_size': 2481466, 'creation_date': '2024-05-02', 'last_modified_date': '2024-02-02'}, hash='0c82604c0dff64077628e855a8f7d6092fd3f504a49096d979342f3ef3fc3aff'), : RelatedNodeInfo(node_id='942c6da5-6ede-4257-84aa-751bf9325af2', node_type=, metadata={'file_path': 'data/Meta_10k.pdf', 'file_name': 'Meta_10k.pdf', 'file_type': 'application/pdf', 'file_size': 2481466, 'creation_date': '2024-05-02', 'last_modified_date': '2024-02-02'}, hash='3ab5054425d49720f40f1a66f9db9db2be2bd79777e2d6482f466e8bdb260842')}, text='This table presents information on depreciation and amortization, right-of-use assets, total deferred tax liabilities, and net deferred tax assets.,\\nwith the following table title:\\nDepreciation and Amortization and Deferred Tax Assets Table,\\nwith the following columns:\\n- Depreciation and amortization: Values related to depreciation and amortization\\n- Right-of-use assets: Values related to right-of-use assets\\n- Total deferred tax liabilities: Values related to total deferred tax 
liabilities\\n- Net deferred tax assets: Values related to net deferred tax assets\\n\\n|Depreciation and amortization|($8,320)|($6,296)|\\n|---|---|---|\\n|Right-of-use assets|($2,708)|($2,555)|\\n|Total deferred tax liabilities|($11,028)|($8,851)|\\n|Net deferred tax assets|$4,864|$4,946|\\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'))" ] }, "metadata": {}, "execution_count": 134 } ] }, { "cell_type": "code", "source": [ "objects[120].text" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "id": "vzwJSRxnuEOD", "outputId": "ad749c6c-897e-443d-b41e-182afa5428f5" }, "execution_count": 123, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "\"The table contains information about signatures, titles, and dates. The existing table id is 'id_1048'. The table should be kept.,\\nwith the following columns:\\n\"" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 123 } ] }, { "cell_type": "code", "source": [ "objects[120].obj.metadata" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "26664e65-b077-4023-d4b8-beeec5e0e751", "id": "nH8ta8ZEuW-W" }, "execution_count": 126, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'table_df': \"{'Name': {0: '[Signature Name 1]', 1: '[Signature Name 2]'}, 'Title': {0: '[Title 1]', 1: '[Title 2]'}, 'Date': {0: '[Date 1]', 1: '[Date 2]'}}\",\n", " 'table_summary': \"The table contains information about signatures, titles, and dates. The existing table id is 'id_1048'. 
The table should be kept.,\\nwith the following columns:\\n\",\n", " 'file_path': 'data/Meta_10k.pdf',\n", " 'file_name': 'Meta_10k.pdf',\n", " 'file_type': 'application/pdf',\n", " 'file_size': 2481466,\n", " 'creation_date': '2024-05-02',\n", " 'last_modified_date': '2024-02-02'}" ] }, "metadata": {}, "execution_count": 126 } ] }, { "cell_type": "code", "source": [ "# setup reranker\n", "cohere_rerank = CohereRerank(top_n=5)\n", "\n", "# create an index from the parsed markdown\n", "index = VectorStoreIndex(nodes=base_nodes+objects)\n", "\n", "# create a new query engine for the index\n", "recursive_query_engine = index.as_query_engine(\n", " similarity_top_k=15,\n", " node_postprocessors=[cohere_rerank],\n", " verbose=True\n", ")" ], "metadata": { "id": "KUE2IWwqpbDY" }, "execution_count": 135, "outputs": [] }, { "cell_type": "code", "source": [ "# query the engine\n", "query1 = \"What was the total value of 'Cash and cash equivalents' as of December 31, 2023?\"\n", "response = recursive_query_engine.query(query1)\n", "print(response)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "f511f92a-e9de-49f8-eba1-fff061b872c9", "id": "rfak5dxTcyVn" }, "execution_count": 136, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[1;3;38;2;11;159;203mRetrieval entering 590db1d7-47a6-444a-b315-97704bc04138: TextNode\n", "\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object TextNode with query What was the total value of 'Cash and cash equivalents' as of December 31, 2023?\n", "\u001b[0m\u001b[1;3;38;2;11;159;203mRetrieval entering 72475249-f252-4d2a-86a9-07d7ea48160b: TextNode\n", "\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object TextNode with query What was the total value of 'Cash and cash equivalents' as of December 31, 2023?\n", "\u001b[0m\u001b[1;3;38;2;11;159;203mRetrieval entering b9e72c08-6383-43e3-8bb9-0d6b3e60225a: TextNode\n", "\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object 
TextNode with query What was the total value of 'Cash and cash equivalents' as of December 31, 2023?\n", "\u001b[0m\u001b[1;3;38;2;11;159;203mRetrieval entering b8050b7f-1e3c-495b-b12d-5d16364edfdd: TextNode\n", "\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object TextNode with query What was the total value of 'Cash and cash equivalents' as of December 31, 2023?\n", "\u001b[0m\u001b[1;3;38;2;11;159;203mRetrieval entering 1df1d1ae-2a0c-4ca4-bbd4-847563a0fc91: TextNode\n", "\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object TextNode with query What was the total value of 'Cash and cash equivalents' as of December 31, 2023?\n", "\u001b[0m\u001b[1;3;38;2;11;159;203mRetrieval entering 9ed27db3-7d2e-4677-b915-204b55b84b77: TextNode\n", "\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object TextNode with query What was the total value of 'Cash and cash equivalents' as of December 31, 2023?\n", "\u001b[0m\u001b[1;3;38;2;11;159;203mRetrieval entering cecae625-4d5c-47e6-a779-56e32aa9b702: TextNode\n", "\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object TextNode with query What was the total value of 'Cash and cash equivalents' as of December 31, 2023?\n", "\u001b[0m\u001b[1;3;38;2;11;159;203mRetrieval entering 570dc16c-b47f-4708-93de-25b5ad128a22: TextNode\n", "\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object TextNode with query What was the total value of 'Cash and cash equivalents' as of December 31, 2023?\n", "\u001b[0mThe total value of 'Cash and cash equivalents' as of December 31, 2023, was $41,862 million.\n" ] } ] }, { "cell_type": "code", "source": [ "# query the engine\n", "query2 = \"Who are Meta's 'Directors' (i.e., members of the Board of Directors)?\"\n", "response = recursive_query_engine.query(query2)\n", "print(response)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "63d920f4-7926-4281-fe65-e6d6db37e48d", "id": "quC4L7LWcyVn" }, "execution_count": 143, "outputs": [ { "output_type": 
"stream", "name": "stdout", "text": [ "\u001b[1;3;38;2;11;159;203mRetrieval entering ac8366a1-9e82-4b92-8e74-3d577bcd8d0c: TextNode\n", "\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object TextNode with query Who are Meta's 'Directors' (i.e., members of the Board of Directors)?\n", "\u001b[0mThe members of Meta's Board of Directors are not explicitly mentioned in the provided context information.\n" ] } ] }, { "cell_type": "code", "source": [ "from google.colab import files\n", "parsed_md = files.upload()" ], "metadata": { "id": "cpLaKyKZiIfk", "colab": { "base_uri": "https://localhost:8080/", "height": 73 }, "outputId": "502eb9b5-fad2-4363-c0d9-77d746028514" }, "execution_count": 93, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", " \n", " \n", " Upload widget is only available when the cell has been executed in the\n", " current browser session. Please rerun this cell to enable.\n", " \n", " " ] }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "Saving output.md to output.md\n" ] } ] }, { "cell_type": "markdown", "source": [ "## RAG with Llama Parse + Recursive Query Engine + FlagEmbedding Reranker (powered by BAAI/bge-reranker-large)" ], "metadata": { "id": "FzsRbuujpEAp" } }, { "cell_type": "code", "source": [ "!pip install -qU llama-index-postprocessor-flag-embedding-reranker git+https://github.com/FlagOpen/FlagEmbedding.git" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "057yhUgark4r", "outputId": "fe0e5042-f0da-4542-8e7d-6049af350fdc" }, "execution_count": 41, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m542.0/542.0 kB\u001b[0m \u001b[31m7.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m297.6/297.6 kB\u001b[0m \u001b[31m26.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m171.5/171.5 kB\u001b[0m \u001b[31m18.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m14.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m25.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m18.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m388.9/388.9 kB\u001b[0m \u001b[31m38.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Building wheel for FlagEmbedding (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n" ] } ] }, { "cell_type": "code", "source": [ "# setup default llm and embedding model for the rag pipeline\n", "from llama_index.llms.openai import OpenAI\n", "from llama_index.embeddings.openai import OpenAIEmbedding\n", "from llama_index.core import Settings\n", "\n", "Settings.llm = OpenAI(model=\"gpt-3.5-turbo\")\n", "Settings.embed_model = OpenAIEmbedding(model=\"text-embedding-3-small\")" ], "metadata": { "id": "TzgamDtZpEA1" }, "execution_count": 39, "outputs": [] }, { "cell_type": "code", "source": [ "# import dependencies\n", "from llama_parse import LlamaParse\n", "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n", "from llama_index.core.node_parser import MarkdownElementNodeParser\n", "\n", "parsing_instruction = \"\"\"The provided document is an annual report filed by Meta Platforms,\n", " Inc. with the Securities and Exchange Commission (SEC).\n", " This form provides detailed financial information about the company's performance for a specific year.\n", " It includes unaudited financial statements, management discussion and analysis, and other relevant disclosures required by the SEC.\n", " It contains many tables.\n", " Try to be precise while answering the questions\"\"\"\n", "\n", "# setup parser\n", "parser = LlamaParse(\n", " result_type=\"markdown\",\n", " parsing_instruction=parsing_instruction\n", ")\n", "\n", "# load and parse the document\n", "file_extractor = {\".pdf\": parser}\n", "documents = SimpleDirectoryReader(\n", " input_files=['data/Meta_10k.pdf'],\n", " file_extractor=file_extractor\n", ").load_data()\n", "\n", "# setup markdown node parser\n", "node_parser = MarkdownElementNodeParser(llm=OpenAI(model=\"gpt-3.5-turbo\"), num_workers=8)\n", "\n", "# parse the markdown document\n", "nodes = node_parser.get_nodes_from_documents(documents)\n", "base_nodes, objects = node_parser.get_nodes_and_objects(nodes)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "outputId": 
"bc8e7e7a-f43b-4a23-db25-ce229e28add9", "id": "ZsCBl484pEA1" }, "execution_count": 40, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Started parsing the file under job_id 4afdad10-d3c2-4cc9-9dd2-d5678e00a872\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "65it [00:00, 19248.08it/s]\n", " 0%| | 0/65 [00:00