{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "collapsed_sections": [ "iygCo-AveBAp" ] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU", "gpuClass": "standard", "widgets": { "application/vnd.jupyter.widget-state+json": { "d27f2d2c41cd4be8a9aa9a6c2af13fb9": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_11025864fbeb430a893b1de80faabcf9", "IPY_MODEL_200671d2a7c247e0a01442eb125d5ce1", "IPY_MODEL_435848b38f7044de8e83eb42f52e9229" ], "layout": "IPY_MODEL_8fa288aef0ad48cb808283829b085412" } }, "11025864fbeb430a893b1de80faabcf9": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_de073c24ab684b4398c1bf087e98c149", "placeholder": "​", "style": "IPY_MODEL_4707bd9060944115890cffcb1c04c997", "value": "Downloading: 100%" } }, "200671d2a7c247e0a01442eb125d5ce1": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_671e4645df0d41cc9694017c080708a3", "max": 665, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_765955da0dda49b29b716228a2b4ddf8", "value": 665 } }, "435848b38f7044de8e83eb42f52e9229": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b16a39477ca64800bf248a37f3dcfd2e", "placeholder": "​", "style": "IPY_MODEL_cf59a8595c884352a8296abcf38f5156", "value": " 665/665 [00:00<00:00, 25.1kB/s]" } }, "8fa288aef0ad48cb808283829b085412": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "de073c24ab684b4398c1bf087e98c149": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "4707bd9060944115890cffcb1c04c997": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "671e4645df0d41cc9694017c080708a3": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "765955da0dda49b29b716228a2b4ddf8": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "b16a39477ca64800bf248a37f3dcfd2e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "cf59a8595c884352a8296abcf38f5156": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "c0706b24a01e48b0adc722fd01152c53": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_1c0ee2a664d64c2bb269b12b7e2a39f7", "IPY_MODEL_9dc23485fb6b488da5e8f59db8512227", "IPY_MODEL_6699d1dc6d034aa28ddf29dcc666ef3d" ], "layout": "IPY_MODEL_1627eddbc0c741239b6e81734b6c1797" } }, "1c0ee2a664d64c2bb269b12b7e2a39f7": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_429ef15df30d4e3e9096297d09a56317", "placeholder": "​", "style": "IPY_MODEL_4f673f201aa84e8c9bf9daca8b3ca7e4", "value": "Downloading: 100%" } }, "9dc23485fb6b488da5e8f59db8512227": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_a938011e05c04cb2a95e98461621aff8", "max": 1042301, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_11ebe9e92fa547a6b90534524bf26216", "value": 1042301 } }, "6699d1dc6d034aa28ddf29dcc666ef3d": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b7b4ad2048b0408b976e24bf43923c8a", "placeholder": "​", "style": "IPY_MODEL_1602a2cf6cc841f4a186f4000cf322a2", "value": " 1.04M/1.04M [00:00<00:00, 1.11MB/s]" } }, "1627eddbc0c741239b6e81734b6c1797": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "429ef15df30d4e3e9096297d09a56317": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "4f673f201aa84e8c9bf9daca8b3ca7e4": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "a938011e05c04cb2a95e98461621aff8": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "11ebe9e92fa547a6b90534524bf26216": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "b7b4ad2048b0408b976e24bf43923c8a": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "1602a2cf6cc841f4a186f4000cf322a2": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "66f5232c274846718e559ccf1f51136a": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_b43c9343b8b34d988a3f61eb35b4bcb4", "IPY_MODEL_c7ff7c3e69f14dcfaa17a298dbc8f960", "IPY_MODEL_43c09ba7df694bfe962f2bc97643e709" ], "layout": "IPY_MODEL_9e3a41f9a95a4d2fba31ad6c59bbe39f" } }, "b43c9343b8b34d988a3f61eb35b4bcb4": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_00129ca9392d479389870f953ee21114", "placeholder": "​", "style": "IPY_MODEL_55ff136de2e74e30b940d6bcead779b7", "value": "Downloading: 100%" } }, "c7ff7c3e69f14dcfaa17a298dbc8f960": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_78334aa6adbc4c8393b105901c476459", "max": 456318, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_af51e32471554a4e965fbbc570e9c464", "value": 456318 } }, "43c09ba7df694bfe962f2bc97643e709": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_08321d2b90d645aa89c697ee96ddc17a", "placeholder": "​", "style": "IPY_MODEL_387b5e274f304d38825e84b6a46e884e", "value": " 456k/456k [00:00<00:00, 1.05MB/s]" } }, "9e3a41f9a95a4d2fba31ad6c59bbe39f": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "00129ca9392d479389870f953ee21114": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "55ff136de2e74e30b940d6bcead779b7": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "78334aa6adbc4c8393b105901c476459": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "af51e32471554a4e965fbbc570e9c464": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "08321d2b90d645aa89c697ee96ddc17a": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "387b5e274f304d38825e84b6a46e884e": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "e8a4e1e0c7334684bfb36c6eefee8bb3": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_96d393d7106b4dd08840c284998ae9f4", "IPY_MODEL_4ff04b8ae6d2456ca0be2280454cb2c7", "IPY_MODEL_298c5f7f1deb49309862093974301d16" ], "layout": "IPY_MODEL_3cf6eb1885734b549eba98918802632a" } }, "96d393d7106b4dd08840c284998ae9f4": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_350463df10b74730bef0bfbd3c3ced79", "placeholder": "​", "style": "IPY_MODEL_561e57826b0f442fb3d4def7ae5843ae", "value": "Downloading: 100%" } }, "4ff04b8ae6d2456ca0be2280454cb2c7": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_aae93142cdf6434cbbcca9cb8a5111fe", "max": 1355256, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_9bc27535b6ce4a509eaedd57430574ea", "value": 1355256 } }, "298c5f7f1deb49309862093974301d16": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_cea7abb1d8544c57b769a4f4381a5069", "placeholder": "​", "style": "IPY_MODEL_ce0cc3eb80014b909b8c96672184eca2", "value": " 1.36M/1.36M [00:00<00:00, 1.41MB/s]" } }, "3cf6eb1885734b549eba98918802632a": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "350463df10b74730bef0bfbd3c3ced79": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "561e57826b0f442fb3d4def7ae5843ae": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "aae93142cdf6434cbbcca9cb8a5111fe": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "9bc27535b6ce4a509eaedd57430574ea": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "cea7abb1d8544c57b769a4f4381a5069": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "ce0cc3eb80014b909b8c96672184eca2": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "56da3a98b0574eeb9ad67adfa4edad13": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_7ada9dc05146420a9e195c1d766f612c", "IPY_MODEL_04a73d69850e41db98de0cfdb37e5374", "IPY_MODEL_f35f3043a6a34e549f25ceccc1153778" ], "layout": "IPY_MODEL_5fc8f14e4d834eaba0e74728702c3c57" } }, "7ada9dc05146420a9e195c1d766f612c": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_5463b5b673004ea081304e058ec80f56", "placeholder": "​", "style": "IPY_MODEL_9e71ef47c09f4cbb9999c4a50f5df962", "value": "Downloading: 100%" } }, "04a73d69850e41db98de0cfdb37e5374": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_531137e7bbad40749e1f7e1a1d9d4528", "max": 548118077, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_54ad9667bc8b49ae9c966f58fce35d5d", "value": 548118077 } }, "f35f3043a6a34e549f25ceccc1153778": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_5026e7aa6f2640b7bbc3218408de5b40", "placeholder": "​", "style": "IPY_MODEL_f30b04de150841738b3d5de82c178d7d", "value": " 548M/548M [00:16<00:00, 34.2MB/s]" } }, "5fc8f14e4d834eaba0e74728702c3c57": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "5463b5b673004ea081304e058ec80f56": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "9e71ef47c09f4cbb9999c4a50f5df962": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "531137e7bbad40749e1f7e1a1d9d4528": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "54ad9667bc8b49ae9c966f58fce35d5d": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "5026e7aa6f2640b7bbc3218408de5b40": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "f30b04de150841738b3d5de82c178d7d": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } } } }, "cells": [ { "cell_type": "markdown", "source": [ "# Start with downloading & converting PDF transcripts to txt files for the models to read" ], "metadata": { "id": "iygCo-AveBAp" } }, { "cell_type": "code", "source": [ "!pip install pdfminer.six" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "fiPNMmk1e1gw", "outputId": "1542bb66-e9cc-4b3e-ffc4-f499936ca598" }, "execution_count": 1, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Requirement already satisfied: pdfminer.six in /usr/local/lib/python3.9/dist-packages (20221105)\n", "Requirement already satisfied: cryptography>=36.0.0 in /usr/local/lib/python3.9/dist-packages (from pdfminer.six) (39.0.2)\n", "Requirement already satisfied: charset-normalizer>=2.0.0 in /usr/local/lib/python3.9/dist-packages (from pdfminer.six) (3.1.0)\n", "Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.9/dist-packages (from cryptography>=36.0.0->pdfminer.six) (1.15.1)\n", "Requirement already satisfied: pycparser in /usr/local/lib/python3.9/dist-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six) (2.21)\n" ] } ] }, { "cell_type": "code", "source": [ "from bs4 import BeautifulSoup" ], "metadata": { "id": "pwritVhEgkam" }, "execution_count": 2, "outputs": [] }, { "cell_type": "code", "execution_count": 3, "metadata": { "id": "vC3veizYd9u2" }, "outputs": [], "source": [ "import requests\n", "import io\n", "import os\n", "from pdfminer.high_level import extract_text" ] }, { "cell_type": "code", "source": [ "# URL of the website with the PDF files\n", "url = 'https://readthatpodcast.com/'" ], "metadata": { "id": "uZzi40_2eLN-" }, "execution_count": 4, "outputs": [] }, { "cell_type": "code", "source": [ "# Create a new folder on desktop to save PDF files\n", "desktop_path = os.path.join(os.path.expanduser(\"~\"), \"Desktop\")\n", "folder = os.path.join(desktop_path, \"hubermantranscript1\")\n", "if not os.path.exists(folder):\n", " os.makedirs(folder)" ], "metadata": { "id": "eQYJFuW1eNxv" }, "execution_count": 5, "outputs": [] }, { "cell_type": "code", "source": [ "# Download all the PDF files from the website\n", "response = requests.get('https://readthatpodcast.com')\n", "soup = BeautifulSoup(response.content, 'html.parser')\n", "for link in soup.find_all('a', href=True):\n", " href = link['href']\n", " if 'pdf' in href:\n", " episode_name = link.text.strip()\n", " filename = f\"{episode_name}.pdf\"\n", " file_path = os.path.join(folder, filename)\n", " with open(file_path, 'wb') as f:\n", " f.write(requests.get(url + '/' + href).content)" ], "metadata": { "id": "DHqF1U7mjSmY" }, "execution_count": 6, "outputs": [] }, { "cell_type": "code", "source": [ "# Convert the PDF files to text\n", "for filename in os.listdir(folder):\n", " if filename.endswith('.pdf'):\n", " file_path = os.path.join(folder, filename)\n", " with open(file_path, 'rb') as f:\n", " text = extract_text(io.BytesIO(f.read()))\n", " text_path = os.path.join(folder, filename.replace('.pdf', '.txt'))\n", " with open(text_path, 'w') as f:\n", " f.write(text)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 395 }, "id": "kM09gzR4eWTv", "outputId": "d884308c-da99-4937-ebc8-26f1a1487243" }, "execution_count": 19, "outputs": [ { "output_type": "error", "ename": "PDFSyntaxError", "evalue": "ignored", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mPDFSyntaxError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mfile_path\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfolder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mtext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mextract_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mBytesIO\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0mtext_path\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfolder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'.pdf'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'.txt'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'w'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.9/dist-packages/pdfminer/high_level.py\u001b[0m in \u001b[0;36mextract_text\u001b[0;34m(pdf_file, password, page_numbers, maxpages, caching, codec, laparams)\u001b[0m\n\u001b[1;32m 166\u001b[0m \u001b[0minterpreter\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mPDFPageInterpreter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrsrcmgr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 167\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 168\u001b[0;31m for page in PDFPage.get_pages(\n\u001b[0m\u001b[1;32m 169\u001b[0m \u001b[0mfp\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 170\u001b[0m \u001b[0mpage_numbers\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.9/dist-packages/pdfminer/pdfpage.py\u001b[0m in \u001b[0;36mget_pages\u001b[0;34m(cls, fp, pagenos, maxpages, password, caching, check_extractable)\u001b[0m\n\u001b[1;32m 149\u001b[0m \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mPDFParser\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfp\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 150\u001b[0m \u001b[0;31m# Create a PDF document object that stores the document structure.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 151\u001b[0;31m \u001b[0mdoc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mPDFDocument\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparser\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpassword\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpassword\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcaching\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcaching\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 152\u001b[0m \u001b[0;31m# Check if the document allows text extraction.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 153\u001b[0m \u001b[0;31m# If not, warn the user and proceed.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.9/dist-packages/pdfminer/pdfdocument.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, parser, password, caching, fallback)\u001b[0m\n\u001b[1;32m 750\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 751\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 752\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mPDFSyntaxError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"No /Root object! - Is this really a PDF?\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 753\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcatalog\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Type\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mLITERAL_CATALOG\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 754\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0msettings\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSTRICT\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mPDFSyntaxError\u001b[0m: No /Root object! - Is this really a PDF?" ] } ] }, { "cell_type": "code", "source": [ "os.listdir(folder)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "pYvtWL1NfUSm", "outputId": "39adec19-bead-4768-a5a2-d5b929d4b5bc" }, "execution_count": 20, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['73 Dr Wendy Suzuki Boost Attention & Memory with ScienceBased Tools\\r\\n Huberman Lab Podcast 73.pdf',\n", " '66 Using Deliberate Cold Exposure for Health and Performance Huberman\\r\\n Lab Podcast 66.pdf',\n", " '79 Jeff Cavaliere Optimize Your Exercise Program with ScienceBased\\r\\n Tools Huberman Lab Podcast 79.txt',\n", " '41 Effects of Fasting & Time Restricted Eating on Fat Loss & Health\\r\\n Huberman Lab Podcast 41.pdf',\n", " '78 The Science & Treatment of Obsessive Compulsive Disorder (OCD)\\r\\n Huberman Lab Podcast 78.pdf',\n", " '40 Dr Craig Heller Using Temperature for Performance Brain & Body\\r\\n Health Huberman Lab Podcast 40.txt',\n", " '112 Dr Andy Galpin How to Build Physical Endurance & Lose Fat\\r\\n Huberman Lab Guest Series.pdf',\n", " '118 Dr Andy Galpin Optimal Nutrition & Supplementation for Fitness\\r\\n Huberman Lab Guest Series.txt',\n", " '11 How Foods and Nutrients Control Our Moods Huberman Lab Podcast\\r\\n 11.txt',\n", " '32 How to Control Your Sense of Pain & Pleasure Huberman Lab Podcast\\r\\n 32.txt',\n", " '69 The Science & Health Benefits of Deliberate Heat Exposure Huberman\\r\\n Lab Podcast 69.pdf',\n", " '66 Using Deliberate Cold Exposure for Health and Performance Huberman\\r\\n Lab Podcast 66.txt',\n", " '59 The Science of Love Desire and Attachment Huberman Lab Podcast\\r\\n 59.txt',\n", " 'OR LIVE EVENT Q&A Dr Andrew Huberman Question & Answer in Portland\\r\\n OR.pdf',\n", " '106 Developing a Rational Approach to Supplementation for Health &\\r\\n Performance Huberman Lab Podcast 106.pdf',\n", " '83 Dr Emily Balcetis Tools for Setting & Achieving Goals Huberman Lab\\r\\n Podcast 83.txt',\n", " '69 The Science & Health Benefits of Deliberate Heat Exposure Huberman\\r\\n Lab Podcast 69.txt',\n", " '06 How to Focus to Change Your Brain Huberman Lab Podcast 6.pdf',\n", " '11 How Foods and Nutrients Control Our Moods Huberman Lab Podcast\\r\\n 11.pdf',\n", " '115 Dr Gina Poe Use Sleep to Enhance Learning Memory & Emotional\\r\\n State Huberman Lab Podcast.txt',\n", " '24 The Science of Vision Eye Health & Seeing Better Huberman Lab\\r\\n Podcast 24.pdf',\n", " '80 Optimize & Control Your Brain Chemistry to Improve Health &\\r\\n Performance Huberman Lab Podcast 80.pdf',\n", " '80 Optimize & Control Your Brain Chemistry to Improve Health &\\r\\n Performance Huberman Lab Podcast 80.txt',\n", " '65 Dr Andy Galpin How to Build Strength Muscle Size & Endurance\\r\\n Huberman Lab Podcast 65.pdf',\n", " '67 Dr Kyle Gillett How to Optimize Your Hormones for Health &\\r\\n Vitality Huberman Lab Podcast 67.pdf',\n", " '12 How to Increase Motivation & Drive Huberman Lab Podcast 12.txt',\n", " '101 Using Caffeine to Optimize Mental & Physical Performance Huberman\\r\\n Lab Podcast 101.txt',\n", " '91 Dr Casey Halpern Biology & Treatments for Compulsive Eating &\\r\\n Behaviors Huberman Lab Podcast 91.pdf',\n", " 'OR LIVE EVENT Q&A Dr Andrew Huberman Question & Answer in Portland\\r\\n OR.txt',\n", " '21 How to Lose Fat with ScienceBased Tools Huberman Lab Podcast 21.txt',\n", " '75 Dr Paul Conti Therapy Treating Trauma & Other Life Challenges\\r\\n Huberman Lab Podcast 75.txt',\n", " '89 Dr David Anderson The Biology of Aggression Mating & Arousal\\r\\n Huberman Lab Podcast 89.pdf',\n", " '49 Erasing Fears & Traumas Based on the Modern Neuroscience of Fear\\r\\n Huberman Lab Podcast 49.pdf',\n", " '57 Optimizing Workspace for Productivity Focus & Creativity Huberman\\r\\n Lab Podcast 57.txt',\n", " '59 The Science of Love Desire and Attachment Huberman Lab Podcast\\r\\n 59.pdf',\n", " '75 Dr Paul Conti Therapy Treating Trauma & Other Life Challenges\\r\\n Huberman Lab Podcast 75.pdf',\n", " '19 Supercharge Exercise Performance & Recovery with Cooling Huberman\\r\\n Lab Podcast 19.pdf',\n", " '83 Dr Emily Balcetis Tools for Setting & Achieving Goals Huberman Lab\\r\\n Podcast 83.pdf',\n", " '48 Dr David Buss How Humans Select & Keep Romantic Partners in Short\\r\\n & Long Term Huberman Lab 48.txt',\n", " '07 Using Failures Movement & Balance to Learn Faster Huberman Lab\\r\\n Podcast 7.pdf',\n", " '32 How to Control Your Sense of Pain & Pleasure Huberman Lab Podcast\\r\\n 32.pdf',\n", " '93 Dr Nolan Williams Psychedelics & Neurostimulation for Brain\\r\\n Rewiring Huberman Lab Podcast 93.txt',\n", " '37 ADHD & How Anyone Can Improve Their Focus Huberman Lab Podcast\\r\\n 37.pdf',\n", " '84 Sleep Toolkit Tools for Optimizing Sleep & SleepWake Timing\\r\\n Huberman Lab Podcast 84.pdf',\n", " '19 Supercharge Exercise Performance & Recovery with Cooling Huberman\\r\\n Lab Podcast 19.txt',\n", " '08 Optimize Your Learning & Creativity with Sciencebased Tools\\r\\n Huberman Lab Podcast 8.txt',\n", " '23 How To Build Endurance In Your Brain & Body Huberman Lab Podcast\\r\\n 23.pdf',\n", " '72 Understand & Improve Memory Using ScienceBased Tools Huberman Lab\\r\\n Podcast 72.pdf',\n", " '65 Dr Andy Galpin How to Build Strength Muscle Size & Endurance\\r\\n Huberman Lab Podcast 65.txt',\n", " '40 Dr Craig Heller Using Temperature for Performance Brain & Body\\r\\n Health Huberman Lab Podcast 40.pdf',\n", " '48 Dr David Buss How Humans Select & Keep Romantic Partners in Short\\r\\n & Long Term Huberman Lab 48.pdf',\n", " '71 Understanding & Controlling Aggression Huberman Lab Podcast 71.txt',\n", " '108 Dr Andy Galpin How to Assess & Improve All Aspects of Your\\r\\n Fitness Huberman Lab Guest Series.pdf',\n", " '35 Dr Robert Sapolsky Science of Stress Testosterone & Free Will\\r\\n Huberman Lab Podcast 35.pdf',\n", " '12 How to Increase Motivation & Drive Huberman Lab Podcast 12.pdf',\n", " '54 Dr Jack Feldman Breathing for Mental & Physical Health &\\r\\n Performance Huberman Lab Podcast 54.pdf',\n", " '28 Maximizing Productivity Physical & Mental Health with Daily Tools\\r\\n Huberman Lab Podcast 28.txt',\n", " '104 Jocko Willink How to Become Resilient Forge Your Identity & Lead\\r\\n Others Huberman Lab Podcast 104.pdf',\n", " '87 Dr Erich Jarvis The Neuroscience of Speech Language & Music\\r\\n Huberman Lab Podcast 87.txt',\n", " '15 The Science of How to Optimize Testosterone & Estrogen Huberman\\r\\n Lab Podcast 15.pdf',\n", " '06 How to Focus to Change Your Brain Huberman Lab Podcast 6.txt',\n", " '98 ScienceBased Tools for Increasing Happiness Huberman Lab Podcast\\r\\n 98.txt',\n", " '71 Understanding & Controlling Aggression Huberman Lab Podcast 71.pdf',\n", " 'AMA 4 Maintain Motivation Improve REM Sleep Set Goals Manage Anxiety\\r\\n & More.pdf',\n", " '47 The Science of Gratitude & How to Build a Gratitude Practice\\r\\n Huberman Lab Podcast 47.pdf',\n", " '25 How Smell Taste & PheromoneLike Chemicals Control You Huberman Lab\\r\\n Podcast 25.txt',\n", " '87 Dr Erich Jarvis The Neuroscience of Speech Language & Music\\r\\n Huberman Lab Podcast 87.pdf',\n", " '18 Using Cortisol & Adrenaline to Boost Our Energy & Immune System\\r\\n Function Huberman Lab Podcast 18.pdf',\n", " '21 How to Lose Fat with ScienceBased Tools Huberman Lab Podcast 21.pdf',\n", " '31 Dr Matthew Walker The Science & Practice of Perfecting Your Sleep\\r\\n Huberman Lab Podcast 31.pdf',\n", " '113 How to Stop Headaches Using Science Based Approaches Huberman Lab\\r\\n Podcast.pdf',\n", " '113 How to Stop Headaches Using Science Based Approaches Huberman Lab\\r\\n Podcast.txt',\n", " '102 Dr Kyle Gillett Tools for Hormone Optimization in Males Huberman\\r\\n Lab Podcast 102.pdf',\n", " '37 ADHD & How Anyone Can Improve Their Focus Huberman Lab Podcast\\r\\n 37.txt',\n", " '57 Optimizing Workspace for Productivity Focus & Creativity Huberman\\r\\n Lab Podcast 57.pdf',\n", " '67 Dr Kyle Gillett How to Optimize Your Hormones for Health &\\r\\n Vitality Huberman Lab Podcast 67.txt',\n", " '24 The Science of Vision Eye Health & Seeing Better Huberman Lab\\r\\n Podcast 24.txt',\n", " '116 Dr Andy Galpin Maximize Recovery to Achieve Fitness & Performance\\r\\n Goals Huberman Lab.pdf',\n", " '41 Effects of Fasting & Time Restricted Eating on Fat Loss & Health\\r\\n Huberman Lab Podcast 41.txt',\n", " '17 How to Control Your Metabolism by Thyroid & Growth Hormone\\r\\n Huberman Lab Podcast 17.pdf',\n", " '70 Dr Rhonda Patrick Micronutrients for Health & Longevity Huberman\\r\\n Lab Podcast 70.txt',\n", " '104 Jocko Willink How to Become Resilient Forge Your Identity & Lead\\r\\n Others Huberman Lab Podcast 104.txt',\n", " '115 Dr Gina Poe Use Sleep to Enhance Learning Memory & Emotional\\r\\n State Huberman Lab Podcast.pdf',\n", " '116 Dr Andy Galpin Maximize Recovery to Achieve Fitness & Performance\\r\\n Goals Huberman Lab.txt',\n", " '23 How To Build Endurance In Your Brain & Body Huberman Lab Podcast\\r\\n 23.txt',\n", " '44 Using Your Nervous System to Enhance Your Immune System Huberman\\r\\n Lab Podcast 44.pdf',\n", " '17 How to Control Your Metabolism by Thyroid & Growth Hormone\\r\\n Huberman Lab Podcast 17.txt',\n", " 'AMA 4 Maintain Motivation Improve REM Sleep Set Goals Manage Anxiety\\r\\n & More.txt',\n", " '118 Dr Andy Galpin Optimal Nutrition & Supplementation for Fitness\\r\\n Huberman Lab Guest Series.pdf',\n", " '85 Dr Peter Attia Exercise Nutrition Hormones for Vitality &\\r\\n Longevity Huberman Lab Podcast 85.pdf',\n", " '63 Using Salt to Optimize Mental & Physical Performance Huberman Lab\\r\\n Podcast 63.txt',\n", " '91 Dr Casey Halpern Biology & Treatments for Compulsive Eating &\\r\\n Behaviors Huberman Lab Podcast 91.txt',\n", " '97 Dr Layne Norton The Science of Eating for Health Fat Loss & Lean\\r\\n Muscle Huberman Lab Podcast 97.pdf',\n", " '120 How to Optimize Your Water Quality & Intake for Health Huberman\\r\\n Lab Podcast.pdf',\n", " '46 Time Perception & Entrainment by Dopamine Serotonin & Hormones\\r\\n Huberman Lab Podcast 46.pdf',\n", " '93 Dr Nolan Williams Psychedelics & Neurostimulation for Brain\\r\\n Rewiring Huberman Lab Podcast 93.pdf',\n", " '102 Dr Kyle Gillett Tools for Hormone Optimization in Males Huberman\\r\\n Lab Podcast 102.txt',\n", " '25 How Smell Taste & PheromoneLike Chemicals Control You Huberman Lab\\r\\n Podcast 25.pdf',\n", " '31 Dr Matthew Walker The Science & Practice of Perfecting Your Sleep\\r\\n Huberman Lab Podcast 31.txt',\n", " '08 Optimize Your Learning & Creativity with Sciencebased Tools\\r\\n Huberman Lab Podcast 8.pdf',\n", " '70 Dr Rhonda Patrick Micronutrients for Health & Longevity Huberman\\r\\n Lab Podcast 70.pdf',\n", " '101 Using Caffeine to Optimize Mental & Physical Performance Huberman\\r\\n Lab Podcast 101.pdf',\n", " '63 Using Salt to Optimize Mental & Physical Performance Huberman Lab\\r\\n Podcast 63.pdf',\n", " '13 The Science of Emotions & Relationships Huberman Lab Podcast 13.pdf',\n", " '28 Maximizing Productivity Physical & Mental Health with Daily Tools\\r\\n Huberman Lab Podcast 28.pdf',\n", " '10 Tools for Managing Stress & Anxiety Huberman Lab Podcast 10.pdf',\n", " '15 The Science of How to Optimize Testosterone & Estrogen Huberman\\r\\n Lab Podcast 15.txt',\n", " '98 ScienceBased Tools for Increasing Happiness Huberman Lab Podcast\\r\\n 98.pdf',\n", " '79 Jeff Cavaliere Optimize Your Exercise Program with ScienceBased\\r\\n Tools Huberman Lab Podcast 79.pdf',\n", " '105 Dr Sam Harris Using Meditation to Focus View Consciousness &\\r\\n Expand Your Mind Huberman Lab 105.pdf',\n", " '44 Using Your Nervous System to Enhance Your Immune System Huberman\\r\\n Lab Podcast 44.txt',\n", " '85 Dr Peter Attia Exercise Nutrition Hormones for Vitality &\\r\\n Longevity Huberman Lab Podcast 85.txt',\n", " 'How%20to%20Optimize%20Your%20Water%20Quality%20&%20Intake%20for%20Health%20Huberman%20Lab%20Podcast.pdf',\n", " '33 Dr Anna Lembke Understanding & Treating Addiction Huberman Lab\\r\\n Podcast 33.pdf',\n", " '103 The Science of Creativity & How to Enhance Creative Innovation\\r\\n Huberman Lab Podcast 103.pdf',\n", " '88 Focus Toolkit Tools to Improve Your Focus & Concentration Huberman\\r\\n Lab Podcast 88.pdf',\n", " '95 Dr Eddie Chang The Science of Learning & Speaking Languages\\r\\n Huberman Lab Podcast 95.pdf',\n", " '100 Dr Lex Fridman Navigating Conflict Finding Purpose & Maintaining\\r\\n Drive Huberman Lab Podcast 100.pdf',\n", " '04 Find Your Temperature Minimum to Defeat Jetlag Shift Work &\\r\\n Sleeplessness Huberman Lab Podcast 4.pdf',\n", " '13 The Science of Emotions & Relationships Huberman Lab Podcast 13.txt',\n", " '81 Dr Charles Zuker The Biology of Taste Perception & Sugar Craving\\r\\n Huberman Lab Podcast 81.pdf',\n", " '74 The Science & Process of Healing from Grief Huberman Lab Podcast\\r\\n 74.pdf',\n", " '07 Using Failures Movement & Balance to Learn Faster Huberman Lab\\r\\n Podcast 7.txt',\n", " '97 Dr Layne Norton The Science of Eating for Health Fat Loss & Lean\\r\\n Muscle Huberman Lab Podcast 97.txt',\n", " '109 How to Optimize Fertility in Males & Females Huberman Lab\\r\\n Podcast.pdf',\n", " '18 Using Cortisol & Adrenaline to Boost Our Energy & Immune System\\r\\n Function Huberman Lab Podcast 18.txt',\n", " '56 Dr Alia Crum Science of Mindsets for Health & Performance Huberman\\r\\n Lab Podcast 56.pdf',\n", " '10 Tools for Managing Stress & Anxiety Huberman Lab Podcast 10.txt',\n", " '89 Dr David Anderson The Biology of Aggression Mating & Arousal\\r\\n Huberman Lab Podcast 89.txt',\n", " '73 Dr Wendy Suzuki Boost Attention & Memory with ScienceBased Tools\\r\\n Huberman Lab Podcast 73.txt',\n", " '117 How to Breathe Correctly for Optimal Health Mood Learning &\\r\\n Performance Huberman Lab Podcast.pdf',\n", " '52 Dr David Sinclair The Biology of Slowing & Reversing Aging\\r\\n Huberman Lab Podcast 52.pdf',\n", " '39 Controlling Your Dopamine For Motivation Focus & Satisfaction\\r\\n Huberman Lab Podcast 39.pdf',\n", " '62 Dr Justin Sonnenburg How to Build Maintain & Repair Gut Health\\r\\n Huberman Lab Podcast 62.pdf',\n", " '120 How to Optimize Your Water Quality & Intake for Health Huberman\\r\\n Lab Podcast.txt',\n", " '99 Dr Chris Palmer Diet & Nutrition for Mental Health Huberman Lab\\r\\n Podcast 99.pdf',\n", " '34 Understanding & Conquering Depression Huberman Lab Podcast 34.pdf',\n", " '58 Using Play to Rewire & Improve Your Brain Huberman Lab Podcast\\r\\n 58.pdf',\n", " '72 Understand & Improve Memory Using ScienceBased Tools Huberman Lab\\r\\n Podcast 72.txt',\n", " '53 The Science of Making & Breaking Habits Huberman Lab Podcast 53.pdf',\n", " '110 Dr Andy Galpin Optimal Protocols to Build Strength & Grow Muscles\\r\\n Huberman Lab Guest Series.pdf',\n", " '45 Dr Duncan French How to Exercise for Strength Gains & Hormone\\r\\n Optimization Huberman Lab 45.pdf',\n", " '47 The Science of Gratitude & How to Build a Gratitude Practice\\r\\n Huberman Lab Podcast 47.txt',\n", " '78 The Science & Treatment of Obsessive Compulsive Disorder (OCD)\\r\\n Huberman Lab Podcast 78.txt',\n", " '112 Dr Andy Galpin How to Build Physical Endurance & Lose Fat\\r\\n Huberman Lab Guest Series.txt',\n", " '90 Nicotine’s Effects on the Brain & Body & How to Quit Smoking or\\r\\n Vaping Huberman Lab Podcast 90.pdf',\n", " '105 Dr Sam Harris Using Meditation to Focus View Consciousness &\\r\\n Expand Your Mind Huberman Lab 105.txt',\n", " '22 Science of Muscle Growth Increasing Strength & Muscular Recovery\\r\\n Huberman Lab Podcast 22.pdf',\n", " '84 Sleep Toolkit Tools for Optimizing Sleep & SleepWake Timing\\r\\n Huberman Lab Podcast 84.txt',\n", " '46 Time Perception & Entrainment by Dopamine Serotonin & Hormones\\r\\n Huberman Lab Podcast 46.txt',\n", " '86 What Alcohol Does to Your Body Brain & Health Huberman Lab Podcast\\r\\n 86.pdf',\n", " '76 Improve Flexibility with ResearchSupported Stretching Protocols\\r\\n Huberman Lab Podcast 76.pdf',\n", " '27 The Science of Hearing Balance & Accelerated Learning Huberman Lab\\r\\n Podcast 27.pdf',\n", " '114 Dr Andy Galpin Optimize Your Training Program for Fitness &\\r\\n Longevity Huberman Lab Guest Series.pdf',\n", " '38 Dr Matthew Johnson Psychedelics for Treating Mental Disorders\\r\\n Huberman Lab Podcast 38.pdf',\n", " '64 Controlling Sugar Cravings & Metabolism with ScienceBased Tools\\r\\n Huberman Lab Podcast 64.pdf',\n", " '96 How Meditation Works & ScienceBased Effective Meditations Huberman\\r\\n Lab Podcast 96.pdf',\n", " '30 How to Optimize Your BrainBody Function & Health Huberman Lab\\r\\n Podcast 30.pdf',\n", " '111 Dr Sara Gottfried How to Optimize Female Hormone Health for\\r\\n Vitality & Longevity Huberman Lab.pdf',\n", " '43 Dr Samer Hattar Timing Light Food & Exercise for Better Sleep\\r\\n Energy & Mood Huberman Lab 43.pdf',\n", " 'WA LIVE EVENT Q&A Dr Andrew Huberman Question & Answer in Seattle\\r\\n WA.pdf',\n", " '09 Control Pain & Heal Faster with Your Brain Huberman Lab Podcast\\r\\n 9.pdf',\n", " '77 Ido Portal The Science & Practice of Movement Huberman Lab Podcast\\r\\n 77.pdf',\n", " '29 Dr Lex Fridman Machines Creativity & Love Huberman Lab Podcast\\r\\n 29.pdf',\n", " '20 How to Learn Skills Faster Huberman Lab Podcast 20.pdf',\n", " '60 Dr David Spiegel Using Hypnosis to Enhance Health & Performance\\r\\n Huberman Lab Podcast 60.pdf',\n", " '05 Understanding and Using Dreams to Learn and to Forget Huberman Lab\\r\\n Podcast 5.pdf',\n", " '92 The Effects of Cannabis (Marijuana) on the Brain & Body Huberman\\r\\n Lab Podcast 92.pdf',\n", " '106 Developing a Rational Approach to Supplementation for Health &\\r\\n Performance Huberman Lab Podcast 106.txt',\n", " '14 Biological Influences On Sex Sex Differences & Preferences\\r\\n Huberman Lab Podcast 14.pdf',\n", " '36 Healthy Eating & Eating Disorders Anorexia Bulimia Binging\\r\\n Huberman Lab Podcast 36.pdf',\n", " '55 The Science of Setting & Achieving Goals Huberman Lab Podcast\\r\\n 55.pdf',\n", " '82 The Science & Treatment of Bipolar Disorder Huberman Lab Podcast\\r\\n 82.pdf',\n", " '108 Dr Andy Galpin How to Assess & Improve All Aspects of Your\\r\\n Fitness Huberman Lab Guest Series.txt',\n", " '107 Rick Rubin How to Access Your Creativity Huberman Lab Podcast.pdf',\n", " '94 Fitness Toolkit Protocol & Tools to Optimize Physical Health\\r\\n Huberman Lab Podcast 94.pdf',\n", " '42 Nutrients For Brain Health & Performance Huberman Lab Podcast\\r\\n 42.pdf',\n", " '51 Science of Social Bonding in Family Friendship & Romantic Love\\r\\n Huberman Lab Podcast 51.pdf',\n", " '49 Erasing Fears & Traumas Based on the Modern Neuroscience of Fear\\r\\n Huberman Lab Podcast 49.txt',\n", " '61 How to Enhance Your Gut Microbiome for Brain & Overall Health\\r\\n Huberman Lab Podcast 61.pdf',\n", " '35 Dr Robert Sapolsky Science of Stress Testosterone & Free Will\\r\\n Huberman Lab Podcast 35.txt',\n", " '54 Dr Jack Feldman Breathing for Mental & Physical Health &\\r\\n Performance Huberman Lab Podcast 54.txt',\n", " \"50 Dr David Berson Your Brain's Logic & Function Huberman Lab Podcast\\r\\n 50.pdf\",\n", " '26 Dr Karl Deisseroth Understanding & Healing the Mind Huberman Lab\\r\\n Podcast 26.pdf',\n", " '68 Using Light (Sunlight Blue Light & Red Light) to Optimize Health\\r\\n Huberman Lab Podcast 68.pdf',\n", " '16 How Our Hormones Control Our Hunger Eating & Satiety Huberman Lab\\r\\n Podcast 16.pdf']" ] }, "metadata": {}, "execution_count": 20 } ] }, { "cell_type": "code", "source": [ "os.getcwd()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 36 }, "id": "EpsTDwrOm8jo", "outputId": "aad30f8a-9184-42cb-ce6a-c80ea6fcad83" }, "execution_count": 21, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'/content'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 21 } ] }, { "cell_type": "markdown", "source": [ "# cGPT tips om hur vi ska göra" ], "metadata": { "id": "p7_CNO0My2Z4" } }, { "cell_type": "markdown", "source": [ "To fine-tune a GPT-2 model on podcast transcripts, your proposed approach is mostly correct. Here are the steps you should follow:\n", "\n", "Preprocess the PDF transcripts: You need to extract the text from the PDF files and remove any irrelevant information such as headers and footers. You can use a library like PyPDF2 to extract the text from the PDF files.\n", "\n", "Convert the transcripts to a TextDataset: After extracting the text from the PDF files, you can save it in a text file and then use the TextDataset class from the Transformers library to create a dataset.\n", "\n", "Split the dataset into train and test sets: You can use the train_test_split function from the scikit-learn library to split the dataset into a training set and a validation set.\n", "\n", "Tokenize the dataset: You need to tokenize the text data to convert it into numerical data that the model can understand. You can use the Tokenizer class from the Transformers library to tokenize the text data.\n", "\n", "Fine-tune the GPT-2 model: You can use the Trainer class from the Transformers library to fine-tune the GPT-2 model on the podcast transcripts." ], "metadata": { "id": "huLfJNbWy-gS" } }, { "cell_type": "markdown", "source": [ "# Scraping the transcripts for PDF files" ], "metadata": { "id": "_fFt2XiioWqZ" } }, { "cell_type": "code", "source": [ "import requests\n", "\n", "url = \"https://readthatpodcast.com/\"\n", "response = requests.get(url)\n", "content = response.content" ], "metadata": { "id": "0pn8VJLCoX1n" }, "execution_count": 1, "outputs": [] }, { "cell_type": "code", "source": [ "from bs4 import BeautifulSoup\n", "\n", "soup = BeautifulSoup(content, \"html.parser\")\n", "pdf_links = []\n", "for link in soup.find_all(\"a\"):\n", " href = link.get(\"href\")\n", " if href is not None and href.endswith(\".pdf\"):\n", " pdf_links.append(href)" ], "metadata": { "id": "mFe05xVrolwz" }, "execution_count": 2, "outputs": [] }, { "cell_type": "code", "source": [ "import os\n", "\n", "base_url = \"https://readthatpodcast.com/\"\n", "for i, pdf_link in enumerate(pdf_links):\n", " if not pdf_link.startswith(\"http\"):\n", " pdf_link = base_url + pdf_link\n", " response = requests.get(pdf_link)\n", " filename = f\"transcription_{i}.pdf\"\n", " with open(filename, \"wb\") as f:\n", " f.write(response.content)\n", " print(f\"Downloaded {filename}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4HgWRCZUo-Hl", "outputId": "38d91d9d-7bb5-4b72-8deb-8df9ccfeb033" }, "execution_count": 3, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Downloaded transcription_0.pdf\n", "Downloaded transcription_1.pdf\n", "Downloaded transcription_2.pdf\n", "Downloaded transcription_3.pdf\n", "Downloaded transcription_4.pdf\n", "Downloaded transcription_5.pdf\n", "Downloaded transcription_6.pdf\n", "Downloaded transcription_7.pdf\n", "Downloaded transcription_8.pdf\n", "Downloaded transcription_9.pdf\n", "Downloaded transcription_10.pdf\n", "Downloaded transcription_11.pdf\n", "Downloaded transcription_12.pdf\n", "Downloaded transcription_13.pdf\n", "Downloaded transcription_14.pdf\n", "Downloaded transcription_15.pdf\n", "Downloaded transcription_16.pdf\n", "Downloaded transcription_17.pdf\n", "Downloaded transcription_18.pdf\n", "Downloaded transcription_19.pdf\n", "Downloaded transcription_20.pdf\n", "Downloaded transcription_21.pdf\n", "Downloaded transcription_22.pdf\n", "Downloaded transcription_23.pdf\n", "Downloaded transcription_24.pdf\n", "Downloaded transcription_25.pdf\n", "Downloaded transcription_26.pdf\n", "Downloaded transcription_27.pdf\n", "Downloaded transcription_28.pdf\n", "Downloaded transcription_29.pdf\n", "Downloaded transcription_30.pdf\n", "Downloaded transcription_31.pdf\n", "Downloaded transcription_32.pdf\n", "Downloaded transcription_33.pdf\n", "Downloaded transcription_34.pdf\n", "Downloaded transcription_35.pdf\n", "Downloaded transcription_36.pdf\n", "Downloaded transcription_37.pdf\n", "Downloaded transcription_38.pdf\n", "Downloaded transcription_39.pdf\n", "Downloaded transcription_40.pdf\n", "Downloaded transcription_41.pdf\n", "Downloaded transcription_42.pdf\n", "Downloaded transcription_43.pdf\n", "Downloaded transcription_44.pdf\n", "Downloaded transcription_45.pdf\n", "Downloaded transcription_46.pdf\n", "Downloaded transcription_47.pdf\n", "Downloaded transcription_48.pdf\n", "Downloaded transcription_49.pdf\n", "Downloaded transcription_50.pdf\n", "Downloaded transcription_51.pdf\n", "Downloaded transcription_52.pdf\n", "Downloaded transcription_53.pdf\n", "Downloaded transcription_54.pdf\n", "Downloaded transcription_55.pdf\n", "Downloaded transcription_56.pdf\n", "Downloaded transcription_57.pdf\n", "Downloaded transcription_58.pdf\n", "Downloaded transcription_59.pdf\n", "Downloaded transcription_60.pdf\n", "Downloaded transcription_61.pdf\n", "Downloaded transcription_62.pdf\n", "Downloaded transcription_63.pdf\n", "Downloaded transcription_64.pdf\n", "Downloaded transcription_65.pdf\n", "Downloaded transcription_66.pdf\n", "Downloaded transcription_67.pdf\n", "Downloaded transcription_68.pdf\n", "Downloaded transcription_69.pdf\n", "Downloaded transcription_70.pdf\n", "Downloaded transcription_71.pdf\n", "Downloaded transcription_72.pdf\n", "Downloaded transcription_73.pdf\n", "Downloaded transcription_74.pdf\n", "Downloaded transcription_75.pdf\n", "Downloaded transcription_76.pdf\n", "Downloaded transcription_77.pdf\n", "Downloaded transcription_78.pdf\n", "Downloaded transcription_79.pdf\n", "Downloaded transcription_80.pdf\n", "Downloaded transcription_81.pdf\n", "Downloaded transcription_82.pdf\n", "Downloaded transcription_83.pdf\n", "Downloaded transcription_84.pdf\n", "Downloaded transcription_85.pdf\n", "Downloaded transcription_86.pdf\n", "Downloaded transcription_87.pdf\n", "Downloaded transcription_88.pdf\n", "Downloaded transcription_89.pdf\n", "Downloaded transcription_90.pdf\n", "Downloaded transcription_91.pdf\n", "Downloaded transcription_92.pdf\n", "Downloaded transcription_93.pdf\n", "Downloaded transcription_94.pdf\n", "Downloaded transcription_95.pdf\n", "Downloaded transcription_96.pdf\n", "Downloaded transcription_97.pdf\n", "Downloaded transcription_98.pdf\n", "Downloaded transcription_99.pdf\n", "Downloaded transcription_100.pdf\n", "Downloaded transcription_101.pdf\n", "Downloaded transcription_102.pdf\n", "Downloaded transcription_103.pdf\n", "Downloaded transcription_104.pdf\n", "Downloaded transcription_105.pdf\n", "Downloaded transcription_106.pdf\n", "Downloaded transcription_107.pdf\n", "Downloaded transcription_108.pdf\n", "Downloaded transcription_109.pdf\n", "Downloaded transcription_110.pdf\n", "Downloaded transcription_111.pdf\n", "Downloaded transcription_112.pdf\n", "Downloaded transcription_113.pdf\n", "Downloaded transcription_114.pdf\n", "Downloaded transcription_115.pdf\n", "Downloaded transcription_116.pdf\n", "Downloaded transcription_117.pdf\n", "Downloaded transcription_118.pdf\n" ] } ] }, { "cell_type": "markdown", "source": [ "# Making all the PDF files into one .txt file " ], "metadata": { "id": "cD5uw6Fs101W" } }, { "cell_type": "code", "source": [ "!pip install PyPDF2" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "kVFL45YixOC4", "outputId": "f48aa14e-b71b-41fc-c2a4-7b9179f8282f" }, "execution_count": 4, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Collecting PyPDF2\n", " Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m232.6/232.6 KB\u001b[0m \u001b[31m7.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: typing_extensions>=3.10.0.0 in /usr/local/lib/python3.9/dist-packages (from PyPDF2) (4.5.0)\n", "Installing collected packages: PyPDF2\n", "Successfully installed PyPDF2-3.0.1\n" ] } ] }, { "cell_type": "code", "source": [ "import PyPDF2\n", "import re\n", "\n", "text = ''\n", "for i in range(59):\n", " pdf_file = open(f'transcription_{i}.pdf', 'rb')\n", " pdf_reader = PyPDF2.PdfReader(pdf_file)\n", " for j in range(3, len(pdf_reader.pages)):\n", " page = pdf_reader.pages[j]\n", " text += page.extract_text()\n", "\n", "text = re.sub(r'\\n', ' ', text)\n", "\n", "with open('transcripts.txt', 'w') as f:\n", " f.write(text)\n" ], "metadata": { "id": "E8-xDAljxDoW" }, "execution_count": 5, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "srunsTEgxwnw" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import torch\n", "device = torch.device(\"cuda\")" ], "metadata": { "id": "EFT2a1p6D4cW" }, "execution_count": 6, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Now we can try to follow the german gpt-2 finetuning notebook" ], "metadata": { "id": "jYn5hrLM17Jc" } }, { "cell_type": "code", "source": [ "# Since we have a .txt file maybe we can skip the TextDataset part of german notebook? he goes from json -> txt with that method" ], "metadata": { "id": "WN9WjpdP1-Vt" }, "execution_count": 21, "outputs": [] }, { "cell_type": "code", "source": [ "!pip install transformers==4.2.2" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "C7m-1lJa2Thh", "outputId": "3808bce6-9bba-40eb-b775-c2f71bc50e87" }, "execution_count": 7, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Collecting transformers==4.2.2\n", " Downloading transformers-4.2.2-py3-none-any.whl (1.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m27.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.9/dist-packages (from transformers==4.2.2) (4.65.0)\n", "Collecting tokenizers==0.9.4\n", " Downloading tokenizers-0.9.4-cp39-cp39-manylinux2010_x86_64.whl (2.9 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.9/2.9 MB\u001b[0m \u001b[31m67.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.9/dist-packages (from transformers==4.2.2) (23.0)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.9/dist-packages (from transformers==4.2.2) (1.22.4)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.9/dist-packages (from transformers==4.2.2) (3.9.0)\n", "Collecting sacremoses\n", " Downloading sacremoses-0.0.53.tar.gz (880 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m880.6/880.6 KB\u001b[0m \u001b[31m46.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", "Requirement already satisfied: requests in /usr/local/lib/python3.9/dist-packages (from transformers==4.2.2) (2.25.1)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.9/dist-packages (from transformers==4.2.2) (2022.6.2)\n", "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests->transformers==4.2.2) (2.10)\n", "Requirement already satisfied: chardet<5,>=3.0.2 in /usr/local/lib/python3.9/dist-packages (from requests->transformers==4.2.2) (4.0.0)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.9/dist-packages (from requests->transformers==4.2.2) (1.26.14)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.9/dist-packages (from requests->transformers==4.2.2) (2022.12.7)\n", "Requirement already satisfied: six in /usr/local/lib/python3.9/dist-packages (from sacremoses->transformers==4.2.2) (1.15.0)\n", "Requirement already satisfied: click in /usr/local/lib/python3.9/dist-packages (from sacremoses->transformers==4.2.2) (8.1.3)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.9/dist-packages (from sacremoses->transformers==4.2.2) (1.2.0)\n", "Building wheels for collected packages: sacremoses\n", " Building wheel for sacremoses (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=77b1976a8c41808c190aa451e1546fbf2dea30de332f21e73e3b0f9926517d82\n", " Stored in directory: /root/.cache/pip/wheels/12/1c/3d/46cf06718d63a32ff798a89594b61e7f345ab6b36d909ce033\n", "Successfully built sacremoses\n", "Installing collected packages: tokenizers, sacremoses, transformers\n", "Successfully installed sacremoses-0.0.53 tokenizers-0.9.4 transformers-4.2.2\n" ] } ] }, { "cell_type": "code", "source": [ "!nvidia-smi" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "BYlQ3geC2WYN", "outputId": "9d2ca62e-ea2c-446b-eb27-bd1633093553" }, "execution_count": 23, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.\n", "\n" ] } ] }, { "cell_type": "code", "source": [ "from sklearn.model_selection import train_test_split" ], "metadata": { "id": "tmINEa-n2dD1" }, "execution_count": 8, "outputs": [] }, { "cell_type": "code", "source": [ "# Load the text data from file\n", "with open('transcripts.txt', 'r') as f:\n", " text = f.read()" ], "metadata": { "id": "x4zoL3pF36za" }, "execution_count": 9, "outputs": [] }, { "cell_type": "code", "source": [ "# Split the data into training and validation sets\n", "train_text, val_text = train_test_split(text, test_size=0.2, random_state=42)" ], "metadata": { "id": "Wq5LYR1138j3" }, "execution_count": 10, "outputs": [] }, { "cell_type": "code", "source": [ "# Convert the lists to strings\n", "train_text_str = '\\n'.join(train_text)\n", "val_text_str = '\\n'.join(val_text)\n", "\n", "# Save the training and validation sets to file\n", "with open('train_text.txt', 'w') as f:\n", " f.write(train_text_str)\n", "\n", "with open('val_text.txt', 'w') as f:\n", " f.write(val_text_str)" ], "metadata": { "id": "HW0QsIBS4un2" }, "execution_count": 11, "outputs": [] }, { "cell_type": "code", "source": [ "print(\"Train dataset length: \"+str(len(train_text)))\n", "print(\"Test dataset length: \"+ str(len(val_text)))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1sXbjvLX4xoV", "outputId": "bd5ac3e2-685e-4229-d9ec-dc4bc98e6dc0" }, "execution_count": 12, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Train dataset length: 7278223\n", "Test dataset length: 1819556\n" ] } ] }, { "cell_type": "code", "source": [ "# Now we have our data, we need to try to tokenize it before we can train the model on it" ], "metadata": { "id": "QtVbdpAw45sc" }, "execution_count": 34, "outputs": [] }, { "cell_type": "code", "source": [ "from transformers import AutoTokenizer, AutoModelForCausalLM\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n", "\n", "model = AutoModelForCausalLM.from_pretrained(\"gpt2\").to(device)\n", "\n", "train_path = 'train_text.txt'\n", "test_path = 'val_text.txt'" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 177, "referenced_widgets": [ "d27f2d2c41cd4be8a9aa9a6c2af13fb9", "11025864fbeb430a893b1de80faabcf9", "200671d2a7c247e0a01442eb125d5ce1", "435848b38f7044de8e83eb42f52e9229", "8fa288aef0ad48cb808283829b085412", "de073c24ab684b4398c1bf087e98c149", "4707bd9060944115890cffcb1c04c997", "671e4645df0d41cc9694017c080708a3", "765955da0dda49b29b716228a2b4ddf8", "b16a39477ca64800bf248a37f3dcfd2e", "cf59a8595c884352a8296abcf38f5156", "c0706b24a01e48b0adc722fd01152c53", "1c0ee2a664d64c2bb269b12b7e2a39f7", "9dc23485fb6b488da5e8f59db8512227", "6699d1dc6d034aa28ddf29dcc666ef3d", "1627eddbc0c741239b6e81734b6c1797", "429ef15df30d4e3e9096297d09a56317", "4f673f201aa84e8c9bf9daca8b3ca7e4", "a938011e05c04cb2a95e98461621aff8", "11ebe9e92fa547a6b90534524bf26216", "b7b4ad2048b0408b976e24bf43923c8a", "1602a2cf6cc841f4a186f4000cf322a2", "66f5232c274846718e559ccf1f51136a", "b43c9343b8b34d988a3f61eb35b4bcb4", "c7ff7c3e69f14dcfaa17a298dbc8f960", "43c09ba7df694bfe962f2bc97643e709", "9e3a41f9a95a4d2fba31ad6c59bbe39f", "00129ca9392d479389870f953ee21114", "55ff136de2e74e30b940d6bcead779b7", "78334aa6adbc4c8393b105901c476459", "af51e32471554a4e965fbbc570e9c464", "08321d2b90d645aa89c697ee96ddc17a", "387b5e274f304d38825e84b6a46e884e", "e8a4e1e0c7334684bfb36c6eefee8bb3", "96d393d7106b4dd08840c284998ae9f4", "4ff04b8ae6d2456ca0be2280454cb2c7", "298c5f7f1deb49309862093974301d16", "3cf6eb1885734b549eba98918802632a", "350463df10b74730bef0bfbd3c3ced79", "561e57826b0f442fb3d4def7ae5843ae", "aae93142cdf6434cbbcca9cb8a5111fe", "9bc27535b6ce4a509eaedd57430574ea", "cea7abb1d8544c57b769a4f4381a5069", "ce0cc3eb80014b909b8c96672184eca2", "56da3a98b0574eeb9ad67adfa4edad13", "7ada9dc05146420a9e195c1d766f612c", "04a73d69850e41db98de0cfdb37e5374", "f35f3043a6a34e549f25ceccc1153778", "5fc8f14e4d834eaba0e74728702c3c57", "5463b5b673004ea081304e058ec80f56", "9e71ef47c09f4cbb9999c4a50f5df962", "531137e7bbad40749e1f7e1a1d9d4528", "54ad9667bc8b49ae9c966f58fce35d5d", "5026e7aa6f2640b7bbc3218408de5b40", "f30b04de150841738b3d5de82c178d7d" ] }, "id": "IaOIjXPi50nI", "outputId": "1b07c182-9dc3-4b16-d6b6-e04748b814e5" }, "execution_count": 13, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "Downloading: 0%| | 0.00/665 [00:00 1024). Running this sequence through the model will result in indexing errors\n" ] } ] }, { "cell_type": "code", "source": [ "from transformers import Trainer, TrainingArguments,AutoModelWithLMHead\n", "\n", "model = AutoModelWithLMHead.from_pretrained(\"gpt2\")\n", "\n", "\n", "training_args = TrainingArguments(\n", " output_dir=\"./gpt2-hubpod\", #The output directory\n", " overwrite_output_dir=True, #overwrite the content of the output directory\n", " num_train_epochs=3, # number of training epochs\n", " per_device_train_batch_size=32, # batch size for training\n", " per_device_eval_batch_size=64, # batch size for evaluation\n", " eval_steps = 400, # Number of update steps between two evaluations.\n", " save_steps=8000, # after # steps model is saved \n", " warmup_steps=500,# number of warmup steps for learning rate scheduler\n", " prediction_loss_only=True,\n", " )\n", "\n", "\n", "trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " data_collator=data_collator,\n", " train_dataset=train_dataset,\n", " eval_dataset=test_dataset,\n", ")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "f_6G0XqB7KL9", "outputId": "0cf7c23d-637e-4190-db7f-ee087a4fd9a1" }, "execution_count": 15, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.9/dist-packages/transformers/models/auto/modeling_auto.py:921: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n", " warnings.warn(\n" ] } ] }, { "cell_type": "code", "source": [ "trainer.train()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "K-JD_qbZOZxe", "outputId": "d16612d0-134f-4d94-8fb1-b4c161e28969" }, "execution_count": 16, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", "
\n", " \n", " \n", " \n", " [28202/42654 2:24:20 < 1:13:58, 3.26 it/s, Epoch 1.98/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StepTraining Loss
5001.814700
10001.534100
15001.519500
20001.517000
25001.518200
30001.516700
35001.515800
40001.516900
45001.517800
50001.517800
55001.516400
60001.514400
65001.514600
70001.514800
75001.514100
80001.516200
85001.515800
90001.514500
95001.514600
100001.515200
105001.512100
110001.512200
115001.511100
120001.513200
125001.514100
130001.510500
135001.513800
140001.516300
145001.515000
150001.513900
155001.512400
160001.512700
165001.513800
170001.515400
175001.511300
180001.513200
185001.514900
190001.512200
195001.514100
200001.512500
205001.513100
210001.511400
215001.511100
220001.512600
225001.515400
230001.512200
235001.515700
240001.516100
245001.510600
250001.512900
255001.512800
260001.511700
265001.514600
270001.512100
275001.511200
280001.511800

" ] }, "metadata": {} }, { "output_type": "error", "ename": "KeyboardInterrupt", "evalue": "ignored", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtrainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m/usr/local/lib/python3.9/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, model_path, trial)\u001b[0m\n\u001b[1;32m 907\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 908\u001b[0m \u001b[0;31m# Revert to normal clipping otherwise, handling Apex or full precision\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 909\u001b[0;31m torch.nn.utils.clip_grad_norm_(\n\u001b[0m\u001b[1;32m 910\u001b[0m \u001b[0mamp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmaster_params\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptimizer\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muse_apex\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparameters\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 911\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_grad_norm\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.9/dist-packages/torch/nn/utils/clip_grad.py\u001b[0m in \u001b[0;36mclip_grad_norm_\u001b[0;34m(parameters, max_norm, norm_type, error_if_nonfinite)\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[0mclip_coef_clamped\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclamp\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclip_coef\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1.0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mgrads\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 56\u001b[0;31m \u001b[0mg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdetach\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmul_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclip_coef_clamped\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 57\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mtotal_norm\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "d4W-MSdqOblu" }, "execution_count": null, "outputs": [] } ] }