{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "NcbVnEu4FhOR",
        "outputId": "22b4eccf-e505-4e7d-cd62-a1f9a81c5122"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Collecting fpdf\n",
            "  Downloading fpdf-1.7.2.tar.gz (39 kB)\n",
            "  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "Building wheels for collected packages: fpdf\n",
            "  Building wheel for fpdf (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40702 sha256=fce1468b38f3575cc034bdaa92969086d872fc7d672c9d89a0021485d9dc4ea7\n",
            "  Stored in directory: /root/.cache/pip/wheels/f9/95/ba/f418094659025eb9611f17cbcaf2334236bf39a0c3453ea455\n",
            "Successfully built fpdf\n",
            "Installing collected packages: fpdf\n",
            "Successfully installed fpdf-1.7.2\n",
            "Collecting PyPDF2\n",
            "  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m232.6/232.6 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hInstalling collected packages: PyPDF2\n",
            "Successfully installed PyPDF2-3.0.1\n",
            "PDF generated at: /content/sample.pdf\n"
          ]
        }
      ],
      "source": [
        "# Install the dependencies into the environment the kernel runs in.\n",
        "# %pip (instead of !pip) targets the kernel's interpreter, and the pins match\n",
        "# the versions recorded the last time this notebook was executed.\n",
        "%pip install -q fpdf==1.7.2 PyPDF2==3.0.1\n",
        "\n",
        "from fpdf import FPDF\n",
        "\n",
        "# Destination of the generated sample document.\n",
        "pdf_file_path = \"/content/sample.pdf\"\n",
        "\n",
        "# One (heading, body) pair per page, in document order.\n",
        "sections = [\n",
        "    (\"Introduction\", \"This is the introduction section of the document. It provides an overview and background information about the topic.\"),\n",
        "    (\"Methods\", \"The methodology used in this study includes various techniques and approaches to gather and analyze data.\"),\n",
        "    (\"Results\", \"The results of the experiment are as follows. We observed significant changes in the sample group.\"),\n",
        "    (\"Discussion\", \"In this section, we discuss the implications of the results and analyze the data in detail.\"),\n",
        "    (\"Conclusion\", \"To conclude, the study shows that the proposed method is effective in achieving the desired outcomes.\"),\n",
        "]\n",
        "\n",
        "# Create instance of FPDF class\n",
        "pdf = FPDF()\n",
        "\n",
        "# Render each section on its own page: heading, blank line, body, blank line.\n",
        "for heading, body in sections:\n",
        "    pdf.add_page()\n",
        "    # The font is selected after the page exists, in the same order as before.\n",
        "    pdf.set_font(\"Arial\", size=12)\n",
        "    pdf.multi_cell(0, 10, f\"{heading}\\n\\n{body}\\n\\n\")\n",
        "\n",
        "# Save the PDF\n",
        "pdf.output(pdf_file_path)\n",
        "\n",
        "print(\"PDF generated at:\", pdf_file_path)\n"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import PyPDF2\n",
        "import pandas as pd\n",
        "\n",
        "# Location of the PDF whose pages will be sorted into sections\n",
        "pdf_file_path = \"/content/sample.pdf\"\n",
        "\n",
        "# Section names a page can be filed under\n",
        "categories = [\n",
        "    \"Introduction\",\n",
        "    \"Methods\",\n",
        "    \"Results\",\n",
        "    \"Discussion\",\n",
        "    \"Conclusion\",\n",
        "]\n",
        "\n",
        "# Give every section its own, initially empty, list of page texts\n",
        "categorized_text = dict((section_name, []) for section_name in categories)\n",
        "\n",
        "# Function to categorize text based on keywords\n",
        "def categorize_text(text, candidates=None):\n",
        "    \"\"\"Return the category whose name occurs earliest in ``text``.\n",
        "\n",
        "    Matching is case-insensitive. Choosing the earliest occurrence, rather than\n",
        "    the first category in list order, lets a page's heading win over a category\n",
        "    name that is merely mentioned later in the body (e.g. a \"Discussion\" page\n",
        "    that talks about \"the results\").\n",
        "\n",
        "    Args:\n",
        "        text: Text to classify; may be empty or None.\n",
        "        candidates: Category names to look for. Defaults to the module-level\n",
        "            ``categories`` list.\n",
        "\n",
        "    Returns:\n",
        "        The matching category name, or None when ``text`` is empty or contains\n",
        "        none of the names.\n",
        "    \"\"\"\n",
        "    if candidates is None:\n",
        "        candidates = categories\n",
        "    if not text:\n",
        "        return None\n",
        "\n",
        "    lowered = text.lower()\n",
        "    best_name, best_pos = None, -1\n",
        "    for name in candidates:\n",
        "        pos = lowered.find(name.lower())\n",
        "        # Strict '<' keeps the earlier-listed category when two names tie.\n",
        "        if pos != -1 and (best_name is None or pos < best_pos):\n",
        "            best_name, best_pos = name, pos\n",
        "    return best_name\n",
        "\n",
        "# Walk the PDF page by page and file each page under its section\n",
        "with open(pdf_file_path, 'rb') as file:\n",
        "    reader = PyPDF2.PdfReader(file)\n",
        "    for page in reader.pages:\n",
        "        text = page.extract_text()\n",
        "        category = categorize_text(text)\n",
        "        if not category:\n",
        "            continue  # pages that match no section are left out\n",
        "        categorized_text[category].append(text.strip())\n",
        "\n",
        "# Flatten the per-section lists into (section, page text) rows, section by section\n",
        "rows = [\n",
        "    (section, page_text)\n",
        "    for section, page_texts in categorized_text.items()\n",
        "    for page_text in page_texts\n",
        "]\n",
        "data = {\n",
        "    \"Category\": [section for section, _ in rows],\n",
        "    \"Text\": [page_text for _, page_text in rows],\n",
        "}\n",
        "\n",
        "# Tabulate the rows and write them to CSV without the index column\n",
        "df = pd.DataFrame(data)\n",
        "output_csv_path = \"/content/categorized_text.csv\"\n",
        "df.to_csv(output_csv_path, index=False)\n",
        "\n",
        "print(\"Categorized text saved to:\", output_csv_path)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "MptFl1iIFoPp",
        "outputId": "194b5e2d-9892-4662-a236-28f216e54389"
      },
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Categorized text saved to: /content/categorized_text.csv\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "HdQonYRGFsgq"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}