{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Generate a sample PDF and categorize its pages\n",
    "\n",
    "1. Build a five-page PDF, one section per page, with `fpdf`.\n",
    "2. Read it back with `PyPDF2`, label each page with the section name found in its text, and save the result as a CSV."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install the third-party libraries, pinned to the versions this notebook was last run with.\n",
    "%pip install -q fpdf==1.7.2 PyPDF2==3.0.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "NcbVnEu4FhOR"
   },
   "outputs": [],
   "source": [
    "from fpdf import FPDF\n",
    "\n",
    "# Section title -> body text. Each entry becomes one page of the sample PDF.\n",
    "sections = {\n",
    "    \"Introduction\": \"This is the introduction section of the document. It provides an overview and background information about the topic.\",\n",
    "    \"Methods\": \"The methodology used in this study includes various techniques and approaches to gather and analyze data.\",\n",
    "    \"Results\": \"The results of the experiment are as follows. We observed significant changes in the sample group.\",\n",
    "    \"Discussion\": \"In this section, we discuss the implications of the results and analyze the data in detail.\",\n",
    "    \"Conclusion\": \"To conclude, the study shows that the proposed method is effective in achieving the desired outcomes.\",\n",
    "}\n",
    "\n",
    "# Create instance of FPDF class\n",
    "pdf = FPDF()\n",
    "\n",
    "for title, body in sections.items():\n",
    "    pdf.add_page()\n",
    "    # Select the font only once a page exists, in the same order as before.\n",
    "    pdf.set_font(\"Arial\", size=12)\n",
    "    pdf.multi_cell(0, 10, f\"{title}\\n\\n{body}\\n\\n\")\n",
    "\n",
    "# Save the PDF\n",
    "pdf_file_path = \"/content/sample.pdf\"\n",
    "pdf.output(pdf_file_path)\n",
    "\n",
    "print(\"PDF generated at:\", pdf_file_path)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "MptFl1iIFoPp"
   },
   "outputs": [],
   "source": [
    "import PyPDF2\n",
    "import pandas as pd\n",
    "\n",
    "# Load the PDF\n",
    "pdf_file_path = \"/content/sample.pdf\"\n",
    "\n",
    "# Define the categories\n",
    "categories = [\"Introduction\", \"Methods\", \"Results\", \"Discussion\", \"Conclusion\"]\n",
    "\n",
    "\n",
    "def categorize_text(text):\n",
    "    \"\"\"Return the category whose name occurs earliest in `text`, or None.\n",
    "\n",
    "    Matching is case-insensitive. Picking the earliest occurrence, rather than\n",
    "    the first entry of `categories` found anywhere in the text, keeps the\n",
    "    'Discussion' page (whose body mentions 'the results') out of 'Results'.\n",
    "    Empty or missing text yields None.\n",
    "    \"\"\"\n",
    "    if not text:\n",
    "        return None\n",
    "    lowered = text.lower()\n",
    "    best_category, best_position = None, len(lowered)\n",
    "    for category in categories:\n",
    "        position = lowered.find(category.lower())\n",
    "        # Strict '<' keeps the earlier list entry if two names start at the same offset.\n",
    "        if 0 <= position < best_position:\n",
    "            best_category, best_position = category, position\n",
    "    return best_category\n",
    "\n",
    "\n",
    "# Create a dictionary to store the categorized text\n",
    "categorized_text = {category: [] for category in categories}\n",
    "\n",
    "# Read the PDF page by page\n",
    "with open(pdf_file_path, \"rb\") as file:\n",
    "    reader = PyPDF2.PdfReader(file)\n",
    "    for page in reader.pages:\n",
    "        # Treat a page with no extractable text as empty instead of failing.\n",
    "        text = page.extract_text() or \"\"\n",
    "        category = categorize_text(text)\n",
    "        if category:\n",
    "            categorized_text[category].append(text.strip())\n",
    "\n",
    "# Prepare the data for CSV: one row per page, grouped in `categories` order\n",
    "data = {\n",
    "    \"Category\": [category for category, texts in categorized_text.items() for _ in texts],\n",
    "    \"Text\": [text for texts in categorized_text.values() for text in texts],\n",
    "}\n",
    "\n",
    "# Convert to DataFrame\n",
    "df = pd.DataFrame(data)\n",
    "\n",
    "# Save to CSV\n",
    "output_csv_path = \"/content/categorized_text.csv\"\n",
    "df.to_csv(output_csv_path, index=False)\n",
    "\n",
    "print(\"Categorized text saved to:\", output_csv_path)\n"
   ]
  }
 ]
}