Spaces:

rushi-k
/

App_7

Sleeping

App Files Files Community

rushi-k commited on Jun 22

Commit

093c950

•

1 Parent(s): f04b329

Upload 3 files

Browse files

Files changed (3) hide show

categorized_text.csv +14 -0
pdf_extraction.ipynb +172 -0
sample.pdf +132 -1

categorized_text.csv ADDED Viewed

	@@ -0,0 +1,14 @@

+Category,Text
+Introduction,"Introduction
+This is the introduction section of the document. It provides an overview and background information
+about the topic."
+Methods,"Methods
+The methodology used in this study includes various techniques and approaches to gather and
+analyze data."
+Results,"Results
+The results of the experiment are as follows. We observed significant changes in the sample group."
+Results,"Discussion
+In this section, we discuss the implications of the results and analyze the data in detail."
+Conclusion,"Conclusion
+To conclude, the study shows that the proposed method is effective in achieving the desired
+outcomes."

pdf_extraction.ipynb ADDED Viewed

	@@ -0,0 +1,172 @@

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "NcbVnEu4FhOR",
+        "outputId": "22b4eccf-e505-4e7d-cd62-a1f9a81c5122"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Collecting fpdf\n",
+            "  Downloading fpdf-1.7.2.tar.gz (39 kB)\n",
+            "  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+            "Building wheels for collected packages: fpdf\n",
+            "  Building wheel for fpdf (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+            "  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40702 sha256=fce1468b38f3575cc034bdaa92969086d872fc7d672c9d89a0021485d9dc4ea7\n",
+            "  Stored in directory: /root/.cache/pip/wheels/f9/95/ba/f418094659025eb9611f17cbcaf2334236bf39a0c3453ea455\n",
+            "Successfully built fpdf\n",
+            "Installing collected packages: fpdf\n",
+            "Successfully installed fpdf-1.7.2\n",
+            "Collecting PyPDF2\n",
+            "  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m232.6/232.6 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hInstalling collected packages: PyPDF2\n",
+            "Successfully installed PyPDF2-3.0.1\n",
+            "PDF generated at: /content/sample.pdf\n"
+          ]
+        }
+      ],
+      "source": [
+        "# First, install the necessary libraries\n",
+        "!pip install fpdf\n",
+        "!pip install PyPDF2\n",
+        "\n",
+        "from fpdf import FPDF\n",
+        "\n",
+        "# Create instance of FPDF class\n",
+        "pdf = FPDF()\n",
+        "\n",
+        "# Add a page\n",
+        "pdf.add_page()\n",
+        "\n",
+        "# Set title\n",
+        "pdf.set_font(\"Arial\", size=12)\n",
+        "pdf.multi_cell(0, 10, \"Introduction\\n\\nThis is the introduction section of the document. It provides an overview and background information about the topic.\\n\\n\")\n",
+        "\n",
+        "# Add a page\n",
+        "pdf.add_page()\n",
+        "pdf.multi_cell(0, 10, \"Methods\\n\\nThe methodology used in this study includes various techniques and approaches to gather and analyze data.\\n\\n\")\n",
+        "\n",
+        "# Add a page\n",
+        "pdf.add_page()\n",
+        "pdf.multi_cell(0, 10, \"Results\\n\\nThe results of the experiment are as follows. We observed significant changes in the sample group.\\n\\n\")\n",
+        "\n",
+        "# Add a page\n",
+        "pdf.add_page()\n",
+        "pdf.multi_cell(0, 10, \"Discussion\\n\\nIn this section, we discuss the implications of the results and analyze the data in detail.\\n\\n\")\n",
+        "\n",
+        "# Add a page\n",
+        "pdf.add_page()\n",
+        "pdf.multi_cell(0, 10, \"Conclusion\\n\\nTo conclude, the study shows that the proposed method is effective in achieving the desired outcomes.\\n\\n\")\n",
+        "\n",
+        "# Save the PDF\n",
+        "pdf_file_path = \"/content/sample.pdf\"\n",
+        "pdf.output(pdf_file_path)\n",
+        "\n",
+        "print(\"PDF generated at:\", pdf_file_path)\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import PyPDF2\n",
+        "import pandas as pd\n",
+        "\n",
+        "# Load the PDF\n",
+        "pdf_file_path = \"/content/sample.pdf\"\n",
+        "\n",
+        "# Define the categories\n",
+        "categories = [\"Introduction\", \"Methods\", \"Results\", \"Discussion\", \"Conclusion\"]\n",
+        "\n",
+        "# Create a dictionary to store the categorized text\n",
+        "categorized_text = {category: [] for category in categories}\n",
+        "\n",
+        "# Function to categorize text based on keywords\n",
+        "def categorize_text(text):\n",
+        "    for category in categories:\n",
+        "        if category.lower() in text.lower():\n",
+        "            return category\n",
+        "    return None\n",
+        "\n",
+        "# Read the PDF\n",
+        "with open(pdf_file_path, 'rb') as file:\n",
+        "    reader = PyPDF2.PdfReader(file)\n",
+        "    num_pages = len(reader.pages)\n",
+        "    for page_num in range(num_pages):\n",
+        "        page = reader.pages[page_num]\n",
+        "        text = page.extract_text()\n",
+        "        category = categorize_text(text)\n",
+        "        if category:\n",
+        "            categorized_text[category].append(text.strip())\n",
+        "\n",
+        "# Prepare the data for CSV\n",
+        "data = {\n",
+        "    \"Category\": [],\n",
+        "    \"Text\": []\n",
+        "}\n",
+        "\n",
+        "for category, texts in categorized_text.items():\n",
+        "    for text in texts:\n",
+        "        data[\"Category\"].append(category)\n",
+        "        data[\"Text\"].append(text)\n",
+        "\n",
+        "# Convert to DataFrame\n",
+        "df = pd.DataFrame(data)\n",
+        "\n",
+        "# Save to CSV\n",
+        "output_csv_path = \"/content/categorized_text.csv\"\n",
+        "df.to_csv(output_csv_path, index=False)\n",
+        "\n",
+        "print(\"Categorized text saved to:\", output_csv_path)\n"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "MptFl1iIFoPp",
+        "outputId": "194b5e2d-9892-4662-a236-28f216e54389"
+      },
+      "execution_count": 2,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Categorized text saved to: /content/categorized_text.csv\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "HdQonYRGFsgq"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}

sample.pdf ADDED Viewed

	@@ -0,0 +1,132 @@








































0	- ~~1\|�.�ƞЦ8��:�9~~

+%PDF-1.3
+3 0 obj
+<</Type /Page
+/Parent 1 0 R
+/Resources 2 0 R
+/Contents 4 0 R>>
+endobj
+4 0 obj
+<</Filter /FlateDecode /Length 172>>
+stream
+x�U��
+�0D�~���&�z��h�T��+i���i������>f
+�6��
+�椱�HȂ��nq��$+��HJA[l�}l�����s�$ٝ~��"�����cp��v�,�Թ>�����n@��хѻ)k��6�{����[]�W俄�ʥD�p�KL�7�>�hK�
+endstream
+endobj
+5 0 obj
+<</Type /Page
+/Parent 1 0 R
+/Resources 2 0 R
+/Contents 6 0 R>>
+endobj
+6 0 obj
+<</Filter /FlateDecode /Length 171>>
+stream
+x�U���0Ew�⎺��
+endstream
+endobj
+7 0 obj
+<</Type /Page
+/Parent 1 0 R
+/Resources 2 0 R
+/Contents 8 0 R>>
+endobj
+8 0 obj
+<</Filter /FlateDecode /Length 150>>
+stream
+x�Uͱ�0����Q��BH�j���i�\�/�@K��������._�r\2)J�%;�
+*R�8�̧*�PtU
+�a��s?���O�Jȍ���6��0��^#%?P�`�2\�������L�I
+endstream
+endobj
+9 0 obj
+<</Type /Page
+/Parent 1 0 R
+/Resources 2 0 R
+/Contents 10 0 R>>
+endobj
+10 0 obj
+<</Filter /FlateDecode /Length 144>>
+stream
+x�U�A
+�0D�=�,$&-%d+*�:M�_b*����7mq�b6o�05����T,�gU)a{��%��6���cs$�Ff���o5F�պ$�18t��;L~]�"��H��;��/�x���/q��	�.;P��Q��/`k7j
+endstream
+endobj
+11 0 obj
+<</Type /Page
+/Parent 1 0 R
+/Resources 2 0 R
+/Contents 12 0 R>>
+endobj
+12 0 obj
+<</Filter /FlateDecode /Length 169>>
+stream
+x�U�A�0�ὧxKMLmiHe�х�^����Ch�x{C��y�&��F��`ڜ,W�	)a+\�<i%TS��X��[�b�v����>�N?\B~�e�����&�4�b�S��eZƮ�#y4�j�TU�R	�E��@ch��CO~��������Cr�P�|A�D�
+endstream
+endobj
+1 0 obj
+<</Type /Pages
+/Kids [3 0 R 5 0 R 7 0 R 9 0 R 11 0 R ]
+/Count 5
+/MediaBox [0 0 595.28 841.89]
+>>
+endobj
+13 0 obj
+<</Type /Font
+/BaseFont /Helvetica
+/Subtype /Type1
+/Encoding /WinAnsiEncoding
+>>
+endobj
+2 0 obj
+<<
+/ProcSet [/PDF /Text /ImageB /ImageC /ImageI]
+/Font <<
+/F1 13 0 R
+>>
+/XObject <<
+>>
+>>
+endobj
+14 0 obj
+<<
+/Producer (PyFPDF 1.7.2 http://pyfpdf.googlecode.com/)
+/CreationDate (D:20240607142530)
+>>
+endobj
+15 0 obj
+<<
+/Type /Catalog
+/Pages 1 0 R
+/OpenAction [3 0 R /FitH null]
+/PageLayout /OneColumn
+>>
+endobj
+xref
+0 16
+0000000000 65535 f
+0000001560 00000 n
+0000001769 00000 n
+0000000009 00000 n
+0000000087 00000 n
+0000000329 00000 n
+0000000407 00000 n
+0000000648 00000 n
+0000000726 00000 n
+0000000946 00000 n
+0000001025 00000 n
+0000001240 00000 n
+0000001320 00000 n
+0000001672 00000 n
+0000001874 00000 n
+0000001984 00000 n
+trailer
+<<
+/Size 16
+/Root 15 0 R
+/Info 14 0 R
+>>
+startxref
+2088
+%%EOF