rushi-k committed on
Commit
093c950
1 Parent(s): f04b329

Upload 3 files

Browse files
Files changed (3) hide show
  1. categorized_text.csv +14 -0
  2. pdf_extraction.ipynb +172 -0
  3. sample.pdf +132 -1
categorized_text.csv ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Category,Text
2
+ Introduction,"Introduction
3
+ This is the introduction section of the document. It provides an overview and background information
4
+ about the topic."
5
+ Methods,"Methods
6
+ The methodology used in this study includes various techniques and approaches to gather and
7
+ analyze data."
8
+ Results,"Results
9
+ The results of the experiment are as follows. We observed significant changes in the sample group."
10
+ Discussion,"Discussion
11
+ In this section, we discuss the implications of the results and analyze the data in detail."
12
+ Conclusion,"Conclusion
13
+ To conclude, the study shows that the proposed method is effective in achieving the desired
14
+ outcomes."
pdf_extraction.ipynb ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "python3",
10
+ "display_name": "Python 3"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ }
15
+ },
16
+ "cells": [
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 1,
20
+ "metadata": {
21
+ "colab": {
22
+ "base_uri": "https://localhost:8080/"
23
+ },
24
+ "id": "NcbVnEu4FhOR",
25
+ "outputId": "22b4eccf-e505-4e7d-cd62-a1f9a81c5122"
26
+ },
27
+ "outputs": [
28
+ {
29
+ "output_type": "stream",
30
+ "name": "stdout",
31
+ "text": [
32
+ "Collecting fpdf\n",
33
+ " Downloading fpdf-1.7.2.tar.gz (39 kB)\n",
34
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
35
+ "Building wheels for collected packages: fpdf\n",
36
+ " Building wheel for fpdf (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
37
+ " Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40702 sha256=fce1468b38f3575cc034bdaa92969086d872fc7d672c9d89a0021485d9dc4ea7\n",
38
+ " Stored in directory: /root/.cache/pip/wheels/f9/95/ba/f418094659025eb9611f17cbcaf2334236bf39a0c3453ea455\n",
39
+ "Successfully built fpdf\n",
40
+ "Installing collected packages: fpdf\n",
41
+ "Successfully installed fpdf-1.7.2\n",
42
+ "Collecting PyPDF2\n",
43
+ " Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)\n",
44
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m232.6/232.6 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
45
+ "\u001b[?25hInstalling collected packages: PyPDF2\n",
46
+ "Successfully installed PyPDF2-3.0.1\n",
47
+ "PDF generated at: /content/sample.pdf\n"
48
+ ]
49
+ }
50
+ ],
51
+ "source": [
52
+ "# First, install the necessary libraries\n",
53
+ "!pip install fpdf\n",
54
+ "!pip install PyPDF2\n",
55
+ "\n",
56
+ "from fpdf import FPDF\n",
57
+ "\n",
58
+ "# Create instance of FPDF class\n",
59
+ "pdf = FPDF()\n",
60
+ "\n",
61
+ "# Add a page\n",
62
+ "pdf.add_page()\n",
63
+ "\n",
64
+ "# Set the font and write the Introduction section\n",
65
+ "pdf.set_font(\"Arial\", size=12)\n",
66
+ "pdf.multi_cell(0, 10, \"Introduction\\n\\nThis is the introduction section of the document. It provides an overview and background information about the topic.\\n\\n\")\n",
67
+ "\n",
68
+ "# Add a page\n",
69
+ "pdf.add_page()\n",
70
+ "pdf.multi_cell(0, 10, \"Methods\\n\\nThe methodology used in this study includes various techniques and approaches to gather and analyze data.\\n\\n\")\n",
71
+ "\n",
72
+ "# Add a page\n",
73
+ "pdf.add_page()\n",
74
+ "pdf.multi_cell(0, 10, \"Results\\n\\nThe results of the experiment are as follows. We observed significant changes in the sample group.\\n\\n\")\n",
75
+ "\n",
76
+ "# Add a page\n",
77
+ "pdf.add_page()\n",
78
+ "pdf.multi_cell(0, 10, \"Discussion\\n\\nIn this section, we discuss the implications of the results and analyze the data in detail.\\n\\n\")\n",
79
+ "\n",
80
+ "# Add a page\n",
81
+ "pdf.add_page()\n",
82
+ "pdf.multi_cell(0, 10, \"Conclusion\\n\\nTo conclude, the study shows that the proposed method is effective in achieving the desired outcomes.\\n\\n\")\n",
83
+ "\n",
84
+ "# Save the PDF\n",
85
+ "pdf_file_path = \"/content/sample.pdf\"\n",
86
+ "pdf.output(pdf_file_path)\n",
87
+ "\n",
88
+ "print(\"PDF generated at:\", pdf_file_path)\n"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "source": [
94
+ "import PyPDF2\n",
95
+ "import pandas as pd\n",
96
+ "\n",
97
+ "# Path of the PDF to read\n",
98
+ "pdf_file_path = \"/content/sample.pdf\"\n",
99
+ "\n",
100
+ "# Define the categories\n",
101
+ "categories = [\"Introduction\", \"Methods\", \"Results\", \"Discussion\", \"Conclusion\"]\n",
102
+ "\n",
103
+ "# Create a dictionary to store the categorized text\n",
104
+ "categorized_text = {category: [] for category in categories}\n",
105
+ "\n",
106
+ "# Function to categorize text based on keywords\n",
107
+ "def categorize_text(text):\n",
108
+ "    \"\"\"Return the category whose keyword occurs earliest in text, so a page heading wins over later body mentions; None if no keyword occurs.\"\"\"\n",
109
+ "    lowered = (text or \"\").lower()\n",
110
+ "    hits = [(lowered.find(c.lower()), i, c) for i, c in enumerate(categories) if c.lower() in lowered]\n",
111
+ "    return min(hits)[2] if hits else None\n",
112
+ "\n",
113
+ "# Read the PDF\n",
114
+ "with open(pdf_file_path, 'rb') as file:\n",
115
+ " reader = PyPDF2.PdfReader(file)\n",
116
+ " num_pages = len(reader.pages)\n",
117
+ " for page_num in range(num_pages):\n",
118
+ " page = reader.pages[page_num]\n",
119
+ " text = page.extract_text()\n",
120
+ " category = categorize_text(text)\n",
121
+ " if category:\n",
122
+ " categorized_text[category].append(text.strip())\n",
123
+ "\n",
124
+ "# Prepare the data for CSV\n",
125
+ "data = {\n",
126
+ " \"Category\": [],\n",
127
+ " \"Text\": []\n",
128
+ "}\n",
129
+ "\n",
130
+ "for category, texts in categorized_text.items():\n",
131
+ " for text in texts:\n",
132
+ " data[\"Category\"].append(category)\n",
133
+ " data[\"Text\"].append(text)\n",
134
+ "\n",
135
+ "# Convert to DataFrame\n",
136
+ "df = pd.DataFrame(data)\n",
137
+ "\n",
138
+ "# Save to CSV\n",
139
+ "output_csv_path = \"/content/categorized_text.csv\"\n",
140
+ "df.to_csv(output_csv_path, index=False)\n",
141
+ "\n",
142
+ "print(\"Categorized text saved to:\", output_csv_path)\n"
143
+ ],
144
+ "metadata": {
145
+ "colab": {
146
+ "base_uri": "https://localhost:8080/"
147
+ },
148
+ "id": "MptFl1iIFoPp",
149
+ "outputId": "194b5e2d-9892-4662-a236-28f216e54389"
150
+ },
151
+ "execution_count": 2,
152
+ "outputs": [
153
+ {
154
+ "output_type": "stream",
155
+ "name": "stdout",
156
+ "text": [
157
+ "Categorized text saved to: /content/categorized_text.csv\n"
158
+ ]
159
+ }
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "code",
164
+ "source": [],
165
+ "metadata": {
166
+ "id": "HdQonYRGFsgq"
167
+ },
168
+ "execution_count": null,
169
+ "outputs": []
170
+ }
171
+ ]
172
+ }
sample.pdf ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0
- 1|�.�ƞЦ8����:�9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ %PDF-1.3
2
+ 3 0 obj
3
+ <</Type /Page
4
+ /Parent 1 0 R
5
+ /Resources 2 0 R
6
+ /Contents 4 0 R>>
7
+ endobj
8
+ 4 0 obj
9
+ <</Filter /FlateDecode /Length 172>>
10
+ stream
11
+ x�U��
12
+ �0D�~���&�z��h�T��+i���i������>f
13
+ �6��
14
+ �椱�HȂ��nq��$+��HJA[l�} l�����s�$ٝ~��"�����cp��v�,�Թ>�����n@��хѻ)k��6�{����[]�W俄�ʥD�p�KL�7�>�hK�
15
+ endstream
16
+ endobj
17
+ 5 0 obj
18
+ <</Type /Page
19
+ /Parent 1 0 R
20
+ /Resources 2 0 R
21
+ /Contents 6 0 R>>
22
+ endobj
23
+ 6 0 obj
24
+ <</Filter /FlateDecode /Length 171>>
25
+ stream
26
+ x�U���0Ew�⎺��
27
+ endstream
28
+ endobj
29
+ 7 0 obj
30
+ <</Type /Page
31
+ /Parent 1 0 R
32
+ /Resources 2 0 R
33
+ /Contents 8 0 R>>
34
+ endobj
35
+ 8 0 obj
36
+ <</Filter /FlateDecode /Length 150>>
37
+ stream
38
+ x�Uͱ�0����Q��BH�j���i�\�/�@K��������._�r\2)J�%;�
39
+ *R�8�̧*�PtU
40
+ �a��s?���O�Jȍ���6��0��^#%?P�`�2\�������L�I
41
+ endstream
42
+ endobj
43
+ 9 0 obj
44
+ <</Type /Page
45
+ /Parent 1 0 R
46
+ /Resources 2 0 R
47
+ /Contents 10 0 R>>
48
+ endobj
49
+ 10 0 obj
50
+ <</Filter /FlateDecode /Length 144>>
51
+ stream
52
+ x�U�A
53
+ �0D�=�,$&-%d+*�:M�_b*����7mq�b6o�05����T,�gU )a{��%��6���cs$�Ff���o5F�պ$�18t��;L~]�"��H��;��/�x���/q�� �.;P��Q��/`k7j
54
+ endstream
55
+ endobj
56
+ 11 0 obj
57
+ <</Type /Page
58
+ /Parent 1 0 R
59
+ /Resources 2 0 R
60
+ /Contents 12 0 R>>
61
+ endobj
62
+ 12 0 obj
63
+ <</Filter /FlateDecode /Length 169>>
64
+ stream
65
+ x�U�A�0�ὧxKMLmiHe�х�^����Ch�x{C��y�&��F��`ڜ,W� )a+\�<i%TS��X��[�b�v����>�N?\B~�e�����&�4�b�S��eZƮ�#y4�j�TU�R �E��@ch��CO~��������Cr�P�|A�D�
66
+ endstream
67
+ endobj
68
+ 1 0 obj
69
+ <</Type /Pages
70
+ /Kids [3 0 R 5 0 R 7 0 R 9 0 R 11 0 R ]
71
+ /Count 5
72
+ /MediaBox [0 0 595.28 841.89]
73
+ >>
74
+ endobj
75
+ 13 0 obj
76
+ <</Type /Font
77
+ /BaseFont /Helvetica
78
+ /Subtype /Type1
79
+ /Encoding /WinAnsiEncoding
80
+ >>
81
+ endobj
82
+ 2 0 obj
83
+ <<
84
+ /ProcSet [/PDF /Text /ImageB /ImageC /ImageI]
85
+ /Font <<
86
+ /F1 13 0 R
87
+ >>
88
+ /XObject <<
89
+ >>
90
+ >>
91
+ endobj
92
+ 14 0 obj
93
+ <<
94
+ /Producer (PyFPDF 1.7.2 http://pyfpdf.googlecode.com/)
95
+ /CreationDate (D:20240607142530)
96
+ >>
97
+ endobj
98
+ 15 0 obj
99
+ <<
100
+ /Type /Catalog
101
+ /Pages 1 0 R
102
+ /OpenAction [3 0 R /FitH null]
103
+ /PageLayout /OneColumn
104
+ >>
105
+ endobj
106
+ xref
107
+ 0 16
108
+ 0000000000 65535 f
109
+ 0000001560 00000 n
110
+ 0000001769 00000 n
111
+ 0000000009 00000 n
112
+ 0000000087 00000 n
113
+ 0000000329 00000 n
114
+ 0000000407 00000 n
115
+ 0000000648 00000 n
116
+ 0000000726 00000 n
117
+ 0000000946 00000 n
118
+ 0000001025 00000 n
119
+ 0000001240 00000 n
120
+ 0000001320 00000 n
121
+ 0000001672 00000 n
122
+ 0000001874 00000 n
123
+ 0000001984 00000 n
124
+ trailer
125
+ <<
126
+ /Size 16
127
+ /Root 15 0 R
128
+ /Info 14 0 R
129
+ >>
130
+ startxref
131
+ 2088
132
+ %%EOF