kolkata97 committed on
Commit
9a73f6b
1 Parent(s): 01fe2f1

Upload pipeline.ipynb

Browse files
Files changed (1) hide show
  1. pipeline.ipynb +223 -0
pipeline.ipynb ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "source": [
6
+ "!pip install transformers"
7
+ ],
8
+ "metadata": {
9
+ "id": "IXN1_J6XaxjE"
10
+ },
11
+ "execution_count": null,
12
+ "outputs": []
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "source": [
17
+ "from google.colab import drive\n",
18
+ "drive.mount('/content/drive')"
19
+ ],
20
+ "metadata": {
21
+ "id": "Yrk5YRdocPxT"
22
+ },
23
+ "execution_count": null,
24
+ "outputs": []
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "source": [
29
+ "from transformers import pipeline"
30
+ ],
31
+ "metadata": {
32
+ "id": "hVj_fy49cRdn"
33
+ },
34
+ "execution_count": null,
35
+ "outputs": []
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "source": [
40
+ "import re\n",
41
+ "import csv\n",
42
+ "import nltk"
43
+ ],
44
+ "metadata": {
45
+ "id": "lGei3TOqb17d"
46
+ },
47
+ "execution_count": null,
48
+ "outputs": []
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "source": [
53
+ "# Download the sentence tokenizer model\n",
54
+ "nltk.download('punkt')"
55
+ ],
56
+ "metadata": {
57
+ "id": "il7G8A6Lb15P"
58
+ },
59
+ "execution_count": null,
60
+ "outputs": []
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "source": [
65
+ "!touch segmented-text.csv"
66
+ ],
67
+ "metadata": {
68
+ "id": "b53mYmADb12-"
69
+ },
70
+ "execution_count": null,
71
+ "outputs": []
72
+ },
73
+ {
74
+ "cell_type": "code",
75
+ "source": [
76
+ "contract_file_path = \"/content/filename.txt\" # replace with the path to the file to analyze\n",
77
+ "output_csv_file = \"/content/segmented-text.csv\""
78
+ ],
79
+ "metadata": {
80
+ "id": "W2Jvce15b10n"
81
+ },
82
+ "execution_count": null,
83
+ "outputs": []
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "source": [
88
+ "def textsegmentation():\n",
89
+ " # Read the contract text from the file\n",
90
+ " with open(contract_file_path, 'r') as file:\n",
91
+ " contract_text = file.read()\n",
92
+ "\n",
93
+ " # Tokenize the contract text into sentences\n",
94
+ " sentences = nltk.sent_tokenize(contract_text)\n",
95
+ "\n",
96
+ " # Prepare data for CSV\n",
97
+ " data = [(i+1, sentence) for i, sentence in enumerate(sentences)]\n",
98
+ "\n",
99
+ " # Write the data to CSV file\n",
100
+ " with open(output_csv_file, 'w', newline='', encoding='utf-8') as file:\n",
101
+ " writer = csv.writer(file)\n",
102
+ " writer.writerow(['Sentence ID', 'Sentence Text']) # Write header\n",
103
+ " writer.writerows(data)\n",
104
+ "\n",
105
+ " print(\"Output saved to CSV file.\")"
106
+ ],
107
+ "metadata": {
108
+ "id": "2-fUomgsb1yd"
109
+ },
110
+ "execution_count": null,
111
+ "outputs": []
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "source": [
116
+ "textsegmentation()"
117
+ ],
118
+ "metadata": {
119
+ "id": "0gYk3U3ob1vF"
120
+ },
121
+ "execution_count": null,
122
+ "outputs": []
123
+ },
124
+ {
125
+ "cell_type": "code",
126
+ "source": [
127
+ "def csv_to_sentences(output_csv_file):\n",
128
+ " new_sentences = []\n",
129
+ "\n",
130
+ " # Read the CSV file and extract sentences\n",
131
+ " with open(output_csv_file, 'r', newline='', encoding='utf-8') as file:\n",
132
+ " csv_reader = csv.reader(file)\n",
133
+ " next(csv_reader)\n",
134
+ "\n",
135
+ " for row in csv_reader:\n",
136
+ " if len(row) > 1:\n",
137
+ " sentence = str(row[1])\n",
138
+ " new_sentences.append(sentence)\n",
139
+ "\n",
140
+ " return new_sentences\n",
141
+ "\n",
142
+ "# Convert the CSV file to a list of sentences\n",
143
+ "sentences_list = csv_to_sentences(output_csv_file)"
144
+ ],
145
+ "metadata": {
146
+ "id": "2HzwyD0Jb1os"
147
+ },
148
+ "execution_count": null,
149
+ "outputs": []
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "source": [
154
+ "def few_shot_pe_llm_0():\n",
155
+ " pipe = pipeline(\"text-classification\", model=\"kolkata97/autotrain-pe-llm-0\")\n",
156
+ "\n",
157
+ " predicted_categories = []\n",
158
+ "\n",
159
+ " for sentence in sentences_list:\n",
160
+ " results = pipe(sentence)\n",
161
+ " predicted_category = results[0]['label']\n",
162
+ " predicted_categories.append(predicted_category)\n",
163
+ "\n",
164
+ " # Append the predicted categories to the CSV file\n",
165
+ " with open(output_csv_file, 'r', newline='', encoding='utf-8') as file:\n",
166
+ " csv_reader = csv.reader(file)\n",
167
+ " rows = list(csv_reader)\n",
168
+ "\n",
169
+ " # Add the predicted categories to each row\n",
170
+ " for i, row in enumerate(rows[1:], start=0): # Skip the header row\n",
171
+ " row.append(predicted_categories[i])\n",
172
+ "\n",
173
+ " # Write the updated data back to the CSV file\n",
174
+ " with open(output_csv_file, 'w', newline='', encoding='utf-8') as file:\n",
175
+ " writer = csv.writer(file)\n",
176
+ " writer.writerows(rows)\n",
177
+ "\n",
178
+ " print(\"Predicted categories appended to the CSV file.\")"
179
+ ],
180
+ "metadata": {
181
+ "id": "etzKlbaybyaC"
182
+ },
183
+ "execution_count": null,
184
+ "outputs": []
185
+ },
186
+ {
187
+ "cell_type": "code",
188
+ "source": [
189
+ "few_shot_pe_llm_0()"
190
+ ],
191
+ "metadata": {
192
+ "id": "mu1XkvXEbwit"
193
+ },
194
+ "execution_count": null,
195
+ "outputs": []
196
+ }
197
+ ],
198
+ "metadata": {
199
+ "kernelspec": {
200
+ "display_name": "Python 3",
201
+ "language": "python",
202
+ "name": "python3"
203
+ },
204
+ "language_info": {
205
+ "codemirror_mode": {
206
+ "name": "ipython",
207
+ "version": 3
208
+ },
209
+ "file_extension": ".py",
210
+ "mimetype": "text/x-python",
211
+ "name": "python",
212
+ "nbconvert_exporter": "python",
213
+ "pygments_lexer": "ipython3",
214
+ "version": "3.9.13"
215
+ },
216
+ "orig_nbformat": 4,
217
+ "colab": {
218
+ "provenance": []
219
+ }
220
+ },
221
+ "nbformat": 4,
222
+ "nbformat_minor": 0
223
+ }