{ "cells": [ { "cell_type": "code", "source": [ "!pip install transformers" ], "metadata": { "id": "IXN1_J6XaxjE" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ], "metadata": { "id": "Yrk5YRdocPxT" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from transformers import pipeline" ], "metadata": { "id": "hVj_fy49cRdn" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import re\n", "import csv\n", "import nltk" ], "metadata": { "id": "lGei3TOqb17d" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Download the sentence tokenizer model\n", "nltk.download('punkt')" ], "metadata": { "id": "il7G8A6Lb15P" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!touch segmented-text.csv" ], "metadata": { "id": "b53mYmADb12-" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "contract_file_path = \"/content/filename.txt\" #change with path to file to analyze\n", "output_csv_file = \"/content/segmented-text.csv\"" ], "metadata": { "id": "W2Jvce15b10n" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "def textsegmentation():\n", " # Read the contract text from the file\n", " with open(contract_file_path, 'r') as file:\n", " contract_text = file.read()\n", "\n", " # Tokenize the contract text into sentences\n", " sentences = nltk.sent_tokenize(contract_text)\n", "\n", " # Prepare data for CSV\n", " data = [(i+1, sentence) for i, sentence in enumerate(sentences)]\n", "\n", " # Write the data to CSV file\n", " with open(output_csv_file, 'w', newline='', encoding='utf-8') as file:\n", " writer = csv.writer(file)\n", " writer.writerow(['Sentence ID', 'Sentence Text']) # Write header\n", " writer.writerows(data)\n", "\n", " print(\"Output saved to CSV file.\")" ], "metadata": { "id": "2-fUomgsb1yd" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "textsegmentation()" ], "metadata": { "id": "0gYk3U3ob1vF" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "def csv_to_sentences(output_csv_file):\n", " new_sentences = []\n", "\n", " # Read the CSV file and extract sentences\n", " with open(output_csv_file, 'r', newline='', encoding='utf-8') as file:\n", " csv_reader = csv.reader(file)\n", " next(csv_reader)\n", "\n", " for row in csv_reader:\n", " if len(row) > 1:\n", " sentence = str(row[1])\n", " new_sentences.append(sentence)\n", "\n", " return new_sentences\n", "\n", "# Convert the CSV file to a list of sentences\n", "sentences_list = csv_to_sentences(output_csv_file)" ], "metadata": { "id": "2HzwyD0Jb1os" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "def few_shot_pe_llm_0():\n", " pipe = pipeline(\"text-classification\", model=\"kolkata97/autotrain-pe-llm-0\")\n", "\n", " predicted_categories = []\n", "\n", " for sentence in sentences_list:\n", " results = pipe(sentence)\n", " predicted_category = results[0]['label']\n", " predicted_categories.append(predicted_category)\n", "\n", " # Append the predicted categories to the CSV file\n", " with open(output_csv_file, 'r', newline='', encoding='utf-8') as file:\n", " csv_reader = csv.reader(file)\n", " rows = list(csv_reader)\n", "\n", " # Add the predicted categories to each row\n", " for i, row in enumerate(rows[1:], start=0): # Skip the header row\n", " row.append(predicted_categories[i])\n", "\n", " # Write the updated data back to the CSV file\n", " with open(output_csv_file, 'w', newline='', encoding='utf-8') as file:\n", " writer = csv.writer(file)\n", " writer.writerows(rows)\n", "\n", " print(\"Predicted categories appended to the CSV file.\")" ], "metadata": { "id": "etzKlbaybyaC" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "few_shot_pe_llm_0()" ], "metadata": { "id": "mu1XkvXEbwit" }, "execution_count": null, "outputs": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13" }, "orig_nbformat": 4, "colab": { "provenance": [] } }, "nbformat": 4, "nbformat_minor": 0 }