kolkata97
/

pe-llm-0

+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "source": [
+        "# Use the %pip magic so the package is installed into the environment of the running kernel.\n",
+        "%pip install transformers"
+      ],
+      "metadata": {
+        "id": "IXN1_J6XaxjE"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Mount Google Drive at /content/drive.\n",
+        "# NOTE(review): the paths configured in this notebook point under /content, not /content/drive - confirm the mount is needed.\n",
+        "from google.colab import drive\n",
+        "drive.mount('/content/drive')"
+      ],
+      "metadata": {
+        "id": "Yrk5YRdocPxT"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from transformers import pipeline"
+      ],
+      "metadata": {
+        "id": "hVj_fy49cRdn"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import re\n",
+        "import csv\n",
+        "import nltk"
+      ],
+      "metadata": {
+        "id": "lGei3TOqb17d"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Download the sentence tokenizer data used by nltk.sent_tokenize.\n",
+        "# Recent NLTK releases load the 'punkt_tab' resource instead of the pickled 'punkt' model,\n",
+        "# so request both to keep the notebook working across NLTK versions.\n",
+        "for resource in ('punkt', 'punkt_tab'):\n",
+        "    nltk.download(resource)"
+      ],
+      "metadata": {
+        "id": "il7G8A6Lb15P"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Pre-create an empty segmented-text.csv in the current working directory.\n",
+        "# NOTE(review): textsegmentation() opens its output path with mode 'w', which creates the file itself - this step looks redundant; confirm.\n",
+        "!touch segmented-text.csv"
+      ],
+      "metadata": {
+        "id": "b53mYmADb12-"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Input: text file that textsegmentation() reads and splits into sentences.\n",
+        "contract_file_path = \"/content/filename.txt\" # change this to the path of the file to analyze\n",
+        "# Output: CSV written by textsegmentation() and rewritten by few_shot_pe_llm_0().\n",
+        "output_csv_file = \"/content/segmented-text.csv\""
+      ],
+      "metadata": {
+        "id": "W2Jvce15b10n"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def textsegmentation():\n",
+        "    \"\"\"Split the contract text into sentences and save them to a CSV file.\n",
+        "\n",
+        "    Reads the module-level ``contract_file_path``, tokenizes its contents with\n",
+        "    ``nltk.sent_tokenize`` and overwrites the module-level ``output_csv_file``\n",
+        "    with a header row followed by one ``(Sentence ID, Sentence Text)`` row per\n",
+        "    sentence. Sentence IDs start at 1.\n",
+        "\n",
+        "    Raises:\n",
+        "        OSError: if the input cannot be read or the output cannot be written.\n",
+        "    \"\"\"\n",
+        "    # Read as UTF-8 explicitly so decoding does not depend on the platform\n",
+        "    # default and matches the encoding used for the CSV output below.\n",
+        "    with open(contract_file_path, 'r', encoding='utf-8') as file:\n",
+        "        contract_text = file.read()\n",
+        "\n",
+        "    # Tokenize the contract text into sentences\n",
+        "    sentences = nltk.sent_tokenize(contract_text)\n",
+        "\n",
+        "    # Write the header followed by the 1-based numbered sentences\n",
+        "    with open(output_csv_file, 'w', newline='', encoding='utf-8') as file:\n",
+        "        writer = csv.writer(file)\n",
+        "        writer.writerow(['Sentence ID', 'Sentence Text'])  # Write header\n",
+        "        writer.writerows(enumerate(sentences, start=1))\n",
+        "\n",
+        "    print(\"Output saved to CSV file.\")"
+      ],
+      "metadata": {
+        "id": "2-fUomgsb1yd"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Run the segmentation step: reads contract_file_path and writes output_csv_file.\n",
+        "textsegmentation()"
+      ],
+      "metadata": {
+        "id": "0gYk3U3ob1vF"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def csv_to_sentences(output_csv_file):\n",
+        "    \"\"\"Load the sentence texts stored in the second column of a CSV file.\n",
+        "\n",
+        "    Args:\n",
+        "        output_csv_file: Path to a CSV whose first row is a header and whose\n",
+        "            data rows hold the sentence text in the second column.\n",
+        "\n",
+        "    Returns:\n",
+        "        list[str]: The second-column value of every data row that has at\n",
+        "        least two columns, in file order. An empty file yields an empty list.\n",
+        "    \"\"\"\n",
+        "    with open(output_csv_file, 'r', newline='', encoding='utf-8') as file:\n",
+        "        csv_reader = csv.reader(file)\n",
+        "        # Skip the header; the default keeps an empty file from raising StopIteration.\n",
+        "        next(csv_reader, None)\n",
+        "        # Rows with fewer than two columns carry no sentence text and are ignored.\n",
+        "        return [row[1] for row in csv_reader if len(row) > 1]\n",
+        "\n",
+        "# Convert the CSV file to a list of sentences.\n",
+        "# The result is kept in the module-level sentences_list, which few_shot_pe_llm_0() reads.\n",
+        "sentences_list = csv_to_sentences(output_csv_file)"
+      ],
+      "metadata": {
+        "id": "2HzwyD0Jb1os"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def few_shot_pe_llm_0():\n",
+        "    \"\"\"Classify each segmented sentence and add the labels to the CSV file.\n",
+        "\n",
+        "    Runs the ``kolkata97/autotrain-pe-llm-0`` text-classification pipeline on\n",
+        "    every entry of the module-level ``sentences_list`` and rewrites the\n",
+        "    module-level ``output_csv_file`` with one extra column, headed\n",
+        "    ``Predicted Category``, holding the top label for each sentence row.\n",
+        "    Every call appends a new column, so re-running without regenerating the\n",
+        "    CSV adds a second prediction column.\n",
+        "\n",
+        "    Raises:\n",
+        "        ValueError: if the number of sentence rows in the CSV differs from\n",
+        "            ``len(sentences_list)``; the CSV is left untouched in that case.\n",
+        "    \"\"\"\n",
+        "    # Load the current CSV contents first so a mismatch is detected before\n",
+        "    # the model is loaded and run.\n",
+        "    with open(output_csv_file, 'r', newline='', encoding='utf-8') as file:\n",
+        "        rows = list(csv.reader(file))\n",
+        "\n",
+        "    header_rows, data_rows = rows[:1], rows[1:]\n",
+        "\n",
+        "    # csv_to_sentences() keeps only rows with at least two columns, so the\n",
+        "    # predictions correspond to exactly those rows, in order.\n",
+        "    sentence_rows = [row for row in data_rows if len(row) > 1]\n",
+        "    if len(sentence_rows) != len(sentences_list):\n",
+        "        raise ValueError(\n",
+        "            f\"CSV has {len(sentence_rows)} sentence rows but sentences_list has \"\n",
+        "            f\"{len(sentences_list)} entries; re-run csv_to_sentences() first.\"\n",
+        "        )\n",
+        "\n",
+        "    pipe = pipeline(\"text-classification\", model=\"kolkata97/autotrain-pe-llm-0\")\n",
+        "\n",
+        "    # truncation=True stops an over-long sentence from aborting the whole run.\n",
+        "    predicted_categories = [\n",
+        "        pipe(sentence, truncation=True)[0]['label'] for sentence in sentences_list\n",
+        "    ]\n",
+        "\n",
+        "    # Name the new column in the header row (absent only if the file is empty).\n",
+        "    for row in header_rows:\n",
+        "        row.append('Predicted Category')\n",
+        "\n",
+        "    # Add the predicted category to each sentence row\n",
+        "    for row, category in zip(sentence_rows, predicted_categories):\n",
+        "        row.append(category)\n",
+        "\n",
+        "    # Write the updated data back to the CSV file\n",
+        "    with open(output_csv_file, 'w', newline='', encoding='utf-8') as file:\n",
+        "        csv.writer(file).writerows(rows)\n",
+        "\n",
+        "    print(\"Predicted categories appended to the CSV file.\")"
+      ],
+      "metadata": {
+        "id": "etzKlbaybyaC"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Run the classification step: labels every entry of sentences_list and rewrites output_csv_file.\n",
+        "few_shot_pe_llm_0()"
+      ],
+      "metadata": {
+        "id": "mu1XkvXEbwit"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.9.13"
+    },
+    "orig_nbformat": 4,
+    "colab": {
+      "provenance": []
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}