{
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "!pip install transformers"
      ],
      "metadata": {
        "id": "IXN1_J6XaxjE"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from google.colab import drive\n",
        "drive.mount('/content/drive')"
      ],
      "metadata": {
        "id": "Yrk5YRdocPxT"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from transformers import pipeline"
      ],
      "metadata": {
        "id": "hVj_fy49cRdn"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import re\n",
        "import csv\n",
        "import nltk"
      ],
      "metadata": {
        "id": "lGei3TOqb17d"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Download the sentence tokenizer model\n",
        "nltk.download('punkt')"
      ],
      "metadata": {
        "id": "il7G8A6Lb15P"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!touch segmented-text.csv"
      ],
      "metadata": {
        "id": "b53mYmADb12-"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "contract_file_path = \"/content/filename.txt\" #change with path to file to analyze\n",
        "output_csv_file = \"/content/segmented-text.csv\""
      ],
      "metadata": {
        "id": "W2Jvce15b10n"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def textsegmentation():\n",
        "    # Read the contract text from the file\n",
        "    with open(contract_file_path, 'r') as file:\n",
        "        contract_text = file.read()\n",
        "\n",
        "    # Tokenize the contract text into sentences\n",
        "    sentences = nltk.sent_tokenize(contract_text)\n",
        "\n",
        "    # Prepare data for CSV\n",
        "    data = [(i+1, sentence) for i, sentence in enumerate(sentences)]\n",
        "\n",
        "    # Write the data to CSV file\n",
        "    with open(output_csv_file, 'w', newline='', encoding='utf-8') as file:\n",
        "        writer = csv.writer(file)\n",
        "        writer.writerow(['Sentence ID', 'Sentence Text'])  # Write header\n",
        "        writer.writerows(data)\n",
        "\n",
        "    print(\"Output saved to CSV file.\")"
      ],
      "metadata": {
        "id": "2-fUomgsb1yd"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "textsegmentation()"
      ],
      "metadata": {
        "id": "0gYk3U3ob1vF"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def csv_to_sentences(output_csv_file):\n",
        "    new_sentences = []\n",
        "\n",
        "    # Read the CSV file and extract sentences\n",
        "    with open(output_csv_file, 'r', newline='', encoding='utf-8') as file:\n",
        "        csv_reader = csv.reader(file)\n",
        "        next(csv_reader)\n",
        "\n",
        "        for row in csv_reader:\n",
        "            if len(row) > 1:\n",
        "                sentence = str(row[1])\n",
        "                new_sentences.append(sentence)\n",
        "\n",
        "    return new_sentences\n",
        "\n",
        "# Convert the CSV file to a list of sentences\n",
        "sentences_list = csv_to_sentences(output_csv_file)"
      ],
      "metadata": {
        "id": "2HzwyD0Jb1os"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def few_shot_pe_llm_0():\n",
        "    pipe = pipeline(\"text-classification\", model=\"kolkata97/autotrain-pe-llm-0\")\n",
        "\n",
        "    predicted_categories = []\n",
        "\n",
        "    for sentence in sentences_list:\n",
        "        results = pipe(sentence)\n",
        "        predicted_category = results[0]['label']\n",
        "        predicted_categories.append(predicted_category)\n",
        "\n",
        "    # Append the predicted categories to the CSV file\n",
        "    with open(output_csv_file, 'r', newline='', encoding='utf-8') as file:\n",
        "        csv_reader = csv.reader(file)\n",
        "        rows = list(csv_reader)\n",
        "\n",
        "    # Add the predicted categories to each row\n",
        "    for i, row in enumerate(rows[1:], start=0):  # Skip the header row\n",
        "        row.append(predicted_categories[i])\n",
        "\n",
        "    # Write the updated data back to the CSV file\n",
        "    with open(output_csv_file, 'w', newline='', encoding='utf-8') as file:\n",
        "        writer = csv.writer(file)\n",
        "        writer.writerows(rows)\n",
        "\n",
        "    print(\"Predicted categories appended to the CSV file.\")"
      ],
      "metadata": {
        "id": "etzKlbaybyaC"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "few_shot_pe_llm_0()"
      ],
      "metadata": {
        "id": "mu1XkvXEbwit"
      },
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.13"
    },
    "orig_nbformat": 4,
    "colab": {
      "provenance": []
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}