AI_text_detector_01

Running

File size: 6,471 Bytes

#Import and install libraries

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
import tensorflow_text as tf_text
import tensorflow_hub as hub
import gradio as gr


#Import model
model = tf.keras.models.load_model('detect_LLM_colab_method2')


#Function that predicts for a given text
def LLM_text(Text):
  prediction = model.predict([Text])
  output_text = 'Your text is {}% likely to be a LLM generated.'.format(round(prediction[0][0]*100, 2))

  return output_text



project_heading = '<html><head><meta content="text/html; charset=UTF-8" http-equiv="content-type"><style type="text/css">ol{margin:0;padding:0}table td,table th{padding:0}.c1{-webkit-text-decoration-skip:none;color:#000000;font-weight:700;text-decoration:underline;vertical-align:baseline;text-decoration-skip-ink:none;font-size:26pt;font-family:"Arial";font-style:normal}.c2{padding-top:0pt;padding-bottom:3pt;line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:center}.c0{background-color:#ffffff;max-width:468pt;padding:72pt 72pt 72pt 72pt}.title{padding-top:0pt;color:#000000;font-size:26pt;padding-bottom:3pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}.subtitle{padding-top:0pt;color:#666666;font-size:15pt;padding-bottom:16pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}li{color:#000000;font-size:11pt;font-family:"Arial"}p{margin:0;color:#000000;font-size:11pt;font-family:"Arial"}h1{padding-top:20pt;color:#000000;font-size:20pt;padding-bottom:6pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h2{padding-top:18pt;color:#000000;font-size:16pt;padding-bottom:6pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h3{padding-top:16pt;color:#434343;font-size:14pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h4{padding-top:14pt;color:#666666;font-size:12pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h5{padding-top:12pt;color:#666666;font-size:11pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h6{padding-top:12pt;color:#666666;font-size:11pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;font-style:italic;orphans:2;widows:2;text-align:left}</style></head><body class="c0 doc-content"><p class="c2 title" id="h.gqbolkoo61b7"><span class="c1">AI-generated Text Detection Engine</span></p></body></html>'

project_details_HTML = '''<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Project Summary</title>
  <style>
    body {
      font-family: Arial, sans-serif;
      line-height: 1.6;
      margin: 20px;
    }

    h2 {
      color: #1155cc;
    }

    p {
      margin-bottom: 10px;
    }

    ul {
      list-style-type: none;
      padding: 0;
    }
  </style>
</head>
<body>

<h2>Project Summary:</h2>
<p>
  This project aims to develop a model for distinguishing between human-generated and language model-generated text, with evaluation based on the area under the ROC curve. We utilised the BERT encoder for text embeddings and incorporated a custom architecture featuring a Multi-Head Attention layer. Challenges included handling overfitting the training dataset and handling of long text sequences. These obstacles were overcome through careful architectural design, troubleshooting, sequential processing of text input, and leveraging the pre-trained BERT embeddings. The model achieved promising results, demonstrating its effectiveness in discerning language model-generated text.
</p>

<p>
  For source code, please refer: <a href="https://github.com/yugamjayant/sept_2023_jh/blob/main/AI_text_detection_d06-transfer-learning-model-03-data-01.ipynb">https://github.com/yugamjayant/sept_2023_jh/blob/main/AI_text_detection_d06-transfer-learning-model-03-data-01.ipynb</a>
</p>

<h2>Data Set:</h2>
<p>
  The model was trained on, <a href="https://www.kaggle.com/datasets/carlmcbrideellis/llm-7-prompt-training-dataset">https://www.kaggle.com/datasets/carlmcbrideellis/llm-7-prompt-training-dataset</a>, which encompasses human and AI-generated essays on the following topics,
</p>

<ul>
  <li>"Car-free cities"</li>
  <li>"Does the electoral college work?"</li>
  <li>"Exploring Venus"</li>
  <li>"The Face on Mars"</li>
  <li>"Facial action coding system"</li>
  <li>"A Cowboy Who Rode the Waves"</li>
  <li>"Driverless cars"</li>
</ul>

<h2>Model Architecture:</h2>
<p>
  Used transfer learning to build a Neural Network on top of BERT, the model had 23,63,137 trainable parameters, where BERT's parameters were not trainable.
</p>

<p>
  Refer to the image below for more architectural details:
</p>

<figure>
  <img src="https://huggingface.co/spaces/yugamj/AI_text_detector_01/resolve/main/model.png" alt="AI text detection model Architecture" style="width: 444.74px; height: 664.50px; border: 2.67px solid #000000; margin-left: 0.00px; margin-top: 0.00px; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px);">>
  <figcaption>Fig: AI text detection model Architecture</figcaption>
</figure>

</body>
</html>
'''

with gr.Blocks() as demo:
    gr.HTML(project_heading)
    with gr.Row():
        inp = gr.Textbox(label = 'Enter your text here!')
        out = gr.Textbox(label = 'Prediction')
    btn = gr.Button("Run")
    btn.click(fn=LLM_text, inputs=inp, outputs=out)
    with gr.Row():
        #gr.Markdown("Our project aimed to develop a model for distinguishing between human-generated and language model-generated text, with evaluation based on the area under the ROC curve. We utilized the BERT encoder for text embeddings and incorporated a custom architecture featuring a Multi-Head Attention layer. Challenges included handling input preprocessing, addressing model loading issues, and optimizing for ROC AUC on the test set. These obstacles were overcome through careful architectural design, troubleshooting, and leveraging pre-trained BERT embeddings. The model achieved promising results, demonstrating its effectiveness in discerning language model-generated text.")
        gr.HTML(project_details_HTML)
demo.launch()