|
import pandas as pd |
|
import chromadb |
|
from sklearn.model_selection import train_test_split |
|
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments, pipeline |
|
import gradio as gr |
|
import email |
|
|
|
|
|
# Load the e-mail dataset from 'emails.csv' (path is relative to the working
# directory). The 'message' column of this frame is read further down.
emails = pd.read_csv(filepath_or_buffer='emails.csv')
|
|
|
def preprocess_email_content(raw_email):
    """Extract and flatten the body text of a raw RFC 822 e-mail string.

    The message is parsed with the standard ``email`` package. For a
    single-part message the payload string is used directly. For a multipart
    message ``get_payload()`` would return a list of sub-messages, so the
    payloads of all ``text/*`` leaf parts are concatenated instead.

    The body is then cleaned by deleting every ``\\n`` and ``\\r`` (no space
    is inserted in their place), deleting the literal quote marker
    ``"> >>> > >"``, and stripping surrounding whitespace.

    Args:
        raw_email: Full message text (headers plus body). Non-string values,
            such as ``None`` or the NaN pandas uses for missing cells, yield
            an empty string instead of raising.

    Returns:
        The cleaned body as a single-line string; ``""`` when there is no
        usable text.
    """
    if not isinstance(raw_email, str):
        return ""

    parsed = email.message_from_string(raw_email)

    if parsed.is_multipart():
        # walk() yields the container parts as well; keep only textual leaves
        # so that attachments (e.g. base64 blobs) do not end up in the text.
        pieces = [
            part.get_payload()
            for part in parsed.walk()
            if not part.is_multipart() and part.get_content_maintype() == "text"
        ]
        body = "".join(piece for piece in pieces if isinstance(piece, str))
    else:
        body = parsed.get_payload()
        if not isinstance(body, str):
            body = ""

    return body.replace("\n", "").replace("\r", "").replace("> >>> > >", "").strip()
|
|
|
# Run the body-cleaning helper over every entry of the 'message' column.
content_text = list(map(preprocess_email_content, emails['message']))

# Keep only a very small fraction (train_size=0.00005) of the cleaned
# messages; the remainder returned by the split is discarded into `_`.
train_content, _ = train_test_split(content_text, train_size=0.00005)
|
|
|
|
|
# Create a Chroma client and a collection that holds the sampled messages.
# NOTE(review): `collection` is not referenced again in the visible code.
client = chromadb.Client()
collection = client.create_collection(name="Enron_emails")

# Each document gets a 1-based identifier: 'id1', 'id2', ...
document_ids = [f'id{number}' for number in range(1, len(train_content) + 1)]
collection.add(documents=train_content, ids=document_ids)
|
|
|
|
|
# Lazily populated by load_model(); all three stay None until the first call.
tokenizer = model = text_gen = None
|
|
|
def load_model():
    """Lazily build the shared GPT-2 tokenizer, model and generation pipeline.

    On first use the tokenizer and model are loaded from
    ``./fine_tuned_model``, ``[PAD]`` is registered as the padding token, and
    both are wrapped in a ``"text-generation"`` pipeline. The results are
    stored in the module-level ``tokenizer``, ``model`` and ``text_gen``
    globals. Once all three are set, further calls return immediately.

    Any exception raised while loading propagates to the caller. In that case
    the globals are left untouched, so the next call retries the whole load.
    """
    global tokenizer, model, text_gen

    if tokenizer is not None and model is not None and text_gen is not None:
        return

    # Build everything in locals first and publish at the end. Otherwise a
    # failure part-way through would leave `model` set but `text_gen` None,
    # and every later call would skip loading.
    loaded_tokenizer = GPT2Tokenizer.from_pretrained('./fine_tuned_model')
    loaded_model = GPT2LMHeadModel.from_pretrained('./fine_tuned_model')

    loaded_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    # If '[PAD]' was new to the vocabulary, its id lies past the end of the
    # embedding matrix. Grow the matrix (never shrink it) to match.
    vocab_size = len(loaded_tokenizer)
    if vocab_size > loaded_model.get_input_embeddings().num_embeddings:
        loaded_model.resize_token_embeddings(vocab_size)

    loaded_pipeline = pipeline(
        "text-generation", model=loaded_model, tokenizer=loaded_tokenizer
    )

    tokenizer, model, text_gen = loaded_tokenizer, loaded_model, loaded_pipeline
|
|
|
def question_answer(question):
    """Generate a text continuation for ``question`` with the loaded model.

    Ensures the model is loaded via ``load_model()``, asks the generation
    pipeline for a single sequence with ``max_length=200``, and returns the
    generated text without the echoed prompt.

    Args:
        question: The prompt text entered by the user.

    Returns:
        The generated text with the leading copy of ``question`` removed and
        surrounding whitespace stripped. If generation fails, a string of the
        form ``"Error in generating response: <message>"`` is returned.
        Errors raised by ``load_model()`` itself are not caught here.
    """
    load_model()

    try:
        outputs = text_gen(question, max_length=200, num_return_sequences=1)
        answer = outputs[0]['generated_text']

        # Remove only the prompt echoed at the start of the output. A global
        # str.replace would also delete every later occurrence of the prompt
        # text inside the generated answer.
        if question and answer.startswith(question):
            answer = answer[len(question):]

        return answer.strip()
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error in generating response: {e}"
|
|
|
# Gradio front end: one text box in, one text box out, backed by
# question_answer().
iface = gr.Interface(
    title="Answering questions about the Enron case.",
    description="Ask a question about the Enron case!",
    examples=["What is Enron?"],
    inputs="text",
    outputs="text",
    fn=question_answer,
)

# Start the web UI as soon as this module is executed.
iface.launch()