Spaces:
Running
Running
Upload 9 files
Browse files- README.md +4 -4
- app.py +55 -0
- dataTST.py +63 -0
- feature_ref_for_test.pt +3 -0
- logistic_regression_model.pkl +3 -0
- meta_train.py +131 -0
- net.pt +3 -0
- requirements.txt +6 -0
- utils_MMD.py +58 -0
README.md
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
---
|
2 |
-
title: MMD
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
sdk_version: 3.45.1
|
8 |
app_file: app.py
|
|
|
1 |
---
|
2 |
+
title: MMD-MP Text Detection
|
3 |
+
emoji: 💻
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: gray
|
6 |
sdk: gradio
|
7 |
sdk_version: 3.45.1
|
8 |
app_file: app.py
|
app.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from utils_MMD import MMD_batch2
|
3 |
+
from api_init import api_init
|
4 |
+
import gradio as gr
|
5 |
+
|
6 |
+
# Load the models, reference features and kernel parameters once at import
# time; single_instance_detection() below reads them as module-level globals.
base_model, base_tokenizer, net, feature_ref, sigma, sigma0_u, ep, loaded_model, DEVICE = api_init()
|
7 |
+
|
8 |
+
def single_instance_detection(sentence=None):
    """Classify a single piece of text as written by a human or by an AI.

    The text is tokenized to a fixed length of 100 tokens, run through
    ``base_model``, projected by ``net`` and compared against ``feature_ref``
    with ``MMD_batch2``; the resulting value is passed to ``loaded_model``.

    Args:
        sentence: Text to classify. When ``None`` a built-in sample sentence
            is used instead.

    Returns:
        ``"Human"`` when the classifier predicts 0, ``"AI"`` when it predicts
        1, and ``None`` for any other label.
    """
    if sentence is None:
        sentence = "Now that you've built a demo, you'll probably want to share it with others. Gradio demos can be shared in two ways: using a temporary share link or permanent hosting on Spaces."
    text = sentence.strip()

    with torch.no_grad():
        encoded = base_tokenizer(
            [text],
            padding='max_length',
            truncation=True,
            max_length=100,
            return_tensors="pt",
        ).to(DEVICE)
        model_out = base_model(**encoded)
        # model_out[2] holds the per-layer hidden states; keep the last one.
        last_hidden = model_out[2][-1]
        # Multiply by the attention mask so masked-out positions become zero.
        masked_hidden = last_hidden * encoded['attention_mask'].unsqueeze(-1)

        sample_feature = net(masked_hidden)
        stacked = torch.cat([feature_ref, sample_feature], dim=0)
        mmd_value = MMD_batch2(stacked, feature_ref.shape[0], 0, sigma, sigma0_u, ep, is_smooth=False).to('cpu')

        y_pred_loaded = loaded_model.predict(mmd_value.detach().numpy().reshape(-1, 1))
        print("y_pred_loaded:", y_pred_loaded)

    label_names = {0: "Human", 1: "AI"}
    return label_names.get(int(y_pred_loaded[0]))
|
32 |
+
|
33 |
+
|
34 |
+
# Text shown under the title of the demo page.
description = (
    "Please input a sentence in English to classify whether it's generated by a human or AI.\n\n"
    "For example: the texts below the box are generated by ChatGPT with a prompt: Write an essay in English titled by 'A Walk in the Woods'. "
    "You can directly click on the box or copy some texts into the input field for testing."
)

# Clickable sample inputs; each inner list holds the value for the single text input.
AI_examples = [
    [
        "The gentle rustling of leaves, the symphony of birdsong, and the dappled sunlight filtering through the towering trees create a serene tapestry of nature's beauty. A walk in the woods is a timeless and profound experience, one that invites us to disconnect from the hustle and bustle of modern life and reconnect with the natural world. It is a journey that nourishes the soul, rejuvenates the spirit, and reminds us of our deep-rooted connection to the earth."
    ],
    [
        "The woods have always held a special place in human history and imagination. They are places of mystery and enchantment, where ancient trees bear witness to centuries of change and growth. The woods are where stories of knights and dragons, fairies and gnomes, have been woven into the fabric of our culture. They are places where poets have found inspiration, artists have discovered their muse, and philosophers have contemplated the mysteries of existence."
    ],
    [
        "A walk in the woods offers a unique opportunity to escape the relentless pace of modern life. As we step onto the forest path, the cares and worries of the world seem to melt away. The woods have a way of grounding us, reminding us of our place in the grand scheme of things. In the presence of towering trees that have weathered countless storms and witnessed the passage of time, our own problems and concerns can feel insignificant, allowing us to gain a broader perspective on life."
    ],
    [
        "The sensory delights of the woods are unparalleled. The air is rich with the earthy scent of leaves and moss, and the soft, mossy ground beneath our feet cradles each step. The chorus of birdsong provides a soothing soundtrack, while the play of light and shadow through the foliage creates a mesmerizing dance of colors and patterns."
    ],
    [
        "But perhaps the most profound aspect of a walk in the woods is the opportunity for introspection and self-discovery. In the stillness of the forest, away from the distractions of screens and schedules, we have a chance to listen to the whispers of our own thoughts. It is a time for reflection, for sorting through the jumble of ideas and emotions that often crowd our minds."
    ],
]

# Build the Gradio interface: one text box in, one text label out.
iface = gr.Interface(
    title="GPT-Inspector: Text generated by Human or AI?",
    description=description,
    fn=single_instance_detection,
    inputs="text",
    outputs="text",
    examples=AI_examples,
)

# Start serving the interface.
iface.launch()
|
dataTST.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import torch
|
3 |
+
import random
|
4 |
+
from meta_train import mmdPreModel
|
5 |
+
from collections import namedtuple
|
6 |
+
import joblib
|
7 |
+
from transformers import RobertaTokenizer, RobertaModel
|
8 |
+
|
9 |
+
|
10 |
+
def api_init():
    """Load every artifact the detector needs and return them as one tuple.

    Seeds the Python, NumPy and PyTorch random generators, builds the
    ``mmdPreModel`` projection network and restores its weights and kernel
    parameters from ``net.pt``, loads the reference features and the
    classifier from disk, and loads the RoBERTa tokenizer and encoder.

    Returns:
        The tuple ``(base_model, base_tokenizer, net, feature_ref, sigma,
        sigma0_u, ep, loaded_model, DEVICE)``.
    """
    # Seed every random source so the sampled reference subset is reproducible.
    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)
    torch.cuda.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    model_name = 'roberta-base-openai-detector'
    model_path_api = '.'
    token_num, hidden_size = 100, 768

    Config = namedtuple('Config', ['in_dim', 'hid_dim', 'dropout', 'out_dim', 'token_num'])
    config = Config(
        in_dim=hidden_size,
        token_num=token_num,
        hid_dim=512,
        dropout=0.2,
        out_dim=300,
    )

    net = mmdPreModel(config=config, num_mlp=0, transformer_flag=True, num_hidden_layers=1)

    # Load the features and models.
    # NOTE: torch.load and joblib.load unpickle their input; only point these
    # paths at trusted artifact files.
    feature_ref_for_test_filename = f'{model_path_api}/feature_ref_for_test.pt'
    model_filename = f'{model_path_api}/logistic_regression_model.pkl'
    net_filename = f'{model_path_api}/net.pt'

    load_ref_data = torch.load(feature_ref_for_test_filename, map_location=torch.device('cpu'))  # cpu
    loaded_model = joblib.load(model_filename)  # cpu
    checkpoint = torch.load(net_filename, map_location=torch.device('cpu'))
    net.load_state_dict(checkpoint['net'])
    # The network is used for inference only. A freshly constructed module is
    # in training mode, which would keep its dropout layers active and make
    # predictions for the same text vary between calls; torch.no_grad() in the
    # caller does not disable dropout.
    net.eval()
    sigma, sigma0_u, ep = checkpoint['sigma'], checkpoint['sigma0_u'], checkpoint['ep']

    # Generic generative model.
    cache_dir = ".cache"
    base_tokenizer = RobertaTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
    base_model = RobertaModel.from_pretrained(model_name, output_hidden_states=True, cache_dir=cache_dir)

    # Whether to load the models onto the GPU.
    gpu_using = False

    DEVICE = torch.device("cpu")
    if gpu_using:
        DEVICE = torch.device("cuda:0")
        net = net.to(DEVICE)
        sigma, sigma0_u, ep = sigma.to(DEVICE), sigma0_u.to(DEVICE), ep.to(DEVICE)
        load_ref_data = load_ref_data.to(DEVICE)
        base_model = base_model.to(DEVICE)

    # Keep a random subset of at most num_ref reference feature rows.
    num_ref = 5000
    feature_ref = load_ref_data[np.random.permutation(load_ref_data.shape[0])][:num_ref].to(DEVICE)

    return base_model, base_tokenizer, net, feature_ref, sigma, sigma0_u, ep, loaded_model, DEVICE
|
feature_ref_for_test.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f690f4244c0b245823f847a75d1fd25a151ad96075744156e628c0c1dda65302
|
3 |
+
size 42240786
|
logistic_regression_model.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ce9afa92d745bff3bbfb4ae028db369cb35710a9099159a12ac1048fed3760e6
|
3 |
+
size 756
|
meta_train.py
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from torch import nn
|
3 |
+
import math
|
4 |
+
|
5 |
+
|
6 |
+
from pytorch_transformers.modeling_bert import(
|
7 |
+
BertEncoder,
|
8 |
+
BertPreTrainedModel,
|
9 |
+
BertConfig
|
10 |
+
)
|
11 |
+
|
12 |
+
class GeLU(nn.Module):
    """Exact, erf-based Gaussian Error Linear Unit.

    Computes ``x * 0.5 * (1 + erf(x / sqrt(2)))``.  OpenAI GPT uses a slightly
    different tanh approximation that gives slightly different results:
    ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x ** 3)))``.
    Also see https://arxiv.org/abs/1606.08415
    """

    def forward(self, x):
        """Apply the activation element-wise to ``x``."""
        gate = 1.0 + torch.erf(x / math.sqrt(2.0))
        return x * 0.5 * gate
|
24 |
+
|
25 |
+
class BertLayerNorm(nn.Module):
    """Layer normalization over the last dimension in the TF style.

    The epsilon is added to the variance inside the square root.  The result
    is scaled by the learnable ``weight`` and shifted by the learnable
    ``bias``, both of size ``hidden_size``.
    """

    def __init__(self, hidden_size, eps=1e-12):
        """Create unit scale and zero shift parameters of size ``hidden_size``."""
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift."""
        mean = x.mean(-1, keepdim=True)
        centered = x - mean
        variance = centered.pow(2).mean(-1, keepdim=True)
        normalized = centered / torch.sqrt(variance + self.variance_epsilon)
        return self.weight * normalized + self.bias
|
39 |
+
|
40 |
+
class mlp_meta(nn.Module):
    """Linear -> GeLU -> BertLayerNorm -> Dropout block of width ``config.hid_dim``."""

    def __init__(self, config):
        """Build the block from ``config.hid_dim`` and ``config.dropout``."""
        super().__init__()
        width = config.hid_dim
        layers = [
            nn.Linear(width, width),
            GeLU(),
            BertLayerNorm(width, eps=1e-12),
            nn.Dropout(config.dropout),
        ]
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the block."""
        return self.mlp(x)
|
52 |
+
|
53 |
+
class Bert_Transformer_Layer(BertPreTrainedModel):
    """A ``BertEncoder`` applied directly to already-computed feature sequences."""

    def __init__(self,fusion_config):
        """Build the encoder from ``fusion_config``, a dict of ``BertConfig`` keyword arguments."""
        bert_config = BertConfig(**fusion_config)
        super().__init__(bert_config)
        self.encoder = BertEncoder(bert_config)
        self.init_weights()

    def forward(self,input, mask=None):
        """Encode ``input`` and return ``(output, all_attention)``.

        Args:
            input: 3-D tensor unpacked as ``(batch, feats, dim)``.
            mask: Optional tensor written into columns ``1:`` of a
                ``(batch, feats)`` mask of ones, so column 0 is always kept.
                When omitted every position attends to every position.

        Returns:
            The first two elements of the encoder result.
        """
        batch, feats, _ = input.size()
        if mask is None:
            ones = torch.Tensor([1.0]).to(input.device)
            attend = ones.repeat(batch, 1, feats, feats)
        else:
            keep = torch.ones(size=(batch, feats), device=mask.device)
            keep[:, 1:] = mask
            # Outer product of the mask with itself gives a pairwise mask.
            row = keep.view(batch, 1, -1)
            attend = torch.bmm(row.transpose(1, 2), row).unsqueeze(1)

        # Turn the 0/1 mask into an additive one: 0 where kept, -10000 where masked.
        extend_mask = (1 - attend) * -10000
        assert not extend_mask.requires_grad
        head_mask = [None] * self.config.num_hidden_layers

        enc_output = self.encoder(input, extend_mask, head_mask=head_mask)
        return enc_output[0], enc_output[1]
|
86 |
+
|
87 |
+
class mmdPreModel(nn.Module):
    """Project per-token hidden states into one feature vector per sample.

    The pipeline is: optional transformer encoder, optional token-wise MLP,
    optional stack of ``mlp_meta`` blocks, then a linear layer over the
    flattened token features.
    """

    def __init__(self, config, num_mlp=0, transformer_flag=False, num_hidden_layers=1, mlp_flag=True):
        """Build the sub-modules.

        Args:
            config: Object with ``in_dim``, ``hid_dim``, ``dropout``,
                ``out_dim`` and ``token_num`` attributes.
            num_mlp: Number of extra ``mlp_meta`` blocks; none are created
                when it is 0.
            transformer_flag: Whether to create and apply the transformer
                encoder.
            num_hidden_layers: Number of layers in that encoder.
            mlp_flag: Whether ``forward`` applies the first MLP.
        """
        super().__init__()
        self.num_mlp = num_mlp
        self.transformer_flag = transformer_flag
        self.mlp_flag = mlp_flag

        self.mlp = nn.Sequential(
            nn.Linear(config.in_dim, config.hid_dim),
            GeLU(),
            BertLayerNorm(config.hid_dim, eps=1e-12),
            nn.Dropout(config.dropout),
        )
        self.fusion_config = dict(
            hidden_size=config.in_dim,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=4,
            output_attentions=True,
        )
        if num_mlp > 0:
            self.mlp2 = nn.ModuleList([mlp_meta(config) for _ in range(num_mlp)])
        if transformer_flag:
            self.transformer = Bert_Transformer_Layer(self.fusion_config)
        # Final projection over all token features flattened together.
        self.feature = nn.Linear(config.hid_dim * config.token_num, config.out_dim)

    def forward(self, features):
        """Map ``[batch, token_num, hidden_size]`` input to ``[batch, config.out_dim]``."""
        if self.transformer_flag:
            features, _ = self.transformer(features)
        if self.mlp_flag:
            features = self.mlp(features)
        if self.num_mlp > 0:
            for block in self.mlp2:
                features = block(features)

        flattened = features.view(features.shape[0], -1)
        return self.feature(flattened)
|
131 |
+
|
net.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:51eddf445c758f5d42e8f4783b1e8ecc5833db5a9bc170203478d73a761ddd6a
|
3 |
+
size 91379438
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch
|
2 |
+
transformers
|
3 |
+
numpy
|
4 |
+
joblib
|
5 |
+
pytorch_transformers
|
6 |
+
scikit-learn
|
utils_MMD.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
|
3 |
+
def Pdist2(x, y):
    """Return pairwise squared Euclidean distances between rows of x and y.

    Args:
        x: 2-D tensor with one sample per row.
        y: 2-D tensor with one sample per row, or ``None`` to compare ``x``
            with itself.

    Returns:
        Tensor of shape ``(x.shape[0], y.shape[0])``. Negative entries
        produced by rounding are set to 0.
    """
    x_sq = (x ** 2).sum(1).view(-1, 1)
    if y is None:
        y = x
        y_sq = x_sq.view(1, -1)
    else:
        y_sq = (y ** 2).sum(1).view(1, -1)

    # ||x||^2 + ||y||^2 - 2 * <x, y>
    cross = torch.mm(x, y.transpose(0, 1))
    dist = x_sq + y_sq - 2.0 * cross
    dist.masked_fill_(dist < 0, 0)
    return dist
|
14 |
+
|
15 |
+
def MMD_batch2(Fea, len_s, Fea_org, sigma, sigma0=0.1, epsilon = 10**(-10), is_smooth=True, is_var_computed=True, use_1sample_U=True, coeff_xy=2):
    """Compute an MMD-style value for each row of ``Fea`` after the first ``len_s``.

    The first ``len_s`` rows of ``Fea`` form the set X and the remaining rows
    form Y.  One value is returned per row of Y:
    ``mean(Kx) - 2 * mean_over_X(Kxy) + Ky``, where the Y self-distance is
    fixed to zero.

    Args:
        Fea: 2-D feature tensor holding X followed by Y.
        len_s: Number of leading rows that belong to X.
        Fea_org: Features split the same way as ``Fea``; only read when
            ``is_smooth`` is true.
        sigma: Bandwidth applied to the ``Fea_org`` distances; only used
            when ``is_smooth`` is true.
        sigma0: Bandwidth applied to the ``Fea`` distances.
        epsilon: Mixing weight of the smoothed kernel; only used when
            ``is_smooth`` is true.
        is_smooth: Use the smoothed kernel instead of the plain one.
        is_var_computed: Unused.
        use_1sample_U: Unused.
        coeff_xy: Unused.

    Returns:
        1-D tensor with one entry per row of Y.
    """
    X = Fea[0:len_s, :]
    Y = Fea[len_s:, :]
    L = 1  # generalized Gaussian (if L>1)

    Dxx = Pdist2(X, X)
    # The distance of each Y row to itself is taken to be zero.
    Dyy = torch.zeros(Fea.shape[0] - len_s, 1).to(Dxx.device)
    Dxy = Pdist2(X, Y).transpose(0, 1)

    if is_smooth:
        X_org = Fea_org[0:len_s, :]
        Y_org = Fea_org[len_s:, :]
        Dxx_org = Pdist2(X_org, X_org)
        Dyy_org = torch.zeros(Fea.shape[0] - len_s, 1).to(Dxx.device)
        Dxy_org = Pdist2(X_org, Y_org).transpose(0, 1)

        def smooth_kernel(dist, dist_org):
            return (1 - epsilon) * torch.exp(-(dist / sigma0) ** L - dist_org / sigma) + epsilon * torch.exp(-dist_org / sigma)

        Kx = smooth_kernel(Dxx, Dxx_org)
        Ky = smooth_kernel(Dyy, Dyy_org)
        Kxy = smooth_kernel(Dxy, Dxy_org)
    else:
        Kx = torch.exp(-Dxx / sigma0)
        Ky = torch.exp(-Dyy / sigma0)
        Kxy = torch.exp(-Dxy / sigma0)

    nx = Kx.shape[0]
    xx = Kx.sum() / (nx * nx)
    yy = Ky.reshape(-1)
    # One-sample statistic: average each Y row's kernel values over X.
    xy = Kxy.sum(dim=1) / nx

    return xx - 2 * xy + yy
|
58 |
+
|