alwayse committed on
Commit
d0e1f8b
1 Parent(s): de43646

Upload 9 files

Browse files
Files changed (9) hide show
  1. README.md +4 -4
  2. app.py +55 -0
  3. dataTST.py +63 -0
  4. feature_ref_for_test.pt +3 -0
  5. logistic_regression_model.pkl +3 -0
  6. meta_train.py +131 -0
  7. net.pt +3 -0
  8. requirements.txt +6 -0
  9. utils_MMD.py +58 -0
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: MMD MP Text Dection
3
- emoji: 🌖
4
- colorFrom: purple
5
- colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 3.45.1
8
  app_file: app.py
 
1
  ---
2
+ title: MMD-MP Text Dection
3
+ emoji: 💻
4
+ colorFrom: blue
5
+ colorTo: gray
6
  sdk: gradio
7
  sdk_version: 3.45.1
8
  app_file: app.py
app.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from utils_MMD import MMD_batch2
3
+ from api_init import api_init
4
+ import gradio as gr
5
+
6
# Build the models, tokenizer, reference features and kernel parameters once
# at import time; single_instance_detection reads these module-level names on
# every call.
base_model, base_tokenizer, net, feature_ref, sigma, sigma0_u, ep, loaded_model, DEVICE = api_init()
7
+
8
def single_instance_detection(sentence=None):
    """Label one piece of text as written by a human or generated by an AI.

    The text is tokenized to a fixed length of 100 tokens, encoded with
    ``base_model``, projected by ``net`` and compared against ``feature_ref``
    through ``MMD_batch2``. The resulting statistic is passed to
    ``loaded_model.predict`` to obtain the class.

    Args:
        sentence: Text to classify. When ``None`` a built-in sample sentence
            is used instead.

    Returns:
        ``"Human"`` when the predicted class is 0, ``"AI"`` when it is 1 and
        ``None`` for any other predicted value.
    """
    if sentence is None:
        sentence = "Now that you've built a demo, you'll probably want to share it with others. Gradio demos can be shared in two ways: using a temporary share link or permanent hosting on Spaces."
    text = sentence.strip()

    with torch.no_grad():
        encoded = base_tokenizer(
            [text],
            padding='max_length',
            truncation=True,
            max_length=100,
            return_tensors="pt",
        ).to(DEVICE)
        model_out = base_model(**encoded)
        # Element 2 of the model output is used as the sequence of hidden
        # states; only its last entry is kept.
        last_hidden = model_out[2][-1]
        # Multiply by the attention mask so positions the mask marks with 0
        # contribute all-zero vectors.
        keep = encoded['attention_mask'].unsqueeze(-1)
        masked_hidden = last_hidden * keep

        sample_feature = net(masked_hidden)
        # Reference rows come first; the sample is the single trailing row.
        stacked = torch.cat([feature_ref, sample_feature], dim=0)
        mmd_value = MMD_batch2(
            stacked, feature_ref.shape[0], 0, sigma, sigma0_u, ep, is_smooth=False
        ).to('cpu')

        y_pred_loaded = loaded_model.predict(mmd_value.detach().numpy().reshape(-1, 1))
        print("y_pred_loaded:", y_pred_loaded)

        label_names = {0: "Human", 1: "AI"}
        return label_names.get(int(y_pred_loaded[0]))
32
+
33
+
34
# Text shown under the title of the Gradio page.
description = "Please input a sentence in English to classify whether it's generated by a human or AI.\n\nFor example: the texts below the box are generated by ChatGPT with a prompt: Write an essay in English titled by 'A Walk in the Woods'. You can directly click on the box or copy some texts into the input field for testing."

# Clickable sample inputs; each inner list holds the value for the single
# text input of the interface.
AI_examples = [
    ["The gentle rustling of leaves, the symphony of birdsong, and the dappled sunlight filtering through the towering trees create a serene tapestry of nature's beauty. A walk in the woods is a timeless and profound experience, one that invites us to disconnect from the hustle and bustle of modern life and reconnect with the natural world. It is a journey that nourishes the soul, rejuvenates the spirit, and reminds us of our deep-rooted connection to the earth."],
    ["The woods have always held a special place in human history and imagination. They are places of mystery and enchantment, where ancient trees bear witness to centuries of change and growth. The woods are where stories of knights and dragons, fairies and gnomes, have been woven into the fabric of our culture. They are places where poets have found inspiration, artists have discovered their muse, and philosophers have contemplated the mysteries of existence."],
    ["A walk in the woods offers a unique opportunity to escape the relentless pace of modern life. As we step onto the forest path, the cares and worries of the world seem to melt away. The woods have a way of grounding us, reminding us of our place in the grand scheme of things. In the presence of towering trees that have weathered countless storms and witnessed the passage of time, our own problems and concerns can feel insignificant, allowing us to gain a broader perspective on life."],
    ["The sensory delights of the woods are unparalleled. The air is rich with the earthy scent of leaves and moss, and the soft, mossy ground beneath our feet cradles each step. The chorus of birdsong provides a soothing soundtrack, while the play of light and shadow through the foliage creates a mesmerizing dance of colors and patterns."],
    ["But perhaps the most profound aspect of a walk in the woods is the opportunity for introspection and self-discovery. In the stillness of the forest, away from the distractions of screens and schedules, we have a chance to listen to the whispers of our own thoughts. It is a time for reflection, for sorting through the jumble of ideas and emotions that often crowd our minds."]
]

# Create the Gradio interface
iface = gr.Interface(
    fn=single_instance_detection,
    inputs="text",
    outputs="text",
    title="GPT-Inspector: Text generated by Human or AI?",
    description=description,
    examples=AI_examples,
)

# Launch the Gradio interface
iface.launch()
dataTST.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import random
4
+ from meta_train import mmdPreModel
5
+ from collections import namedtuple
6
+ import joblib
7
+ from transformers import RobertaTokenizer, RobertaModel
8
+
9
+
10
def api_init(gpu_using=False):
    """Load every artifact the detector needs and return them ready for inference.

    Seeds the Python, NumPy and PyTorch random generators, builds the
    ``mmdPreModel`` projection network, restores its weights and the kernel
    parameters from ``net.pt``, loads the reference features and the
    scikit-learn classifier, and downloads/loads the RoBERTa tokenizer and
    encoder.

    Args:
        gpu_using: When true, the networks, kernel parameters and reference
            features are moved to ``cuda:0``; otherwise everything stays on
            the CPU (the default, matching the previous hard-coded setting).

    Returns:
        A tuple ``(base_model, base_tokenizer, net, feature_ref, sigma,
        sigma0_u, ep, loaded_model, DEVICE)``.
    """
    # Make the run reproducible, including the reference-sample permutation below.
    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)
    torch.cuda.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    model_name = 'roberta-base-openai-detector'
    model_path_api = '.'
    token_num, hidden_size = 100, 768

    Config = namedtuple('Config', ['in_dim', 'hid_dim', 'dropout', 'out_dim', 'token_num'])
    config = Config(
        in_dim=hidden_size,
        token_num=token_num,
        hid_dim=512,
        dropout=0.2,
        out_dim=300,
    )

    net = mmdPreModel(config=config, num_mlp=0, transformer_flag=True, num_hidden_layers=1)

    # Load the features and models.
    # NOTE: torch.load and joblib.load unpickle their input; only point these
    # paths at trusted files.
    feature_ref_for_test_filename = f'{model_path_api}/feature_ref_for_test.pt'
    model_filename = f'{model_path_api}/logistic_regression_model.pkl'
    net_filename = f'{model_path_api}/net.pt'

    cpu = torch.device('cpu')
    load_ref_data = torch.load(feature_ref_for_test_filename, map_location=cpu)
    loaded_model = joblib.load(model_filename)
    checkpoint = torch.load(net_filename, map_location=cpu)
    net.load_state_dict(checkpoint['net'])
    # mmdPreModel contains nn.Dropout layers; torch.no_grad() does not turn
    # them off, so switch to eval mode to keep inference deterministic.
    net.eval()
    sigma, sigma0_u, ep = checkpoint['sigma'], checkpoint['sigma0_u'], checkpoint['ep']

    # Generic pretrained encoder and its tokenizer.
    cache_dir = ".cache"
    base_tokenizer = RobertaTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
    base_model = RobertaModel.from_pretrained(model_name, output_hidden_states=True, cache_dir=cache_dir)
    # Explicit for the same reason as above; a no-op if already in eval mode.
    base_model.eval()

    DEVICE = torch.device("cuda:0" if gpu_using else "cpu")
    if gpu_using:
        net = net.to(DEVICE)
        sigma, sigma0_u, ep = sigma.to(DEVICE), sigma0_u.to(DEVICE), ep.to(DEVICE)
        load_ref_data = load_ref_data.to(DEVICE)
        base_model = base_model.to(DEVICE)

    # Keep a random subset of at most num_ref reference rows. Slicing the
    # permutation first selects the same rows without copying the full tensor.
    num_ref = 5000
    chosen = np.random.permutation(load_ref_data.shape[0])[:num_ref]
    feature_ref = load_ref_data[chosen].to(DEVICE)

    return base_model, base_tokenizer, net, feature_ref, sigma, sigma0_u, ep, loaded_model, DEVICE
feature_ref_for_test.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f690f4244c0b245823f847a75d1fd25a151ad96075744156e628c0c1dda65302
3
+ size 42240786
logistic_regression_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce9afa92d745bff3bbfb4ae028db369cb35710a9099159a12ac1048fed3760e6
3
+ size 756
meta_train.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import math
4
+
5
+
6
+ from pytorch_transformers.modeling_bert import(
7
+ BertEncoder,
8
+ BertPreTrainedModel,
9
+ BertConfig
10
+ )
11
+
12
class GeLU(nn.Module):
    """Gaussian Error Linear Unit computed exactly with the error function.

    For information: OpenAI GPT's gelu is slightly different (and gives
    slightly different results):
    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    Also see https://arxiv.org/abs/1606.08415
    """

    def forward(self, x):
        """Apply the activation element-wise and return a tensor shaped like ``x``."""
        half_x = x * 0.5
        one_plus_erf = 1.0 + torch.erf(x / math.sqrt(2.0))
        return half_x * one_plus_erf
24
+
25
class BertLayerNorm(nn.Module):
    """Layer normalization over the last dimension, TF style (epsilon inside the square root)."""

    def __init__(self, hidden_size, eps=1e-12):
        """Create the learnable scale (ones) and shift (zeros) of size ``hidden_size``."""
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift it."""
        centered = x - x.mean(-1, keepdim=True)
        # Biased (population) variance of the last dimension.
        variance = centered.pow(2).mean(-1, keepdim=True)
        normalized = centered / torch.sqrt(variance + self.variance_epsilon)
        return self.weight * normalized + self.bias
39
+
40
class mlp_meta(nn.Module):
    """Width-preserving block: Linear -> GeLU -> BertLayerNorm -> Dropout.

    ``config`` must expose ``hid_dim`` (input and output width) and
    ``dropout`` (dropout probability).
    """

    def __init__(self, config):
        super().__init__()
        width = config.hid_dim
        stages = [
            nn.Linear(width, width),
            GeLU(),
            BertLayerNorm(width, eps=1e-12),
            nn.Dropout(config.dropout),
        ]
        self.mlp = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the block and return the result."""
        return self.mlp(x)
52
+
53
class Bert_Transformer_Layer(BertPreTrainedModel):
    """A stand-alone ``BertEncoder`` configured from a plain dict of BertConfig fields."""

    def __init__(self, fusion_config):
        """Build the encoder from ``fusion_config`` (keyword arguments for ``BertConfig``)."""
        super().__init__(BertConfig(**fusion_config))
        self.encoder = BertEncoder(BertConfig(**fusion_config))
        self.init_weights()

    def forward(self, input, mask=None):
        """Encode ``input`` and return ``(output, all_attention)``.

        Args:
            input: 3-D tensor ``(batch, feats, dim)``.
            mask: Optional tensor assignable to a ``(batch, feats - 1)``
                slice; it covers every position except the first, which is
                always kept. When omitted, every position attends to every
                other position.

        Returns:
            The first two elements of the encoder output.
        """
        batch, feats, _ = input.size()
        if mask is None:
            ones = torch.Tensor([1.0]).to(input.device)
            mask_ = ones.repeat(batch, 1, feats, feats)
        else:
            keep = torch.ones(size=(batch, feats), device=mask.device)
            keep[:, 1:] = mask
            # The outer product turns the per-position mask into a pairwise
            # (feats x feats) mask; the added axis broadcasts over heads.
            row = keep.view(batch, 1, -1)
            mask_ = torch.bmm(row.transpose(1, 2), row).unsqueeze(1)

        # Additive form: 0 where attention is allowed, -10000 where it is not.
        extend_mask = (1 - mask_) * -10000
        assert not extend_mask.requires_grad
        head_mask = [None] * self.config.num_hidden_layers

        enc_output = self.encoder(input, extend_mask, head_mask=head_mask)
        return enc_output[0], enc_output[1]
86
+
87
class mmdPreModel(nn.Module):
    """Project per-token hidden states to one fixed-size feature vector per sample.

    Pipeline: optional BERT encoder layer(s) -> optional token-wise MLP ->
    optional extra ``mlp_meta`` blocks -> flatten all tokens -> final linear
    layer.
    """

    def __init__(self, config, num_mlp=0, transformer_flag=False, num_hidden_layers=1, mlp_flag=True):
        """Build the sub-modules.

        Args:
            config: Object exposing ``in_dim``, ``hid_dim``, ``dropout``,
                ``out_dim`` and ``token_num``.
            num_mlp: Number of additional ``mlp_meta`` blocks applied after
                the first MLP.
            transformer_flag: Whether to run the input through a
                ``Bert_Transformer_Layer`` first.
            num_hidden_layers: Number of encoder layers in that transformer.
            mlp_flag: Whether ``forward`` applies the first MLP. The final
                linear layer always expects ``hid_dim * token_num`` inputs,
                so disabling it only works when ``in_dim == hid_dim``.
        """
        super().__init__()
        self.num_mlp = num_mlp
        self.transformer_flag = transformer_flag
        self.mlp_flag = mlp_flag
        token_num = config.token_num
        # Sub-modules keep their original names and creation order so saved
        # state dicts and seeded initialisation stay compatible.
        self.mlp = nn.Sequential(
            nn.Linear(config.in_dim, config.hid_dim),
            GeLU(),
            BertLayerNorm(config.hid_dim, eps=1e-12),
            nn.Dropout(config.dropout),
        )
        self.fusion_config = {
            'hidden_size': config.in_dim,
            'num_hidden_layers': num_hidden_layers,
            'num_attention_heads': 4,
            'output_attentions': True,
        }
        if self.num_mlp > 0:
            self.mlp2 = nn.ModuleList([mlp_meta(config) for _ in range(self.num_mlp)])
        if self.transformer_flag:
            self.transformer = Bert_Transformer_Layer(self.fusion_config)
        self.feature = nn.Linear(config.hid_dim * token_num, config.out_dim)

    def forward(self, features):
        """Map ``[batch, token_num, in_dim]`` features to ``[batch, out_dim]``."""
        if self.transformer_flag:
            features, _ = self.transformer(features)
        if self.mlp_flag:
            features = self.mlp(features)

        if self.num_mlp > 0:
            for mlp in self.mlp2:
                features = mlp(features)

        # reshape (rather than view) also accepts non-contiguous inputs.
        flat = features.reshape(features.shape[0], -1)
        return self.feature(flat)
131
+
net.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51eddf445c758f5d42e8f4783b1e8ecc5833db5a9bc170203478d73a761ddd6a
3
+ size 91379438
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ numpy
4
+ joblib
5
+ pytorch_transformers
6
+ scikit-learn
utils_MMD.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
def Pdist2(x, y):
    """Return the matrix of squared Euclidean distances between rows of x and y.

    Args:
        x: 2-D tensor with one sample per row.
        y: 2-D tensor with one sample per row, or ``None`` to compare ``x``
            with itself.

    Returns:
        Tensor of shape ``(x.shape[0], y.shape[0])``; entries that come out
        negative are replaced by 0.
    """
    if y is None:
        y = x
    x_sq = (x ** 2).sum(1).view(-1, 1)
    y_sq = (y ** 2).sum(1).view(1, -1)
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b, evaluated for all pairs at once.
    dist = x_sq + y_sq - 2.0 * torch.mm(x, y.t())
    return dist.masked_fill(dist < 0, 0)
14
+
15
def MMD_batch2(Fea, len_s, Fea_org, sigma, sigma0=0.1, epsilon = 10**(-10), is_smooth=True, is_var_computed=True, use_1sample_U=True, coeff_xy=2):
    """Compute an MMD-style statistic between a reference set and each test row.

    The first ``len_s`` rows of ``Fea`` form the reference set X; every
    remaining row is treated as its own single-sample set Y. Each test row's
    self-distance is taken as 0, so its kernel value with itself is 1.

    Args:
        Fea: 2-D feature tensor, reference rows first, test rows after.
        len_s: Number of reference rows.
        Fea_org: Tensor laid out like ``Fea``; only read when ``is_smooth``
            is true (any placeholder may be passed otherwise).
        sigma: Bandwidth applied to ``Fea_org`` distances; only used when
            ``is_smooth`` is true.
        sigma0: Bandwidth applied to ``Fea`` distances.
        epsilon: Mixing weight of the smoothed kernel; only used when
            ``is_smooth`` is true.
        is_smooth: Select the smoothed kernel instead of the plain Gaussian
            kernel on ``Fea``.
        is_var_computed: Unused; kept for interface compatibility.
        use_1sample_U: Unused; kept for interface compatibility.
        coeff_xy: Unused; kept for interface compatibility.

    Returns:
        1-D tensor with one value per test row:
        ``mean(Kxx) - 2 * mean_x(Kxy) + Kyy``.
    """
    X = Fea[0:len_s, :]
    Y = Fea[len_s:, :]
    nx = X.shape[0]
    num_test = Fea.shape[0] - len_s

    Dxx = Pdist2(X, X)
    # Each test row is compared only with itself, so its distance is 0.
    Dyy = torch.zeros(num_test, 1, device=Dxx.device)
    Dxy = Pdist2(X, Y).transpose(0, 1)

    if is_smooth:
        X_org = Fea_org[0:len_s, :]
        Y_org = Fea_org[len_s:, :]
        Dxx_org = Pdist2(X_org, X_org)
        Dyy_org = torch.zeros(num_test, 1, device=Dxx.device)
        Dxy_org = Pdist2(X_org, Y_org).transpose(0, 1)

        L = 1  # generalized Gaussian (if L>1)
        Kx = (1-epsilon) * torch.exp(-(Dxx / sigma0)**L -Dxx_org / sigma) + epsilon * torch.exp(-Dxx_org / sigma)
        Ky = (1-epsilon) * torch.exp(-(Dyy / sigma0)**L -Dyy_org / sigma) + epsilon * torch.exp(-Dyy_org / sigma)
        Kxy = (1-epsilon) * torch.exp(-(Dxy / sigma0)**L -Dxy_org / sigma) + epsilon * torch.exp(-Dxy_org / sigma)
    else:
        Kx = torch.exp(-Dxx / sigma0)
        Ky = torch.exp(-Dyy / sigma0)
        Kxy = torch.exp(-Dxy / sigma0)

    # Mean over all reference pairs, including the diagonal.
    xx = torch.div(torch.sum(Kx), nx * nx)
    yy = Ky.reshape(-1)
    # Per test row: average kernel value against every reference row.
    xy = torch.div(torch.sum(Kxy, dim=1), nx)

    mmd2 = xx - 2 * xy + yy
    return mmd2
58
+