Spaces:
Sleeping
Sleeping
cheesexuebao
committed on
Commit
•
8518918
1
Parent(s):
fcc1b6e
Gray Testing
Browse files- .gitattributes +1 -0
- .gitignore +3 -0
- Prediction.py +96 -0
- app.py +117 -39
- assets/Kickstarter_sentence_level_5000.csv +0 -0
- assets/Prediction.py.bak +129 -0
- assets/csv_examples.csv +30 -0
- assets/examples.txt +14 -0
- convert.py +28 -0
- {bert-base-uncased → models/All_Data}/config.json +16 -2
- {bert-base-uncased → models/All_Data}/pytorch_model.bin +2 -2
- {bert-base-uncased → models/All_Data}/vocab.txt +0 -0
- bert-base-uncased/bert_config.json → models/Facebook/config.json +24 -0
- models/Facebook/pytorch_model.bin +3 -0
- models/Facebook/vocab.txt +0 -0
- models/Kickstarter/config.json +37 -0
- models/Kickstarter/pytorch_model.bin +3 -0
- models/Kickstarter/vocab.txt +0 -0
- models/Twitter/config.json +37 -0
- models/Twitter/pytorch_model.bin +3 -0
- models/Twitter/vocab.txt +0 -0
- requirements.txt +2 -1
.gitattributes
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
|
|
1 |
+
* text=auto
|
2 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
3 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
4 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
/output/*
|
2 |
+
.vscode
|
3 |
+
__pycache__
|
Prediction.py
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from tqdm.auto import tqdm
|
3 |
+
import torch
|
4 |
+
from transformers import BertTokenizerFast as BertTokenizer, BertForSequenceClassification
|
5 |
+
import os
|
6 |
+
import glob
|
7 |
+
|
8 |
+
|
9 |
+
# NOTE(review): nothing in this module passes RANDOM_SEED to a seeding
# function -- confirm it is still needed.
RANDOM_SEED = 42
# NOTE(review): this only sets an attribute on the imported pandas module
# object; presumably it was meant to seed pandas -- verify the intent.
pd.RANDOM_SEED = 42
# Output column names; predict_csv writes model output i to LABEL_COLUMNS[i].
LABEL_COLUMNS = ["Assertive Tone", "Conversational Tone", "Emotional Tone", "Informative Tone"]
|
12 |
+
|
13 |
+
|
14 |
+
@torch.no_grad()
def predict_csv(data, text_col, tokenizer, model, device, text_bs=16, max_token_len=128):
    """Score every row of ``data[text_col]`` and attach one column per label.

    Texts are tokenized and passed through ``model`` in chunks of ``text_bs``
    rows; a sigmoid is applied to the logits and the resulting values are
    written to the columns named in ``LABEL_COLUMNS``.

    Args:
        data: DataFrame holding the texts. It is modified in place.
        text_col: Name of the column in ``data`` that contains the text.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors.
        model: Model whose output exposes ``.logits``.
        device: Device the input tensors are moved to.
        text_bs: Number of texts per forward pass.
        max_token_len: Length every text is padded/truncated to.

    Returns:
        The same ``data`` object with the label columns added.
    """
    # Missing cells are scored as empty text and other non-string cells are
    # stringified, so one bad row does not abort the whole file.
    all_texts = ["" if pd.isna(cell) else str(cell) for cell in data[text_col].tolist()]
    num_text = len(all_texts)

    if num_text == 0:
        # torch.cat() rejects an empty list, so handle an empty frame up front.
        for label in LABEL_COLUMNS:
            data[label] = []
        return data

    predictions = []
    starts = range(0, num_text, text_bs)
    for i in tqdm(starts, total=len(starts), desc="Processing..."):
        texts = all_texts[i:i + text_bs]
        encoding = tokenizer(
            texts,
            add_special_tokens=True,
            max_length=max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        logits = model(
            encoding["input_ids"].to(device),
            encoding["attention_mask"].to(device),
            return_dict=True
        ).logits
        predictions.append(torch.sigmoid(logits).detach().cpu())

    # Transpose so that row i holds the scores of label i for every text.
    y_inten = torch.cat(predictions, dim=0).numpy().T

    for label, scores in zip(LABEL_COLUMNS, y_inten):
        data[label] = scores.tolist()
    return data
|
48 |
+
|
49 |
+
@torch.no_grad()
def predict_single(sentence, tokenizer, model, device, max_token_len=128):
    """Return the sigmoid scores of ``model`` for a single sentence.

    Args:
        sentence: Text to score.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors.
        model: Model whose output exposes ``.logits``.
        device: Device the input tensors are moved to.
        max_token_len: Length the text is padded/truncated to.

    Returns:
        A flat Python list with one float per model output.
    """
    tokenizer_options = dict(
        add_special_tokens=True,
        max_length=max_token_len,
        return_token_type_ids=False,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    batch = tokenizer(sentence, **tokenizer_options)

    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    output = model(input_ids, attention_mask, return_dict=True)

    probabilities = torch.sigmoid(output.logits)
    return probabilities.flatten().cpu().numpy().T.tolist()
|
69 |
+
|
70 |
+
def model_factory(local_path, device):
    """Load every model directory found directly under ``local_path``.

    Args:
        local_path: Directory whose sub-directories each hold one saved model
            together with its tokenizer files.
        device: Device every loaded model is moved to.

    Returns:
        A dict mapping each sub-directory name to
        ``{"model": ..., "tokenizer": ...}``. Entries are inserted in sorted
        name order so that callers see a stable ordering.
    """
    manager = {}
    # glob order is arbitrary; sort for a deterministic result.
    for model_path in sorted(glob.glob(os.path.join(local_path, "*"))):
        # Stray files (README, .DS_Store, ...) are not models; skip them
        # instead of letting from_pretrained fail on them.
        if not os.path.isdir(model_path):
            continue
        # Use the directory name as-is: splitting off an "extension" would
        # truncate names containing a dot (e.g. "v1.5" -> "v1").
        model_name = os.path.basename(model_path)
        tokenizer = BertTokenizer.from_pretrained(model_path)
        model = BertForSequenceClassification.from_pretrained(model_path)
        model = model.to(device)
        # The factory is only used for inference; make sure dropout is off.
        model.eval()
        manager[model_name] = {
            "model": model,
            "tokenizer": tokenizer
        }
    return manager
|
83 |
+
|
84 |
+
|
85 |
+
if __name__ == "__main__":
    # Smoke test: score the first 20 sample rows with every model under
    # ./models and write one CSV per model into ./output.

    # Keep the original location working, but fall back to assets/, which is
    # where the sample file is stored alongside the other example data.
    csv_path = "Kickstarter_sentence_level_5000.csv"
    if not os.path.exists(csv_path):
        csv_path = os.path.join("assets", csv_path)

    Data = pd.read_csv(csv_path)
    # Copy the slice so predict_csv adds its columns to an independent frame.
    Data = Data[:20].copy()
    device = torch.device('cpu')

    # to_csv does not create missing directories.
    os.makedirs("output", exist_ok=True)

    manager = model_factory("./models", device)
    for model_name, dct in manager.items():
        model, tokenizer = dct['model'], dct['tokenizer']
        fk_doc_result = predict_csv(Data,"content", tokenizer, model, device)
        # Exercise the single-sentence path as well; the value is not stored.
        predict_single("Games of the imagination teach us actions have consequences in a realm that can be reset.", tokenizer, model, device)
        fk_doc_result.to_csv(f"output/prediction_{model_name}.csv")
|
app.py
CHANGED
@@ -1,54 +1,132 @@
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
import matplotlib.pyplot as plt
|
4 |
-
|
|
|
|
|
5 |
|
6 |
-
|
7 |
-
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
df = pd.read_csv(csv_file.name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
my_theme = gr.Theme.from_hub("gstaff/sketch")
|
30 |
-
with gr.Blocks(theme=my_theme, title='Test') as demo:
|
31 |
-
gr.Markdown("""# Test
|
32 |
-
xxxx
|
33 |
-
""")
|
34 |
|
35 |
with gr.Tab("Single Sentence"):
|
36 |
-
with gr.Column():
|
37 |
-
csv_input = gr.File(label="CSV文件")
|
38 |
-
text_output = gr.File(label="结果")
|
39 |
-
image_output = gr.Gallery(label="图像")
|
40 |
with gr.Row():
|
41 |
-
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
with gr.Row():
|
45 |
button = gr.Button("Submit", variant="primary")
|
46 |
-
button.click(fn=
|
47 |
-
|
48 |
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
|
|
|
|
|
|
54 |
demo.launch()
|
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
import matplotlib.pyplot as plt
|
4 |
+
from Prediction import *
|
5 |
+
import os
|
6 |
+
from datetime import datetime
|
7 |
|
8 |
+
|
9 |
+
# Example sentences shown under the "Single Sentence" tab: one per non-blank
# line of assets/examples.txt, or the built-in list when that file is missing
# or contains no usable line.
examples = []
if os.path.exists("assets/examples.txt"):
    with open("assets/examples.txt", "r", encoding="utf8") as file:
        # Skip blank lines so an empty string never becomes an example.
        examples = [sentence.strip() for sentence in file if sentence.strip()]
if not examples:
    examples = [
        "Games of the imagination teach us actions have consequences in a realm that can be reset.",
        "But New Jersey farmers are retiring and all over the state, development continues to push out dwindling farmland.",
        "He also is the Head Designer of The Design Trust so-to-speak, besides his regular job ..."
    ]
|
21 |
+
|
22 |
+
# All inference in this app runs on the CPU.
device = torch.device('cpu')
# Load every model under ./models once at import time; single_sentence and
# csv_process look models up in this dict by name.
manager = model_factory("./models", device)
|
24 |
+
|
25 |
+
|
26 |
+
def single_sentence(sentence, model_select):
    """Score one sentence with each selected model.

    Args:
        sentence: Text to score.
        model_select: Iterable of model names; each must be a key of
            ``manager``.

    Returns:
        A list with one row per selected model, each row being the model name
        followed by the scores returned by ``predict_single``.
    """
    def score_with(name):
        entry = manager[name]
        scores = predict_single(sentence, entry['tokenizer'], entry['model'], device)
        return [name] + scores

    return [score_with(name) for name in model_select]
|
34 |
+
|
35 |
+
def csv_process(csv_file, model_select, attr="content"):
    """Score the ``attr`` column of an uploaded CSV with each selected model.

    Args:
        csv_file: The uploaded file as delivered by the ``gr.File`` input:
            either a path string or an object exposing the path as ``.name``.
        model_select: Iterable of model names; each must be a key of
            ``manager``.
        attr: Name of the CSV column that holds the text.

    Returns:
        A list with the path of one result CSV per selected model, written to
        the ``output`` directory.

    Raises:
        gr.Error: If no file was uploaded or the CSV has no ``attr`` column.
    """
    if csv_file is None:
        raise gr.Error("Please upload a CSV file first.")

    # NOTE(review): depending on the Gradio version/configuration the File
    # component hands over a plain path or a temp-file wrapper; accept both.
    csv_path = getattr(csv_file, "name", csv_file)
    df = pd.read_csv(csv_path)
    if attr not in df.columns:
        raise gr.Error(f"The CSV file has no '{attr}' column.")

    # One timestamp per request so all files of a request share a suffix.
    formatted_time = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

    # to_csv does not create missing directories.
    os.makedirs("output", exist_ok=True)

    outputs = []
    for model_name in model_select:
        # predict_csv writes into the frame it receives; give each model its
        # own copy so results do not leak between models.
        data = df.copy(deep=True)
        dct = manager[model_name]
        model, tokenizer = dct['model'], dct['tokenizer']
        predictions = predict_csv(data, attr, tokenizer, model, device)
        output_path = f"output/prediction_{model_name}_{formatted_time}.csv"
        predictions.to_csv(output_path)
        outputs.append(output_path)
    return outputs
|
49 |
+
|
50 |
|
51 |
+
my_theme = gr.Theme.from_hub("JohnSmith9982/small_and_pretty")
# NOTE(review): the nesting of the layout below was reconstructed from a
# flattened listing -- confirm that rows/tabs contain the intended widgets.
with gr.Blocks(theme=my_theme, title='XXX') as demo:
    # Page header. Every style attribute must be closed with a quote, or the
    # browser swallows the following tags into the attribute value.
    gr.HTML(
        """
        <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
        <a href="https://github.com/xxx" style="margin-right: 20px; text-decoration: none; display: flex; align-items: center;">
        </a>
        <div>
        <h1 >Place the title of the paper here</h1>
        <h5 style="margin: 0;">If you like our project, please give us a star ✨ on Github for the latest update.</h5>
        <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
        <a href="https://arxiv.org/abs/xx.xx"><img src="https://img.shields.io/badge/Arxiv-xx.xx-red"></a>
        <a href='https://huggingface.co/spaces/cheesexuebao/murphy'><img src='https://img.shields.io/badge/Project_Page-Murphy/xxBert' alt='Project Page'></a>
        <a href='https://github.com'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
        </div>
        </div>
        </div>
        """)

    # Tab 1: score a single sentence with the selected models.
    with gr.Tab("Single Sentence"):
        with gr.Row():
            tbox_input = gr.Textbox(label="Input",
                                    info="Please input a sentence here:")
            # Pass a real list: it is what the component documents as choices.
            model_select = gr.CheckboxGroup(list(manager),
                                            label="Models:",
                                            info="Selecting different model variants to obtain aggregated predictions.")
        tab_output = gr.DataFrame(label='Probability Predictions:',
                                  headers=["model"] + LABEL_COLUMNS,
                                  datatype=["str"] * (len(LABEL_COLUMNS)+1),
                                  interactive=False,
                                  wrap=True)
        with gr.Row():
            button_ss = gr.Button("Submit", variant="primary")
            button_ss.click(fn=single_sentence, inputs=[tbox_input, model_select], outputs=[tab_output])
            gr.ClearButton([tbox_input, tab_output])

        gr.Markdown("## Examples")
        gr.Examples(
            examples=examples,
            inputs=tbox_input,
            examples_per_page=5
        )

    # Tab 2: score a whole CSV file with the selected models.
    with gr.Tab("Csv File"):
        with gr.Row():
            csv_input = gr.File(label="CSV File:",
                                file_types=['.csv'],
                                file_count="single"
                                )
            csv_output = gr.File(label="Predictions:")

        # Separate name so the first tab's checkbox group is not rebound.
        csv_model_select = gr.CheckboxGroup(list(manager),
                                            label="Models:",
                                            info="Selecting different model variants to obtain aggregated predictions.")

        with gr.Row():
            button = gr.Button("Submit", variant="primary")
            button.click(fn=csv_process, inputs=[csv_input, csv_model_select], outputs=[csv_output])
            gr.ClearButton([csv_input, csv_output])

        gr.Markdown("## Examples")
        gr.Examples(
            examples=["assets/csv_examples.csv",],
            inputs=csv_input
        )

    # Tab 3: static project description (placeholder text).
    with gr.Tab("Readme"):
        gr.Markdown(
            """
            # Paper Name

            # Authors

            + First author
            + Corresponding author

            # Detailed Information

            ...
            """
        )
demo.launch()
|
assets/Kickstarter_sentence_level_5000.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
assets/Prediction.py.bak
ADDED
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### install the needed package
|
2 |
+
# !pip install transformers
|
3 |
+
# !pip install torchmetrics
|
4 |
+
# !pip3 install ogb pytorch_lightning -q
|
5 |
+
|
6 |
+
|
7 |
+
|
8 |
+
import pandas as pd
|
9 |
+
from tqdm.auto import tqdm
|
10 |
+
import torch
|
11 |
+
import torch.nn as nn
|
12 |
+
from torch.utils.data import DataLoader, Dataset
|
13 |
+
from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
|
14 |
+
# import pytorch_lightning as pl
|
15 |
+
|
16 |
+
pd.set_option('display.max_columns', 500)
|
17 |
+
|
18 |
+
RANDOM_SEED = 42
|
19 |
+
|
20 |
+
|
21 |
+
class ModelTagger(nn.Module):
    """BERT encoder followed by a linear layer with four sigmoid outputs."""

    def __init__(self, model_path="bert-base-uncased"):
        """Load the encoder from ``model_path`` and build the 4-way head."""
        super().__init__()

        encoder = BertModel.from_pretrained(model_path, return_dict=True)
        self.bert = encoder
        self.classifier = nn.Linear(encoder.config.hidden_size, 4)
        self.criterion = nn.BCELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``(loss, probabilities)``.

        ``loss`` is the BCE loss against ``labels`` when they are given and
        the integer ``0`` otherwise; ``probabilities`` are the sigmoid
        outputs of the classifier applied to the encoder's pooled output.
        """
        encoded = self.bert(input_ids, attention_mask=attention_mask)
        probabilities = torch.sigmoid(self.classifier(encoded.pooler_output))

        if labels is None:
            return 0, probabilities
        return self.criterion(probabilities, labels), probabilities
|
40 |
+
|
41 |
+
|
42 |
+
class Predict_Dataset(Dataset):
    """Dataset that tokenizes one text column of a DataFrame row by row."""

    def __init__(
        self,
        data: pd.DataFrame,
        text_col: str,
        tokenizer: "BertTokenizer",
        max_token_len: int = 128
    ):
        """Store the frame, the text column name and the tokenizer settings.

        Args:
            data: Frame holding the texts.
            text_col: Name of the column in ``data`` that contains the text.
            tokenizer: Callable tokenizer returning ``input_ids`` and
                ``attention_mask`` tensors.
            max_token_len: Length every text is padded/truncated to.
        """
        self.text_col = text_col
        self.tokenizer = tokenizer
        self.data = data
        self.max_token_len = max_token_len

    def __len__(self):
        """Return the number of rows in the frame."""
        return len(self.data)

    def __getitem__(self, index: int):
        """Return the text at positional ``index`` with its encoded tensors.

        The result is a dict with the raw text under ``post`` and the
        flattened ``input_ids`` / ``attention_mask`` tensors.
        """
        data_row = self.data.iloc[index]
        post = data_row[self.text_col]
        # Call the tokenizer directly: encode_plus is the deprecated spelling
        # of the same operation.
        encoding = self.tokenizer(
            post,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return dict(
            post=post,
            # Drop the leading batch dimension; the DataLoader re-adds it.
            input_ids=encoding["input_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
        )
|
77 |
+
|
78 |
+
|
79 |
+
def predict(data, text_col, tokenizer, model, device, LABEL_COLUMNS, max_token_len=128, batch_size=1000):
    """Run ``model`` over ``data[text_col]`` and collect the scores per label.

    Args:
        data: DataFrame holding the texts.
        text_col: Name of the column in ``data`` that contains the text.
        tokenizer: Tokenizer handed to ``Predict_Dataset``.
        model: Callable returning ``(loss, prediction)`` for a batch of
            ``input_ids`` and ``attention_mask``.
        device: Device the input tensors are moved to.
        LABEL_COLUMNS: Label names, in the order of the model outputs.
        max_token_len: Length every text is padded/truncated to.
        batch_size: Number of rows per forward pass (previously fixed at 1000).

    Returns:
        A dict mapping each label name to the list of scores for all rows.
    """
    df_token = Predict_Dataset(data, text_col, tokenizer, max_token_len=max_token_len)
    if len(df_token) == 0:
        # torch.cat() rejects an empty list; there is nothing to score.
        return {label: [] for label in LABEL_COLUMNS[:4]}

    loader = DataLoader(df_token, batch_size=batch_size, num_workers=0)

    predictions = []
    # Inference only: do not record the autograd graph for every batch.
    with torch.no_grad():
        for item in tqdm(loader):
            _, prediction = model(
                item["input_ids"].to(device),
                item["attention_mask"].to(device)
            )
            predictions.append(prediction.detach().cpu())

    # Transpose so that row i holds the scores of label i for every text.
    y_inten = torch.cat(predictions, dim=0).numpy().T

    return {label: scores.tolist() for label, scores in zip(LABEL_COLUMNS, y_inten)}
|
101 |
+
|
102 |
+
|
103 |
+
def get_result(df, result, LABEL_COLUMNS):
    """Copy the first four label score lists from ``result`` into ``df``.

    ``df`` is modified in place and also returned.
    """
    for position in range(4):
        label = LABEL_COLUMNS[position]
        df[label] = result[label]
    return df
|
109 |
+
|
110 |
+
|
111 |
+
# Script section: score the first 20 rows of the Kickstarter sample with the
# Kickstarter checkpoint and write the result to a CSV file.
Data = pd.read_csv("Kickstarter_sentence_level_5000.csv")
Data = Data[:20]  # keep only the first 20 rows
device = torch.device('cpu')
BERT_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
LABEL_COLUMNS = ["Assertive Tone", "Conversational Tone", "Emotional Tone", "Informative Tone"]

# The checkpoint file keeps the weights under its 'state_dict' key.
params = torch.load("checkpoints/Kickstarter.ckpt", map_location='cpu')['state_dict']
kick_model = ModelTagger()
# strict=True: loading fails unless the checkpoint keys match ModelTagger exactly.
kick_model.load_state_dict(params, strict=True)
kick_model.eval()

kick_model = kick_model.to(device)

kick_fk_doc_result = predict(Data,"content", tokenizer,kick_model, device, LABEL_COLUMNS)

fk_result = get_result(Data, kick_fk_doc_result, LABEL_COLUMNS)

# NOTE(review): the "output" directory is not created here -- presumably it
# must already exist before this line runs.
fk_result.to_csv("output/prediction_origin_Kickstarter.csv")
|
assets/csv_examples.csv
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
,index,content,word_count
|
2 |
+
0,225644,The first prototype did not clip together well and had strength issues so we redesigned it with new sides and a different tabs structure.,24
|
3 |
+
1,989071,Maybe you own a shop or perhaps you and your friends want to go in on this together to save some money.,22
|
4 |
+
2,332310,"With this campaign we want to propose ""Eternity Dice Regular and Charms Edition"", sculpted by hand in stone, with a polished finish and highly accurate details.",26
|
5 |
+
3,101474,"It's hand cut from a thick and reliable high quality calf skin, which is soft and flexible enough for wearing with utmost comfort.",23
|
6 |
+
4,1641986,"a# by 5 WHAT SEPARATES US FROM THE COMPETITION a lax-ll 360 AUDIO FLOATABLE Full submergable up to Superior surround sound Counter balanced for optimal 1 meter for 30 minutes audio direction while floating WIRELESS SPECIFICATIONS MATERIALS sarr of whreless Small and compact, with Engineered to perfection streaming range enormous sound with the highest quality materials avalable PRICE-POINT WARRANTY BVURABILITY Affordable technology Cone yearlimited warranty | Rubberized shock absorbing cover PATENTS BUILT-IN MIC BATTERY LIFE Patent.Pending stabalization .",78
|
7 |
+
5,1632938,Much of the known world is either from this culture or has converted to the faith.,16
|
8 |
+
6,1141502,"The more I play it, the more I want to play it.",12
|
9 |
+
7,1424712,"There are weapons all around you, you just never thought about your household goods that way.",16
|
10 |
+
8,460625,"In September, I'm going down to Virginia with a bunch of my music buddies to record the album.",18
|
11 |
+
9,179267,"It is suitable for use with Cthulhu, Horror, Space and Dungeon - style miniature games.",15
|
12 |
+
10,1092530,Games of the imagination teach us actions have consequences in a realm that can be reset.,16
|
13 |
+
11,1050585,"Intense cleaning of the existing space, brick repairs, and removal of unneeded materials is also necessary.",16
|
14 |
+
12,1126342,These will include color artwork and fully designed stats to help you build exciting and unique Shadowlands encounters.,18
|
15 |
+
13,277427,"If you're leaving the backpack unattended, the bag itself can be secured to almost any fixed object using the integrated steel wire and combination lock, making it impossible for opportunistic thieves to access your belongings or steal the bag, without special cutting equipment.",43
|
16 |
+
14,307425,Their parents had recruited the police and even had the church issuing official statements forbidding the girls to walk through monastery doors.,22
|
17 |
+
15,611566,is a children's book for elementary school age kids with illustrations appealing to people of all ages.,17
|
18 |
+
16,951173,"Thanks to you we reached our original goal, so we got festival fees and insurance covered.",16
|
19 |
+
17,1294624," It's been really well-received, and recently won an online award for Best New Tabletop Sports Game of 2013.",19
|
20 |
+
18,686912,"But New Jersey farmers are retiring and all over the state, development continues to push out dwindling farmland.",18
|
21 |
+
19,1291430,"Support Cards for easily setting initiative and keeping track of hit points, ammo, etc, speeding things up and eliminating the need for any writing/erasing Deep character creation with options designed for interesting roleplaying, and super fast to create (5 minutes or less) Specially laminated Character Cards take the place of the old character sheet, making information extremely easy to find and removing clutter from the gaming table Easily expandable without having to purchase and read through lengthy new books - new equipment, weapons, powers, skills, and opponents can be instantly added to your game with Setting Cards All special rules for equipment, weapons, powers, skills, and opponents printed on cards kept in player hands, so you never have to go searching for them Completely genre neutral, so assets from any setting are completely compatible with any others, making your game infinitely expandable and customizable Tech-themed Resolution Deck Concept Built from the ground up with VTTs (Virtual Table Tops) in mind, with all digital assets ready to drop into your game to integrate seamlessly with groups who play remotely Complete playable module with starter adventure included in backer rewards of $10 or more!",192
|
22 |
+
20,1656635,"Their bond of friendship makes the journey more important than the destination as they share their dreams, frustrations and fears. The story goes on to show the dramatic impact this innocent childhood adventure has on their young adult lives.",39
|
23 |
+
21,1679298,"He also is the Head Designer of The Design Trust so-to-speak, besides his regular job ...",16
|
24 |
+
22,337389,"This year, the film team has plans to produce a short comedy, based on a true story set in the city of Jerusalem.",23
|
25 |
+
23,980529,"$12,000 - Roguelike Player Mat This player mat will include extra rules to play Baldrick's Tomb as a solo player Roguelike.",21
|
26 |
+
24,1700094,_ Thank you for viewing the project!,7
|
27 |
+
25,420192,We appreciate your support and thank you for joining us in helping cause this mission stay in action.,18
|
28 |
+
26,1469419,It'll even be foil-wrapped like baseball cards!,7
|
29 |
+
27,105008,We believe that the major players with their massive branding campaigns together with the margins applied by distributors and retailers are a business model that doesn't deliver a fair value to customers.,32
|
30 |
+
28,1505209,"If you want to take advantage of the Rhino Slider's versatility, you'll have an option to add extra sets of rails after the campaign ends.",25
|
assets/examples.txt
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Games of the imagination teach us actions have consequences in a realm that can be reset.
|
2 |
+
Intense cleaning of the existing space, brick repairs, and removal of unneeded materials is also necessary.
|
3 |
+
Thanks to you we reached our original goal, so we got festival fees and insurance covered.
|
4 |
+
 It's been really well-received, and recently won an online award for Best New Tabletop Sports Game of 2013.
|
5 |
+
But New Jersey farmers are retiring and all over the state, development continues to push out dwindling farmland.
|
6 |
+
Our chemical-free process provides unmatched comfort.
|
7 |
+
However, this chart does not factor in special ability influence since that varies with the ability being used.
|
8 |
+
I'd like to do something similar with pictures.
|
9 |
+
This means you can feel more than comfortable putting them in your back pocket or purse.
|
10 |
+
She holds a degree from the Advertising University of Madrid.
|
11 |
+
Skeleton Birds are heading to Groovebox Studios on March 17th to record and film a live GBS Detroit EP and video.
|
12 |
+
Please help support us & make this awesome case a reality!
|
13 |
+
So... We're asking for $3,000 per song.
|
14 |
+
You also have battle items and action cards to defeat your gnome enemies.
|
convert.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import glob
|
3 |
+
import os
|
4 |
+
from transformers import BertTokenizerFast as BertTokenizer, BertForSequenceClassification
|
5 |
+
|
6 |
+
# Convert every checkpoint under checkpoints/*.ckpt into a directory
# models/<checkpoint name>/ holding pytorch_model.bin, config.json and the
# tokenizer vocabulary.
LABEL_COLUMNS = ["Assertive Tone", "Conversational Tone", "Emotional Tone", "Informative Tone"]

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(LABEL_COLUMNS))
id2label = dict(enumerate(LABEL_COLUMNS))
label2id = {label: i for i, label in enumerate(LABEL_COLUMNS)}

# The config does not depend on the checkpoint, so prepare it once.
config = model.config
config.architectures = ['BertForSequenceClassification']
config.label2id = label2id
config.id2label = id2label

for ckpt in sorted(glob.glob('checkpoints/*.ckpt')):
    # Strip the directory and the file extension to get the model name.
    model_name = os.path.splitext(os.path.basename(ckpt))[0]
    # NOTE(review): torch.load unpickles the file -- only run this on
    # checkpoints from a trusted source.
    params = torch.load(ckpt, map_location="cpu")['state_dict']
    # strict=True: fail loudly if the checkpoint keys do not match the model.
    model.load_state_dict(params, strict=True)
    path = f'models/{model_name}'
    os.makedirs(path, exist_ok=True)

    torch.save(model.state_dict(), f'{path}/pytorch_model.bin')
    config.to_json_file(f'{path}/config.json')
    tokenizer.save_vocabulary(path)
|
{bert-base-uncased → models/All_Data}/config.json
RENAMED
@@ -1,14 +1,28 @@
|
|
1 |
{
|
|
|
2 |
"architectures": [
|
3 |
-
"
|
4 |
],
|
5 |
"attention_probs_dropout_prob": 0.1,
|
|
|
6 |
"gradient_checkpointing": false,
|
7 |
"hidden_act": "gelu",
|
8 |
"hidden_dropout_prob": 0.1,
|
9 |
"hidden_size": 768,
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
"initializer_range": 0.02,
|
11 |
"intermediate_size": 3072,
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
"layer_norm_eps": 1e-12,
|
13 |
"max_position_embeddings": 512,
|
14 |
"model_type": "bert",
|
@@ -16,7 +30,7 @@
|
|
16 |
"num_hidden_layers": 12,
|
17 |
"pad_token_id": 0,
|
18 |
"position_embedding_type": "absolute",
|
19 |
-
"transformers_version": "4.
|
20 |
"type_vocab_size": 2,
|
21 |
"use_cache": true,
|
22 |
"vocab_size": 30522
|
|
|
1 |
{
|
2 |
+
"_name_or_path": "bert-base-uncased",
|
3 |
"architectures": [
|
4 |
+
"BertForSequenceClassification"
|
5 |
],
|
6 |
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"classifier_dropout": null,
|
8 |
"gradient_checkpointing": false,
|
9 |
"hidden_act": "gelu",
|
10 |
"hidden_dropout_prob": 0.1,
|
11 |
"hidden_size": 768,
|
12 |
+
"id2label": {
|
13 |
+
"0": "Assertive Tone",
|
14 |
+
"1": "Conversational Tone",
|
15 |
+
"2": "Emotional Tone",
|
16 |
+
"3": "Informative Tone"
|
17 |
+
},
|
18 |
"initializer_range": 0.02,
|
19 |
"intermediate_size": 3072,
|
20 |
+
"label2id": {
|
21 |
+
"Assertive Tone": 0,
|
22 |
+
"Conversational Tone": 1,
|
23 |
+
"Emotional Tone": 2,
|
24 |
+
"Informative Tone": 3
|
25 |
+
},
|
26 |
"layer_norm_eps": 1e-12,
|
27 |
"max_position_embeddings": 512,
|
28 |
"model_type": "bert",
|
|
|
30 |
"num_hidden_layers": 12,
|
31 |
"pad_token_id": 0,
|
32 |
"position_embedding_type": "absolute",
|
33 |
+
"transformers_version": "4.36.2",
|
34 |
"type_vocab_size": 2,
|
35 |
"use_cache": true,
|
36 |
"vocab_size": 30522
|
{bert-base-uncased → models/All_Data}/pytorch_model.bin
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4edf18d14298c9a7057bbbdbc88cddf3b673e452103c6c4b882e1cec14d51c53
|
3 |
+
size 438021294
|
{bert-base-uncased → models/All_Data}/vocab.txt
RENAMED
File without changes
|
bert-base-uncased/bert_config.json → models/Facebook/config.json
RENAMED
@@ -1,13 +1,37 @@
|
|
1 |
{
|
|
|
|
|
|
|
|
|
2 |
"attention_probs_dropout_prob": 0.1,
|
|
|
|
|
3 |
"hidden_act": "gelu",
|
4 |
"hidden_dropout_prob": 0.1,
|
5 |
"hidden_size": 768,
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
"initializer_range": 0.02,
|
7 |
"intermediate_size": 3072,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
"max_position_embeddings": 512,
|
|
|
9 |
"num_attention_heads": 12,
|
10 |
"num_hidden_layers": 12,
|
|
|
|
|
|
|
11 |
"type_vocab_size": 2,
|
|
|
12 |
"vocab_size": 30522
|
13 |
}
|
|
|
1 |
{
|
2 |
+
"_name_or_path": "bert-base-uncased",
|
3 |
+
"architectures": [
|
4 |
+
"BertForSequenceClassification"
|
5 |
+
],
|
6 |
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"classifier_dropout": null,
|
8 |
+
"gradient_checkpointing": false,
|
9 |
"hidden_act": "gelu",
|
10 |
"hidden_dropout_prob": 0.1,
|
11 |
"hidden_size": 768,
|
12 |
+
"id2label": {
|
13 |
+
"0": "Assertive Tone",
|
14 |
+
"1": "Conversational Tone",
|
15 |
+
"2": "Emotional Tone",
|
16 |
+
"3": "Informative Tone"
|
17 |
+
},
|
18 |
"initializer_range": 0.02,
|
19 |
"intermediate_size": 3072,
|
20 |
+
"label2id": {
|
21 |
+
"Assertive Tone": 0,
|
22 |
+
"Conversational Tone": 1,
|
23 |
+
"Emotional Tone": 2,
|
24 |
+
"Informative Tone": 3
|
25 |
+
},
|
26 |
+
"layer_norm_eps": 1e-12,
|
27 |
"max_position_embeddings": 512,
|
28 |
+
"model_type": "bert",
|
29 |
"num_attention_heads": 12,
|
30 |
"num_hidden_layers": 12,
|
31 |
+
"pad_token_id": 0,
|
32 |
+
"position_embedding_type": "absolute",
|
33 |
+
"transformers_version": "4.36.2",
|
34 |
"type_vocab_size": 2,
|
35 |
+
"use_cache": true,
|
36 |
"vocab_size": 30522
|
37 |
}
|
models/Facebook/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f511b8b4b91b5fa408c5b3220ce0fe9b61b2f9a3a54dd00acb3a81aa0a2a19e8
|
3 |
+
size 438021294
|
models/Facebook/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
models/Kickstarter/config.json
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "bert-base-uncased",
|
3 |
+
"architectures": [
|
4 |
+
"BertForSequenceClassification"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"classifier_dropout": null,
|
8 |
+
"gradient_checkpointing": false,
|
9 |
+
"hidden_act": "gelu",
|
10 |
+
"hidden_dropout_prob": 0.1,
|
11 |
+
"hidden_size": 768,
|
12 |
+
"id2label": {
|
13 |
+
"0": "Assertive Tone",
|
14 |
+
"1": "Conversational Tone",
|
15 |
+
"2": "Emotional Tone",
|
16 |
+
"3": "Informative Tone"
|
17 |
+
},
|
18 |
+
"initializer_range": 0.02,
|
19 |
+
"intermediate_size": 3072,
|
20 |
+
"label2id": {
|
21 |
+
"Assertive Tone": 0,
|
22 |
+
"Conversational Tone": 1,
|
23 |
+
"Emotional Tone": 2,
|
24 |
+
"Informative Tone": 3
|
25 |
+
},
|
26 |
+
"layer_norm_eps": 1e-12,
|
27 |
+
"max_position_embeddings": 512,
|
28 |
+
"model_type": "bert",
|
29 |
+
"num_attention_heads": 12,
|
30 |
+
"num_hidden_layers": 12,
|
31 |
+
"pad_token_id": 0,
|
32 |
+
"position_embedding_type": "absolute",
|
33 |
+
"transformers_version": "4.36.2",
|
34 |
+
"type_vocab_size": 2,
|
35 |
+
"use_cache": true,
|
36 |
+
"vocab_size": 30522
|
37 |
+
}
|
models/Kickstarter/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b98553cd5a9b23babc4e20ade9abda931497de3103acf09656eb39cfcbb0c485
|
3 |
+
size 438021294
|
models/Kickstarter/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
models/Twitter/config.json
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "bert-base-uncased",
|
3 |
+
"architectures": [
|
4 |
+
"BertForSequenceClassification"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"classifier_dropout": null,
|
8 |
+
"gradient_checkpointing": false,
|
9 |
+
"hidden_act": "gelu",
|
10 |
+
"hidden_dropout_prob": 0.1,
|
11 |
+
"hidden_size": 768,
|
12 |
+
"id2label": {
|
13 |
+
"0": "Assertive Tone",
|
14 |
+
"1": "Conversational Tone",
|
15 |
+
"2": "Emotional Tone",
|
16 |
+
"3": "Informative Tone"
|
17 |
+
},
|
18 |
+
"initializer_range": 0.02,
|
19 |
+
"intermediate_size": 3072,
|
20 |
+
"label2id": {
|
21 |
+
"Assertive Tone": 0,
|
22 |
+
"Conversational Tone": 1,
|
23 |
+
"Emotional Tone": 2,
|
24 |
+
"Informative Tone": 3
|
25 |
+
},
|
26 |
+
"layer_norm_eps": 1e-12,
|
27 |
+
"max_position_embeddings": 512,
|
28 |
+
"model_type": "bert",
|
29 |
+
"num_attention_heads": 12,
|
30 |
+
"num_hidden_layers": 12,
|
31 |
+
"pad_token_id": 0,
|
32 |
+
"position_embedding_type": "absolute",
|
33 |
+
"transformers_version": "4.36.2",
|
34 |
+
"type_vocab_size": 2,
|
35 |
+
"use_cache": true,
|
36 |
+
"vocab_size": 30522
|
37 |
+
}
|
models/Twitter/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6abf83c8c66c4f3fcaba340dcab3b5b1f4f2b66381b21a5aacab086194cf0cbd
|
3 |
+
size 438021294
|
models/Twitter/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
requirements.txt
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
torch
|
2 |
transformers
|
3 |
tqdm
|
4 |
-
pandas
|
|
|
|
1 |
torch
|
2 |
transformers
|
3 |
tqdm
|
4 |
+
pandas
|
5 |
+
datetime
|