Upload 12 files
- LSTM.h5 +3 -0
- Logistic_Model.joblib +3 -0
- SVM_Linear_Kernel.joblib +3 -0
- app.py +305 -0
- fine_tuned_bert_model1.pth +3 -0
- requirements.txt +9 -0
- svm_model.joblib +3 -0
- tokenizer.joblib +3 -0
- tokenizer_bert/special_tokens_map.json +7 -0
- tokenizer_bert/tokenizer_config.json +57 -0
- tokenizer_bert/vocab.txt +0 -0
- vectorizer.joblib +3 -0
LSTM.h5
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d6fba55ed617b95fc7c1f5322f3878bed3fe7d4962b8a263aaf2f1a16c16970
+size 341626144
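Note: the three `+` lines above are a Git LFS pointer, not the model weights themselves — `oid` is the SHA-256 of the real file and `size` its byte count. The same pattern repeats for every large binary in this commit. As a minimal sketch (the helper name is ours, not part of this repo), a pointer stub left behind by a checkout without `git lfs pull` can be parsed like this:

def parse_lfs_pointer(path):
    # Each pointer line is "key value"; returns e.g.
    # {'version': 'https://git-lfs.github.com/spec/v1', 'oid': 'sha256:...', 'size': '341626144'}
    fields = {}
    with open(path, encoding="utf-8") as fh:
        for line in fh:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

print(parse_lfs_pointer("LSTM.h5"))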
Logistic_Model.joblib
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:190bc17002c82fd3a418d7cd9835ec4a590bc45eb1964d52e625a836bda1a6a9
+size 400895
SVM_Linear_Kernel.joblib
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d56dda49b361a586459a3fb2d79f1fbffa65ccd78c8bf208ab37abdf0515ccc7
+size 400764
app.py
ADDED
@@ -0,0 +1,305 @@
+import joblib
+import streamlit as st
+import requests
+import numpy as np
+import pandas as pd
+import torch
+from bs4 import BeautifulSoup
+from datetime import date
+from tensorflow.keras.models import load_model
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
+
+# Load all the models and the shared vectorizer (global vocabulary)
+Seq_model = load_model("LSTM.h5")                      # LSTM (Keras Sequential)
+SVM_model = joblib.load("SVM_Linear_Kernel.joblib")    # SVC with a linear kernel
+logistic_model = joblib.load("Logistic_Model.joblib")  # Logistic Regression
+svm_model = joblib.load("svm_model.joblib")            # calibrator: maps SVC decision scores to probabilities
+
+vectorizer = joblib.load("vectorizer.joblib")  # global vocabulary (used by Logistic, SVC)
+tokenizer = joblib.load("tokenizer.joblib")    # Keras tokenizer (used by the LSTM)
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+tokenizer1 = DistilBertTokenizer.from_pretrained("tokenizer_bert")
+model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5)
+model.load_state_dict(torch.load("fine_tuned_bert_model1.pth", map_location=device))
+model.eval()  # inference only: disable dropout
+
+# Decode label function
+# Label encoding: {'business': 0, 'entertainment': 1, 'health': 2, 'politics': 3, 'sport': 4}
+def decodedLabel(input_number):
+    categories = {
+        0: 'Business',
+        1: 'Entertainment',
+        2: 'Health',
+        3: 'Politics',
+        4: 'Sport'
+    }
+    return categories.get(input_number)  # e.g. 2 -> 'Health'
+
+# Web crawler: fetch a CNN article page and extract its body text
+def crawURL(url):
+    try:
+        print(f"Crawling page: {url}")
+        # Fetch page content
+        page_response = requests.get(url)
+        page_content = page_response.content
+
+        # Parse page content with BeautifulSoup
+        soup = BeautifulSoup(page_content, 'html.parser')
+
+        # Extract the metadata exposed by CNN's markup (kept for reference/debugging)
+        author = soup.find("meta", {"name": "author"}).attrs['content'].strip()
+        date_published = soup.find("meta", {"property": "article:published_time"}).attrs['content'].strip()
+        article_section = soup.find("meta", {"name": "meta-section"}).attrs['content']
+        url = soup.find("meta", {"property": "og:url"}).attrs['content']
+        headline = soup.find("h1", {"data-editable": "headlineText"}).text.strip()
+        description = soup.find("meta", {"name": "description"}).attrs['content'].strip()
+        keywords = soup.find("meta", {"name": "keywords"}).attrs['content'].strip()
+
+        # The article body: all <p> tags with class "paragraph inline-placeholder"
+        text = soup.find(itemprop="articleBody")
+        paragraphs = text.find_all('p', class_="paragraph inline-placeholder")
+
+        # Collect the text content of each paragraph
+        paragraph_texts = []
+        for paragraph in paragraphs:
+            paragraph_texts.append(paragraph.text.strip())
+
+        # Join the paragraphs into a single string (space-separated so sentences do not run together)
+        full_text = ' '.join(paragraph_texts)
+        return full_text
+
+    except Exception as e:
+        print(f"Failed to crawl page: {url}, Error: {str(e)}")
+        return None
+
+# Predict the text category with every model
+def process_api(text):
+    # Vectorize the text data
+    processed_text = vectorizer.transform([text])    # TF-IDF features for Logistic / SVC
+    sequence = tokenizer.texts_to_sequences([text])  # integer sequence for the LSTM
+    padded_sequence = pad_sequences(sequence, maxlen=1000, padding='post')
+
+    # BERT encoding and forward pass
+    new_encoding = tokenizer1([text], truncation=True, padding=True, return_tensors="pt")
+    input_ids = new_encoding['input_ids']
+    attention_mask = new_encoding['attention_mask']
+    with torch.no_grad():
+        output = model(input_ids, attention_mask=attention_mask)
+        logits = output.logits
+
+    # Predicted labels from the classical models and the LSTM
+    Logistic_Predicted = logistic_model.predict(processed_text).tolist()  # Logistic model
+    SVM_Predicted = SVM_model.predict(processed_text).tolist()            # SVC model
+    Seq_Predicted = Seq_model.predict(padded_sequence)
+    predicted_label_index = np.argmax(Seq_Predicted)
+
+    # ----------- Probabilities -----------
+    Logistic_Predicted_proba = logistic_model.predict_proba(processed_text)
+    # The linear SVC has no predict_proba, so its decision scores are passed
+    # through a separately trained calibration model (svm_model.joblib)
+    svm_new_probs = SVM_model.decision_function(processed_text)
+    svm_probs = svm_model.predict_proba(svm_new_probs)
+
+    bert_probabilities = torch.softmax(logits, dim=1)
+    max_probability = torch.max(bert_probabilities).item()
+    predicted_label_bert = torch.argmax(logits, dim=1).item()
+
+    # Probabilities are floored to whole percentages for display
+    return {
+        'predicted_label_logistic': decodedLabel(int(Logistic_Predicted[0])),
+        'probability_logistic': f"{int(float(np.max(Logistic_Predicted_proba))*10000//100)}%",
+
+        'predicted_label_svm': decodedLabel(int(SVM_Predicted[0])),
+        'probability_svm': f"{int(float(np.max(svm_probs))*10000//100)}%",
+
+        'predicted_label_lstm': decodedLabel(int(predicted_label_index)),
+        'probability_lstm': f"{int(float(np.max(Seq_Predicted))*10000//100)}%",
+
+        'predicted_label_bert': decodedLabel(int(predicted_label_bert)),
+        'probability_bert': f"{int(float(max_probability)*10000//100)}%",
+
+        'Article_Content': text
+    }
+
+# Crawl the article, classify its content, and return the result as JSON
+def categorize(url):
+    try:
+        article_content = crawURL(url)
+        result = process_api(article_content)
+        return result
+    except Exception as error:
+        return {"error_message": str(error)}
+
+# Main App
+st.title('Instant Category Classification')
+st.write("Unsure what category a CNN article belongs to? Our clever tool can help! Paste the URL below and press Enter. We'll sort it into one of our 5 categories in a flash! ⚡️")
+
+# Category information (modify content and bullet points as needed)
+categories = {
+    "Business": [
+        "Analyze market trends and investment opportunities.",
+        "Gain insights into company performance and industry news.",
+        "Stay informed about economic developments and regulations."
+    ],
+    "Health": [
+        "Discover healthy recipes and exercise tips.",
+        "Learn about the latest medical research and advancements.",
+        "Find resources for managing chronic conditions and improving well-being."
+    ],
+    "Sport": [
+        "Follow your favorite sports teams and athletes.",
+        "Explore news and analysis from various sports categories.",
+        "Stay updated on upcoming games and competitions."
+    ],
+    "Politics": [
+        "Get informed about current political events and policies.",
+        "Understand different perspectives on political issues.",
+        "Engage in discussions and debates about politics."
+    ],
+    "Entertainment": [
+        "Find recommendations for movies, TV shows, and music.",
+        "Explore reviews and insights from entertainment critics.",
+        "Stay updated on celebrity news and cultural trends."
+    ]
+}
+
+# Model information (modify descriptions as needed)
+models = {
+    "Logistic Regression": "A widely used statistical method for classification problems. It excels at identifying linear relationships between features and the target variable.",
+    "SVC (Support Vector Classifier)": "A powerful machine learning model that seeks a hyperplane that best separates data points of different classes. It is effective for high-dimensional data and can handle some non-linear relationships.",
+    "LSTM (Long Short-Term Memory)": "A type of recurrent neural network (RNN) particularly well-suited for sequential data like text or time series. LSTMs can effectively capture long-term dependencies within the data.",
+    "BERT (Bidirectional Encoder Representations from Transformers)": "A powerful pre-trained model based on the Transformer architecture. It excels at understanding the nuances of language and can be fine-tuned for various NLP tasks like text classification."
+}
+
+# CNN URL examples
+URL_Example = [
+    'https://edition.cnn.com/2012/01/31/health/frank-njenga-mental-health/index.html',
+    'https://edition.cnn.com/2024/04/30/entertainment/barbra-streisand-melissa-mccarthy-ozempic/index.html',
+    'https://edition.cnn.com/2024/04/30/sport/lebron-james-lakers-future-nba-spt-intl/index.html',
+    'https://edition.cnn.com/2024/04/30/business/us-home-prices-rose-in-february/index.html'
+]
+
+# Expander listing the categories that can be classified
+with st.expander("Category List"):
+    st.subheader("Available Categories:")
+    for category in categories.keys():
+        st.write(f"- {category}")
+    # Content for each category (separated by a horizontal line)
+    st.write("---")
+    for category, content in categories.items():
+        st.subheader(category)
+        for item in content:
+            st.write(f"- {item}")
+
+# Expander listing the models used in this project
+with st.expander("Available Models"):
+    st.subheader("List of Models:")
+    for model_name in models.keys():
+        st.write(f"- {model_name}")
+    st.write("---")
+    for model_name, description in models.items():
+        st.subheader(model_name)
+        st.write(description)
+
+with st.expander("URLs Example"):
+    for url in URL_Example:
+        st.write(f"- {url}")
+
+# Explain why this project only works for the CNN domain
+with st.expander("Tips", expanded=True):
+    st.write(
+        '''
+        This project works best with CNN articles right now.
+        Our web crawler is tailored to CNN's website, so it
+        can't quite understand other sites, which are built differently.
+        '''
+    )
+
+st.divider()  # draws a horizontal rule
+
+st.title('Dive in! See what category your CNN story belongs to 😉.')
+# URL input
+url = st.text_input("Find your favorite CNN story! Paste the URL and press ENTER 🔍.", placeholder='Ex: https://edition.cnn.com/2012/01/31/health/frank-njenga-mental-health/index.html')
+
+if url:
+    st.divider()
+    result = categorize(url)
+    if "error_message" in result:
+        st.error(result["error_message"])  # surface crawling/prediction failures
+    else:
+        article_content = result.get('Article_Content')
+        st.title('Article Content Fetched')
+        st.text_area("", value=article_content, height=400)  # render the article content
+        st.divider()
+        st.title('Predicted Results')
+        st.json({
+            "Logistic": {
+                "predicted_label": result.get("predicted_label_logistic"),
+                "probability": result.get("probability_logistic")
+            },
+            "SVC": {
+                "predicted_label": result.get("predicted_label_svm"),
+                "probability": result.get("probability_svm")
+            },
+            "LSTM": {
+                "predicted_label": result.get("predicted_label_lstm"),
+                "probability": result.get("probability_lstm")
+            },
+            "BERT": {
+                "predicted_label": result.get("predicted_label_bert"),
+                "probability": result.get("probability_bert")
+            }
+        })
+
+st.divider()
+
+# Category labels and corresponding counts in the training data
+categories = ["Sport", "Health", "Entertainment", "Politics", "Business"]
+counts = [5638, 4547, 2658, 2461, 1362]
+
+st.title("Training Data Category Distribution")
+
+st.write("Here's a breakdown of the number of articles in each category:")
+for category, count in zip(categories, counts):
+    st.write(f"- {category}: {count}")
+
+# Bar chart (a DataFrame indexed by category keeps the labels on the x-axis)
+st.bar_chart(pd.DataFrame({"articles": counts}, index=categories))
+
+st.divider()
+
+# ------------ Copyright Section ------------
+current_year = date.today().year  # keep the year current
+copyright_text = f"Copyright © {current_year}"
+st.title(copyright_text)
+
+author_names = ["Trần Thanh Phước (Mentor)", "Lương Ngọc Phương (Member)", "Trịnh Cẩm Minh (Member)"]
+st.write("Meet the minds behind the work!")
+for author in author_names:
+    if author == "Trịnh Cẩm Minh (Member)":
+        st.markdown("- [Trịnh Cẩm Minh (Member)](https://minhct.netlify.app/)")
+    else:
+        st.markdown(f"- {author}")
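Note: the app is launched with `streamlit run app.py`. One non-obvious step in `process_api` is the SVC probability: `SVM_Linear_Kernel.joblib` exposes only `decision_function`, so its margin scores are passed through the small, separately trained `svm_model.joblib` to obtain `predict_proba`. The training code is not part of this commit, so the sketch below is only an assumption about how such a Platt-style calibrator could be built — the synthetic data and all variable names are ours:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# Toy 5-class problem standing in for the TF-IDF article features
X, y = make_classification(n_samples=500, n_features=40, n_informative=10,
                           n_classes=5, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

base_svm = LinearSVC().fit(X_train, y_train)   # margin classifier, no predict_proba
scores = base_svm.decision_function(X_train)   # shape (n_samples, n_classes)

# Calibration model: maps decision margins to class probabilities,
# mirroring svm_model.predict_proba(SVM_model.decision_function(...)) in app.py
calibrator = LogisticRegression(max_iter=1000).fit(scores, y_train)
probs = calibrator.predict_proba(base_svm.decision_function(X_test))
print(probs[0].round(3))  # calibrated probabilities for one sample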
fine_tuned_bert_model1.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9666f4dc527e68a8ad8f528b0b946d86fc05f5cdf151cc35cd92b71932e22095
+size 267872438
requirements.txt
ADDED
@@ -0,0 +1,9 @@
+tensorflow==2.15.0
+joblib
+scikit-learn
+transformers==4.40.1
+streamlit
+numpy
+requests
+beautifulsoup4
+torch
svm_model.joblib
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5de24bdba9f805bde0ac379c14ab68f45da8419a0349f87c8ae1596766173f15
+size 1135
tokenizer.joblib
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89848b480e0cc8f4555bc6b9d79b9fd2f369a4b4d2dd2247561c16b7dfabf7f9
+size 20743621
tokenizer_bert/special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}
tokenizer_bert/tokenizer_config.json
ADDED
@@ -0,0 +1,57 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "DistilBertTokenizer",
+  "unk_token": "[UNK]"
+}
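Note: the two JSON files above pin the standard uncased BERT special tokens and their ids (0, 100, 101, 102, 103), matching what `app.py` loads via `DistilBertTokenizer.from_pretrained("tokenizer_bert")`. A quick sanity check, assuming the `tokenizer_bert/` directory from this commit is on disk:

from transformers import DistilBertTokenizer

tok = DistilBertTokenizer.from_pretrained("tokenizer_bert")
assert tok.pad_token_id == 0 and tok.unk_token_id == 100
assert tok.cls_token_id == 101 and tok.sep_token_id == 102
assert tok.mask_token_id == 103
print(tok("Stocks rallied after the earnings report."))  # input_ids + attention_mask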
tokenizer_bert/vocab.txt
ADDED
The diff for this file is too large to render.
vectorizer.joblib
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4fe068fd0f45e7dcef9baebd5d6813ab5d730cbd7018030728c6b00e66e03acc
+size 4122050
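Note: taken together, these artifacts let the classical models run without the Streamlit UI. A minimal offline sketch, assuming the files have been pulled from LFS into the working directory (the sample headline is ours; the label map comes from `decodedLabel` in app.py):

import joblib

vectorizer = joblib.load("vectorizer.joblib")          # shared TF-IDF vocabulary
logistic_model = joblib.load("Logistic_Model.joblib")  # Logistic Regression classifier

labels = {0: 'Business', 1: 'Entertainment', 2: 'Health', 3: 'Politics', 4: 'Sport'}
sample = "The central bank raised interest rates to curb inflation."
X = vectorizer.transform([sample])                     # sparse TF-IDF feature matrix
print(labels[int(logistic_model.predict(X)[0])])       # likely 'Business' for this headline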