Jiahuita commited on
Commit
84938ff
1 Parent(s): 700431a

Modified pipeline and config according to the model summary

Browse files
Files changed (2) hide show
  1. config.json +2 -2
  2. pipeline.py +17 -7
config.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ae409354ee5a0f6edfd67b5b838c072be95c352a1e1faca73a2473ee8ac15253
3
- size 286
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca780bf53be3df893073f50ad0e8218deae4f6c157eb3184480b43e3e86841fb
3
+ size 323
pipeline.py CHANGED
@@ -10,14 +10,16 @@ class NewsClassifierConfig(PretrainedConfig):
10
 
11
  def __init__(
12
  self,
13
- max_length=128,
14
- vocab_size=10000,
15
- hidden_size=64,
 
16
  num_labels=2,
17
  **kwargs
18
  ):
19
  self.max_length = max_length
20
  self.vocab_size = vocab_size
 
21
  self.hidden_size = hidden_size
22
  self.num_labels = num_labels
23
  super().__init__(**kwargs)
@@ -28,26 +30,34 @@ class NewsClassifier(PreTrainedModel):
28
 
29
  def __init__(self, config):
30
  super().__init__(config)
 
 
 
 
 
31
  self.model = load_model('news_classifier.h5')
32
  with open('tokenizer.json', 'r') as f:
33
  tokenizer_data = json.load(f)
34
  self.tokenizer = tokenizer_from_json(tokenizer_data)
35
 
36
  def forward(self, text_input):
 
 
 
37
  if isinstance(text_input, str):
38
  text_input = [text_input]
39
 
40
  sequences = self.tokenizer.texts_to_sequences(text_input)
41
  padded = pad_sequences(sequences, maxlen=self.config.max_length)
42
- predictions = self.model.predict(padded)
43
 
44
  results = []
45
  for pred in predictions:
46
- label = "foxnews" if pred[0] > 0.5 else "nbc"
47
- score = float(pred[0] if label == "foxnews" else 1 - pred[0])
48
  results.append({
49
  "label": label,
50
- "score": score
51
  })
52
 
53
  return results[0] if len(text_input) == 1 else results
 
10
 
11
  def __init__(
12
  self,
13
+ max_length=41, # Modified to match model input shape
14
+ vocab_size=74934, # Modified based on embedding layer size
15
+ embedding_dim=128, # Added to match model architecture
16
+ hidden_size=64, # Matches final LSTM layer
17
  num_labels=2,
18
  **kwargs
19
  ):
20
  self.max_length = max_length
21
  self.vocab_size = vocab_size
22
+ self.embedding_dim = embedding_dim
23
  self.hidden_size = hidden_size
24
  self.num_labels = num_labels
25
  super().__init__(**kwargs)
 
30
 
31
  def __init__(self, config):
32
  super().__init__(config)
33
+ self.model = None
34
+ self.tokenizer = None
35
+
36
+ def post_init(self):
37
+ """Load model and tokenizer after initialization"""
38
  self.model = load_model('news_classifier.h5')
39
  with open('tokenizer.json', 'r') as f:
40
  tokenizer_data = json.load(f)
41
  self.tokenizer = tokenizer_from_json(tokenizer_data)
42
 
43
  def forward(self, text_input):
44
+ if not self.model or not self.tokenizer:
45
+ self.post_init()
46
+
47
  if isinstance(text_input, str):
48
  text_input = [text_input]
49
 
50
  sequences = self.tokenizer.texts_to_sequences(text_input)
51
  padded = pad_sequences(sequences, maxlen=self.config.max_length)
52
+ predictions = self.model.predict(padded, verbose=0)
53
 
54
  results = []
55
  for pred in predictions:
56
+ score = float(pred[1])
57
+ label = "foxnews" if score > 0.5 else "nbc"
58
  results.append({
59
  "label": label,
60
+ "score": score if label == "foxnews" else 1 - score
61
  })
62
 
63
  return results[0] if len(text_input) == 1 else results