gagan3012 committed
Commit 3f8d76d
1 Parent(s): 6730e31

added params

params.yml ADDED
@@ -0,0 +1,10 @@
+data: cnn_dailymail
+batch_size: 4
+num_workers: 2
+model_type: t5
+model_name: t5-base
+learning_rate: 0.0001
+epochs: 5
+source_dir: src
+model_dir: models
+metric: rouge
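
Every script below reads this file with yaml.safe_load, which returns a plain dict. Note that PyYAML parses a bare scientific-notation scalar such as 1e-4 as a string rather than a float, so the learning rate is kept in decimal form here. A minimal sketch:

import yaml

with open("params.yml") as f:
    params = yaml.safe_load(f)

print(params['model_name'])     # 't5-base'
print(params['learning_rate'])  # 0.0001 (a float; a bare '1e-4' would load as a string)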
src/data/make_dataset.py CHANGED
@@ -1,3 +1,4 @@
+import yaml
 from datasets import load_dataset
 import pandas as pd
 
@@ -8,10 +9,13 @@ def make_dataset(dataset='cnn_dailymail', split='train'):
     df = pd.DataFrame()
     df['article'] = dataset['article']
     df['highlights'] = dataset['highlights']
-    df.to_csv('C:/Users/gbhat/Documents/GitHub/summarization/data/raw/{}.csv'.format(split))
+    df.to_csv('data/raw/{}.csv'.format(split))
 
 
 if __name__ == '__main__':
-    make_dataset(dataset='cnn_dailymail', split='train')
-    make_dataset(dataset='cnn_dailymail', split='test')
-    make_dataset(dataset='cnn_dailymail', split='validation')
+    with open("params.yml") as f:
+        params = yaml.safe_load(f)
+
+    make_dataset(dataset=params['data'], split='train')
+    make_dataset(dataset=params['data'], split='test')
+    make_dataset(dataset=params['data'], split='validation')
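
Since the hard-coded Windows path is replaced with a relative one, the script now assumes it runs from the repository root. A minimal usage sketch:

from src.data.make_dataset import make_dataset

# Run from the repository root so 'data/raw/' resolves correctly;
# this writes data/raw/train.csv with 'article' and 'highlights' columns
# (pandas also stores its index as an unnamed first column).
make_dataset(dataset='cnn_dailymail', split='train')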
src/data/process_data.py CHANGED
@@ -2,10 +2,10 @@ import pandas as pd
 
 
 def process_data(split='train'):
-    df = pd.read_csv('C:/Users/gbhat/Documents/GitHub/summarization/data/raw/{}.csv'.format(split))
+
+    df = pd.read_csv('data/raw/{}.csv'.format(split))
     df.columns = ['Unnamed: 0', 'input_text', 'output_text']
-    print(df.columns)
-    df.to_csv('C:/Users/gbhat/Documents/GitHub/summarization/data/processed/{}.csv'.format(split))
+    df.to_csv('data/processed/{}.csv'.format(split))
 
 
 if __name__ == '__main__':
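
The three-name rename works because to_csv in make_dataset writes the DataFrame index as an unnamed first column. A minimal round-trip sketch of that pandas behavior:

import pandas as pd

df = pd.DataFrame({'article': ['a'], 'highlights': ['b']})
df.to_csv('example.csv')  # index=True by default, stored as an unnamed first column
print(pd.read_csv('example.csv').columns.tolist())
# ['Unnamed: 0', 'article', 'highlights'] -- hence the rename above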
src/models/evaluate_model.py CHANGED
@@ -1,16 +1,22 @@
 import dagshub
+import yaml
 
 from src.models.model import Summarization
 import pandas as pd
 
+
 def evaluate_model():
     """
     Evaluate model using rouge measure
     """
-    test_df = pd.load_csv('../../data/processed/test.csv')
+    with open("params.yml") as f:
+        params = yaml.safe_load(f)
+
+    test_df = pd.read_csv('data/processed/test.csv')
     model = Summarization()
-    model.load_model()
-    results = model.evaluate(test_df=test_df,metrics="rouge")
+    model.load_model(model_dir=params['model_dir'])
+    results = model.evaluate(test_df=test_df, metrics=params['metric'])
+
     with dagshub.dagshub_logger() as logger:
         logger.log_metrics(results)
     return results
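
dagshub.dagshub_logger is the DagsHub client's file-based logging context manager; as a rough sketch of what the final block does (the metric values are illustrative, and the default output file name is an assumption):

import dagshub

# Illustrative values only; model.evaluate(...) returns the real scores.
results = {'rouge1': 0.41, 'rouge2': 0.19, 'rougeL': 0.28}

with dagshub.dagshub_logger() as logger:
    logger.log_metrics(results)  # written to metrics.csv by default (assumed)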
src/models/model.py CHANGED
@@ -94,7 +94,8 @@ class PLDataModule(LightningDataModule):
         source_max_token_len: int = 512,
         target_max_token_len: int = 512,
         batch_size: int = 4,
-        split: float = 0.1
+        split: float = 0.1,
+        num_workers: int = 2
     ):
         """
         :param data_df:
@@ -112,6 +113,7 @@ class PLDataModule(LightningDataModule):
         self.target_max_token_len = target_max_token_len
         self.source_max_token_len = source_max_token_len
         self.tokenizer = tokenizer
+        self.num_workers = num_workers
 
     def setup(self, stage=None):
         self.train_dataset = DataModule(
@@ -130,26 +132,26 @@ class PLDataModule(LightningDataModule):
     def train_dataloader(self):
         """ training dataloader """
         return DataLoader(
-            self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=2
+            self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers
         )
 
     def test_dataloader(self):
         """ test dataloader """
         return DataLoader(
-            self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=2
+            self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers
         )
 
     def val_dataloader(self):
         """ validation dataloader """
         return DataLoader(
-            self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=2
+            self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers
         )
 
 
 class LightningModel(LightningModule):
     """ PyTorch Lightning Model class"""
 
-    def __init__(self, tokenizer, model, output: str = "outputs"):
+    def __init__(self, tokenizer, model, learning_rate, adam_epsilon, output: str = "outputs"):
         """
         initiates a PyTorch Lightning Model
         Args:
@@ -236,7 +238,7 @@ class LightningModel(LightningModule):
                 "weight_decay": 0.0,
             },
         ]
-        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
+        optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon)
         self.opt = optimizer
         return [optimizer]
 
@@ -282,6 +284,9 @@ class Summarization:
         use_gpu: bool = True,
         outputdir: str = "models",
         early_stopping_patience_epochs: int = 0,  # 0 to disable early stopping feature
+        learning_rate: float = 0.0001,
+        adam_epsilon: float = 0.01,
+        num_workers: int = 2
     ):
         """
         trains T5/MT5 model on custom dataset
@@ -298,6 +303,8 @@ class Summarization:
             early_stopping_patience_epochs (int, optional): monitors val_loss on epoch end and stops training,
                 if val_loss does not improve after the specified number of epochs. set 0 to disable early stopping.
                 Defaults to 0 (disabled)
+            :param learning_rate: learning rate for the AdamW optimizer
+            :param adam_epsilon: epsilon term for the AdamW optimizer
         """
         self.target_max_token_len = target_max_token_len
         self.data_module = PLDataModule(
@@ -307,10 +314,12 @@ class Summarization:
             batch_size=batch_size,
             source_max_token_len=source_max_token_len,
             target_max_token_len=target_max_token_len,
+            num_workers=num_workers,
         )
 
         self.T5Model = LightningModel(
-            tokenizer=self.tokenizer, model=self.model, output=outputdir
+            tokenizer=self.tokenizer, model=self.model, output=outputdir,
+            learning_rate=learning_rate, adam_epsilon=adam_epsilon
        )
 
         MLlogger = MLFlowLogger(experiment_name="Summarization",
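
The tail of optimizer_grouped_parameters visible above follows the common Transformers convention of exempting bias and LayerNorm weights from weight decay. A minimal sketch of that pattern, assuming the unshown head of the list matches the convention (the 0.01 decay value and the torch.optim.AdamW import are assumptions; only the zero-decay group appears in the diff). The switch from self.hparams.learning_rate to self.learning_rate likewise assumes __init__ now stores the new arguments as plain attributes.

from torch.optim import AdamW  # assumed import; not shown in the hunk

def configure_adamw(model, learning_rate=0.0001, adam_epsilon=0.01):
    # Exempt bias and LayerNorm weights from weight decay (standard pattern).
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.01,  # assumed; only the 0.0 group is visible above
        },
        {
            "params": [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    return AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)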
src/models/predict_model.py CHANGED
@@ -1,5 +1,6 @@
 from src.data.make_dataset import make_dataset
 from .model import Summarization
+import pandas as pd
 
 def predict_model(text):
     """
@@ -12,6 +13,6 @@ def predict_model(text):
 
 
 if __name__ == '__main__':
-    text = make_dataset(split="test")['input_text'][0]
+    text = pd.read_csv('data/processed/test.csv')['input_text'][0]
     pre_summary = predict_model(text)
     print(pre_summary)
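
The __main__ block now takes the first processed test article from CSV instead of rebuilding the dataset. Since predict_model (body elided in the hunk) takes raw text and returns the generated summary, a direct call looks like:

from src.models.predict_model import predict_model

summary = predict_model("Some long news article text ...")
print(summary)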
src/models/train_model.py CHANGED
@@ -1,3 +1,5 @@
+import yaml
+
 from src.models.model import Summarization
 import pandas as pd
 
@@ -6,14 +8,22 @@ def train_model():
     """
     Train the model
     """
+    with open("params.yml") as f:
+        params = yaml.safe_load(f)
+
     # Load the data
-    train_df = pd.read_csv('../../data/processed/train.csv')
-    eval_df = pd.read_csv('../../data/processed/validation.csv')
+    train_df = pd.read_csv('data/processed/train.csv')
+    eval_df = pd.read_csv('data/processed/validation.csv')
 
     model = Summarization()
-    model.from_pretrained('t5','t5-base')
-    model.train(train_df=train_df, eval_df=eval_df, batch_size=4, max_epochs=3, use_gpu=True)
-    model.save_model()
+    model.from_pretrained(model_type=params['model_type'], model_name=params['model_name'])
+
+    model.train(train_df=train_df, eval_df=eval_df,
+                batch_size=params['batch_size'], max_epochs=params['epochs'],
+                use_gpu=params.get('use_gpu', True), learning_rate=params['learning_rate'],
+                num_workers=params['num_workers'])
+
+    model.save_model(model_dir=params['model_dir'])
 
 
 if __name__ == '__main__':
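
With the params.yml added in this commit, the parameterized call above resolves to concrete values (use_gpu falls back to True, since params.yml carries no such key); spelled out:

model = Summarization()
model.from_pretrained(model_type='t5', model_name='t5-base')
model.train(train_df=train_df, eval_df=eval_df,
            batch_size=4, max_epochs=5,
            use_gpu=True, learning_rate=0.0001, num_workers=2)
model.save_model(model_dir='models')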