gagan3012 commited on
Commit
62828bf
1 Parent(s): 0d07fc7

Pipeline updates

Browse files
Files changed (4) hide show
  1. .dvc/config +2 -0
  2. dvc.yaml +0 -1
  3. src/data/make_dataset.py +8 -6
  4. src/models/model.py +3 -3
.dvc/config CHANGED
@@ -0,0 +1,2 @@
 
 
1
+ ['remote "origin"']
2
+ url = https://dagshub.com/gagan3012/summarization.dvc
dvc.yaml CHANGED
@@ -22,7 +22,6 @@ stages:
22
  process_data:
23
  cmd: python src/data/make_dataset.py
24
  deps:
25
- - data/raw
26
  - src/data/make_dataset.py
27
  outs:
28
  - data/processed:
22
  process_data:
23
  cmd: python src/data/make_dataset.py
24
  deps:
 
25
  - src/data/make_dataset.py
26
  outs:
27
  - data/processed:
src/data/make_dataset.py CHANGED
@@ -2,14 +2,16 @@ from datasets import load_dataset
2
  import pandas as pd
3
 
4
 
5
- def make_dataset(dataset='cnn_dailymail', split='train', version="3.0.0"):
6
  """make dataset for summarisation"""
7
- dataset = load_dataset(dataset, split=split, script_version=version)
8
  df = pd.DataFrame()
9
- df['input_text'] = dataset['concepts']
10
- df['output_text'] = dataset['target']
11
- return df
12
 
13
 
14
  if __name__ == '__main__':
15
- make_dataset(dataset='cnn_dailymail', split='train', version="3.0.0")
 
 
2
  import pandas as pd
3
 
4
 
5
def make_dataset(dataset='cnn_dailymail', split='train'):
    """Download one split of a summarisation dataset and save it as CSV.

    Loads the requested split (config version '3.0.0', the cnn_dailymail
    config name), maps articles to ``input_text`` and reference summaries
    to ``output_text``, and writes ``data/processed/<split>.csv``.

    Args:
        dataset: HuggingFace dataset name (default ``'cnn_dailymail'``).
        split: split to fetch — ``'train'``, ``'test'`` or ``'validation'``.

    Returns:
        The assembled ``pandas.DataFrame`` (also written to disk).
    """
    data = load_dataset(dataset, '3.0.0', split=split)
    df = pd.DataFrame()
    df['input_text'] = data['article']
    df['output_text'] = data['highlights']
    # Fix 1: the original called .format(split, split) against a single
    # placeholder — the second argument was silently ignored.
    # Fix 2: write relative to the repo root so the DVC stage
    # (cmd: python src/data/make_dataset.py, outs: data/processed)
    # tracks the output on any machine; the previous absolute
    # C:/Users/... path only existed on the author's computer.
    df.to_csv('data/processed/{}.csv'.format(split))
    # Restore the return the previous revision had, so callers can use
    # the frame directly without re-reading the CSV (backward-compatible).
    return df
12
 
13
 
14
if __name__ == '__main__':
    # Materialise every split in one run, matching the DVC stage that
    # expects the full data/processed directory as its output.
    for data_split in ('train', 'test', 'validation'):
        make_dataset(dataset='cnn_dailymail', split=data_split)
src/models/model.py CHANGED
@@ -303,9 +303,9 @@ class Summarization:
303
  tokenizer=self.tokenizer, model=self.model, output=outputdir
304
  )
305
 
306
- # logger = MLFlowLogger(experiment_name="Summarization",tracking_uri="https://dagshub.com/gagan3012/summarization.mlflow")
307
 
308
- logger = DAGsHubLogger()
309
 
310
  early_stop_callback = (
311
  [
@@ -324,7 +324,7 @@ class Summarization:
324
  gpus = 1 if use_gpu else 0
325
 
326
  trainer = Trainer(
327
- logger=logger,
328
  callbacks=early_stop_callback,
329
  max_epochs=max_epochs,
330
  gpus=gpus,
303
  tokenizer=self.tokenizer, model=self.model, output=outputdir
304
  )
305
 
306
+ MLlogger = MLFlowLogger(experiment_name="Summarization",tracking_uri="https://dagshub.com/gagan3012/summarization.mlflow")
307
 
308
+ logger = DAGsHubLogger(metrics_path='reports/metrics.txt')
309
 
310
  early_stop_callback = (
311
  [
324
  gpus = 1 if use_gpu else 0
325
 
326
  trainer = Trainer(
327
+ logger=[logger,MLlogger],
328
  callbacks=early_stop_callback,
329
  max_epochs=max_epochs,
330
  gpus=gpus,