gagan3012 committed on
Commit
2466d7f
1 Parent(s): 74b874e
Files changed (3) hide show
  1. .gitignore +1 -0
  2. data.dvc +4 -4
  3. src/data/process_data.py +3 -3
.gitignore CHANGED
@@ -91,3 +91,4 @@ coverage.xml
91
 
92
  .idea
93
  .vscode
 
 
91
 
92
  .idea
93
  .vscode
94
+ /data
data.dvc CHANGED
@@ -1,5 +1,5 @@
1
  outs:
2
- - md5: 2ab20ac1b58df875a590b07d0e04eb5b.dir
3
- path: data/raw
4
- size: 1359144987
5
- nfiles: 3
 
1
  outs:
2
+ - md5: 4088e0a288132d141c28bd020548d107.dir
3
+ path: data
4
+ size: 2720315628
5
+ nfiles: 6
src/data/process_data.py CHANGED
@@ -3,12 +3,12 @@ import pandas as pd
3
 
4
  def process_data(split='train'):
5
  df = pd.read_csv('C:/Users/gbhat/Documents/GitHub/summarization/data/raw/{}.csv'.format(split))
6
- df.rename(columns={"article": "input_text", "highlights": "output_text"})
7
- print(df.shape())
8
  df.to_csv('C:/Users/gbhat/Documents/GitHub/summarization/data/processed/{}.csv'.format(split))
9
 
10
 
11
- if __name__ == '__name__':
12
  process_data(split='train')
13
  process_data(split='test')
14
  process_data(split='validation')
 
3
 
4
  def process_data(split='train'):
5
  df = pd.read_csv('C:/Users/gbhat/Documents/GitHub/summarization/data/raw/{}.csv'.format(split))
6
+ df.columns = ['Unnamed: 0', 'input_text', 'output_text']
7
+ print(df.columns)
8
  df.to_csv('C:/Users/gbhat/Documents/GitHub/summarization/data/processed/{}.csv'.format(split))
9
 
10
 
11
+ if __name__ == '__main__':
12
  process_data(split='train')
13
  process_data(split='test')
14
  process_data(split='validation')