hakim committed
Commit cb7cafb · 1 Parent(s): 4de2404

data transformation added

config/config.yaml CHANGED
@@ -11,4 +11,10 @@ data_ingestion:
 data_validation:
   root_dir: artifacts/data_validation
   STATUS_FILE: artifacts/data_validation/status.txt
-  ALL_REQUIRED_FILES: ["train", "test", "validation"]
+  ALL_REQUIRED_FILES: ["train", "test", "validation"]
+
+
+data_transformation:
+  root_dir: artifacts/data_transformation
+  data_path: artifacts/data_ingestion/samsum_dataset
+  tokenizer_name: google/pegasus-cnn_dailymail
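
For context on how this new block is consumed: read_yaml in this template returns the parsed YAML with attribute-style key access (see self.config.data_transformation in ConfigurationManager below). A minimal sketch, assuming read_yaml is a thin wrapper over yaml.safe_load plus python-box's ConfigBox:

# Sketch: load config/config.yaml and read the new data_transformation block.
# Assumes a ConfigBox wrapper, which is what attribute access below implies.
from box import ConfigBox
import yaml

with open("config/config.yaml") as f:
    config = ConfigBox(yaml.safe_load(f))

print(config.data_transformation.tokenizer_name)  # google/pegasus-cnn_dailymail
print(config.data_transformation.data_path)       # artifacts/data_ingestion/samsum_dataset
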
main.py CHANGED
@@ -1,5 +1,6 @@
 from textsummarizer.pipeline.stage_01_data_ingestion import DataIngestionPipeline
 from textsummarizer.pipeline.stage_02_data_validation import DataValidationPipeline
+from textsummarizer.pipeline.stage_03_data_transformation import DataTransformationPipeline
 from textsummarizer.logging import logger

 STAGE_NAME = "Data Ingestion stage"
@@ -16,8 +17,19 @@ except Exception as e:
 STAGE_NAME = "Data Validation stage"
 try:
     logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
-    data_ingestion = DataValidationPipeline()
-    data_ingestion.main()
+    data_validation = DataValidationPipeline()
+    data_validation.main()
+    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
+except Exception as e:
+    logger.exception(e)
+    raise e
+
+
+STAGE_NAME = "Data Transformation stage"
+try:
+    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
+    data_transformation = DataTransformationPipeline()
+    data_transformation.main()
     logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
 except Exception as e:
     logger.exception(e)
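
The same try/log/raise scaffold now appears three times in main.py. A possible consolidation, sketched here rather than taken from the commit, is a single stage runner:

# Hypothetical refactor (not in this commit): one helper for the repeated
# stage boilerplate, reusing the exact log format from main.py.
from textsummarizer.logging import logger
from textsummarizer.pipeline.stage_03_data_transformation import DataTransformationPipeline

def run_stage(stage_name, pipeline_cls):
    try:
        logger.info(f">>>>>> stage {stage_name} started <<<<<<")
        pipeline_cls().main()
        logger.info(f">>>>>> stage {stage_name} completed <<<<<<\n\nx==========x")
    except Exception as e:
        logger.exception(e)
        raise e

run_stage("Data Transformation stage", DataTransformationPipeline)
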
research/data_transformation.ipynb ADDED
@@ -0,0 +1,205 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.chdir('../')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'c:\\\\mlops projects\\\\text-summarization'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%pwd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dataclasses import dataclass\n",
+    "from pathlib import Path\n",
+    "@dataclass(frozen=True)\n",
+    "class DataTransformationConfig:\n",
+    "    root_dir: Path\n",
+    "    data_path: Path\n",
+    "    tokenizer_name: Path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from textsummarizer.constants import *\n",
+    "from textsummarizer.utils.common import read_yaml, create_directories\n",
+    "\n",
+    "\n",
+    "class ConfigurationManager:\n",
+    "    def __init__(\n",
+    "        self,\n",
+    "        config_filepath = CONFIG_FILE_PATH,\n",
+    "        params_filepath = PARAMS_FILE_PATH):\n",
+    "\n",
+    "        self.config = read_yaml(config_filepath)\n",
+    "        self.params = read_yaml(params_filepath)\n",
+    "\n",
+    "        create_directories([self.config.artifacts_root])\n",
+    "\n",
+    "\n",
+    "\n",
+    "    def get_data_transformation_config(self) -> DataTransformationConfig:\n",
+    "        config = self.config.data_transformation\n",
+    "\n",
+    "        create_directories([config.root_dir])\n",
+    "\n",
+    "        data_transformation_config = DataTransformationConfig(\n",
+    "            root_dir=config.root_dir,\n",
+    "            data_path=config.data_path,\n",
+    "            tokenizer_name=config.tokenizer_name\n",
+    "        )\n",
+    "\n",
+    "        return data_transformation_config\n",
+    "\n",
+    ""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-08-11 18:13:05,488: INFO: config: PyTorch version 2.2.2+cu121 available.]\n",
+      "[2024-08-11 18:13:05,490: INFO: config: TensorFlow version 2.12.0 available.]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "from textsummarizer.logging import logger\n",
+    "from transformers import AutoTokenizer\n",
+    "from datasets import load_dataset, load_from_disk"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class DataTransformation:\n",
+    "    def __init__(self, config: DataTransformationConfig):\n",
+    "        self.config = config\n",
+    "        self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_name)\n",
+    "\n",
+    "\n",
+    "    def convert_examples_to_features(self, example_batch):\n",
+    "        input_encoding = self.tokenizer(example_batch['dialogue'], max_length=1024, truncation=True)\n",
+    "\n",
+    "        with self.tokenizer.as_target_tokenizer():\n",
+    "            target_encodings = self.tokenizer(example_batch['summary'], max_length=128, truncation=True)\n",
+    "\n",
+    "        return {\n",
+    "            'input_ids': input_encoding['input_ids'],\n",
+    "            'attention_mask': input_encoding['attention_mask'],\n",
+    "            'labels': target_encodings['input_ids']\n",
+    "        }\n",
+    "\n",
+    "    def convert(self):\n",
+    "        dataset_samsum = load_from_disk(self.config.data_path)\n",
+    "        dataset_samsum_pt = dataset_samsum.map(self.convert_examples_to_features, batched=True)\n",
+    "        dataset_samsum_pt.save_to_disk(os.path.join(self.config.root_dir, \"samsum_dataset\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-08-11 18:13:05,753: INFO: common: yaml file: config\\\\config.yaml loaded successfully]\n",
+      "[2024-08-11 18:13:05,757: INFO: common: yaml file: params.yaml loaded successfully]\n",
+      "[2024-08-11 18:13:05,758: INFO: common: created directory at: artifacts]\n",
+      "[2024-08-11 18:13:05,760: INFO: common: created directory at: artifacts/data_transformation]\n"
+     ]
+    },
+    {
+     "ename": "TypeError",
+     "evalue": "DataTransformation() takes no arguments",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[7], line 7\u001b[0m\n\u001b[0;32m      5\u001b[0m data_transformation\u001b[38;5;241m.\u001b[39mconvert()\n\u001b[0;32m      6\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m----> 7\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n",
+      "Cell \u001b[1;32mIn[7], line 4\u001b[0m\n\u001b[0;32m      2\u001b[0m config \u001b[38;5;241m=\u001b[39m ConfigurationManager()\n\u001b[0;32m      3\u001b[0m data_transformation_config \u001b[38;5;241m=\u001b[39m config\u001b[38;5;241m.\u001b[39mget_data_transformation_config()\n\u001b[1;32m----> 4\u001b[0m data_transformation \u001b[38;5;241m=\u001b[39m \u001b[43mDataTransformation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_transformation_config\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m      5\u001b[0m data_transformation\u001b[38;5;241m.\u001b[39mconvert()\n\u001b[0;32m      6\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n",
+      "\u001b[1;31mTypeError\u001b[0m: DataTransformation() takes no arguments"
+     ]
+    }
+   ],
+   "source": [
+    "try:\n",
+    "    config = ConfigurationManager()\n",
+    "    data_transformation_config = config.get_data_transformation_config()\n",
+    "    data_transformation = DataTransformation(config=data_transformation_config)\n",
+    "    data_transformation.convert()\n",
+    "except Exception as e:\n",
+    "    raise e"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
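
A note on the TypeError in cell 7: the class as shown does define __init__, so the likely cause is a stale or mistyped definition still live in the kernel when the cell ran (for example, the class cell edited but not re-executed, or __init__ mis-indented so it never became a method). The same message can be reproduced in isolation:

# Minimal repro of the notebook's error: when the live class definition has
# no __init__, object.__new__ rejects constructor arguments with this message.
class DataTransformation:  # stand-in for the stale definition
    pass

try:
    DataTransformation(config="anything")
except TypeError as e:
    print(e)  # DataTransformation() takes no arguments
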
src/textsummarizer/config/configuration.py CHANGED
@@ -1,7 +1,8 @@
 from textsummarizer.constants import *
 from textsummarizer.utils.common import read_yaml, create_directories
 from textsummarizer.entity.config_entity import (DataIngestionConfig,
-                                                 DataValidationConfig)
+                                                 DataValidationConfig,
+                                                 DataTransformationConfig)

 class ConfigurationManager:
     def __init__(
@@ -43,4 +44,19 @@ class ConfigurationManager:
         )

         return data_validation_config
+
+
+    def get_data_transformation_config(self) -> DataTransformationConfig:
+        config = self.config.data_transformation
+
+        create_directories([config.root_dir])
+
+        data_transformation_config = DataTransformationConfig(
+            root_dir=config.root_dir,
+            data_path=config.data_path,
+            tokenizer_name=config.tokenizer_name
+        )
+
+        return data_transformation_config
+
 
src/textsummarizer/conponents/data_tranformation.py ADDED
@@ -0,0 +1,28 @@
+import os
+from textsummarizer.logging import logger
+from transformers import AutoTokenizer
+from datasets import load_dataset, load_from_disk
+from textsummarizer.entity.config_entity import DataTransformationConfig
+
+class DataTransformation:
+    def __init__(self, config: DataTransformationConfig):
+        self.config = config
+        self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_name)
+
+
+    def convert_examples_to_features(self, example_batch):
+        input_encoding = self.tokenizer(example_batch['dialogue'], max_length=1024, truncation=True)
+
+        with self.tokenizer.as_target_tokenizer():
+            target_encodings = self.tokenizer(example_batch['summary'], max_length=128, truncation=True)
+
+        return {
+            'input_ids': input_encoding['input_ids'],
+            'attention_mask': input_encoding['attention_mask'],
+            'labels': target_encodings['input_ids']
+        }
+
+    def convert(self):
+        dataset_samsum = load_from_disk(self.config.data_path)
+        dataset_samsum_pt = dataset_samsum.map(self.convert_examples_to_features, batched=True)
+        dataset_samsum_pt.save_to_disk(os.path.join(self.config.root_dir, "samsum_dataset"))
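
One forward-compatibility note on the component above: as_target_tokenizer() is deprecated in recent transformers releases in favor of the text_target argument. A sketch of the equivalent encoding, with the same tokenizer and truncation settings:

# Sketch (not part of this commit): target encoding without the deprecated
# context manager, transformers >= 4.22 style; labels come out the same.
def convert_examples_to_features(self, example_batch):
    input_encoding = self.tokenizer(example_batch['dialogue'], max_length=1024, truncation=True)
    target_encodings = self.tokenizer(text_target=example_batch['summary'], max_length=128, truncation=True)
    return {
        'input_ids': input_encoding['input_ids'],
        'attention_mask': input_encoding['attention_mask'],
        'labels': target_encodings['input_ids']
    }
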
src/textsummarizer/entity/config_entity.py CHANGED
@@ -15,3 +15,12 @@ class DataValidationConfig:
     root_dir: Path
     STATUS_FILE: str
     ALL_REQUIRED_FILES: list
+
+
+@dataclass(frozen=True)
+class DataTransformationConfig:
+    root_dir: Path
+    data_path: Path
+    tokenizer_name: Path
+
+
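
A small typing nit on the entity above: tokenizer_name carries a Hugging Face model identifier (google/pegasus-cnn_dailymail), not a filesystem path, so str would be the more accurate annotation. Dataclasses do not enforce annotations, so behavior is unchanged either way:

# Hypothetical tightened annotation; not what the commit ships.
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    tokenizer_name: str  # HF model id, e.g. "google/pegasus-cnn_dailymail"
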
src/textsummarizer/pipeline/stage_03_data_transformation.py ADDED
@@ -0,0 +1,12 @@
+from textsummarizer.conponents.data_tranformation import DataTransformation
+from textsummarizer.config.configuration import ConfigurationManager
+
+class DataTransformationPipeline:
+    def __init__(self):
+        pass
+
+    def main(self):
+        config = ConfigurationManager()
+        data_transformation_config = config.get_data_transformation_config()
+        data_transformation = DataTransformation(config=data_transformation_config)
+        data_transformation.convert()