ai4anshu committed on
Commit d4caa5c
1 Parent(s): bdb70cd

Upload 8 files

Files changed (8)
  1. Makefile +26 -0
  2. README.md +60 -0
  3. build_dataset.py +81 -0
  4. config.yaml +11 -0
  5. environment.yaml +16 -0
  6. fine-tuning.py +73 -0
  7. inference.py +57 -0
  8. utils.py +57 -0
Makefile ADDED
@@ -0,0 +1,26 @@
+ .PHONY: data train eval inference run clean zip
+
+ data:
+ 	@echo "Creating dataset from google/sentence_compression.."
+ 	python -m build_dataset
+
+ train:
+ 	@echo "Training google/t5-small model for sentence compression.."
+ 	python -m fine-tuning
+
+ eval:
+ 	@echo "Evaluation on test set.."
+ 	python -m utils
+
+ inference:
+ 	@echo "Performing model inference on evaluation data.."
+ 	python -m inference
+
+ run: clean data train eval inference
+
+ clean:
+ 	@find . -name "*.pyc" -exec rm {} \;
+ 	@rm -rf dataset/preprocessed/* checkpoints/* results/*
+
+ zip:
+ 	@tar --exclude=".[^/]*" -czvf "AnshuKumar-RingCentral-$(shell date +"%Y%m%d").tar.gz" *
README.md ADDED
@@ -0,0 +1,60 @@
+ ## Getting Started
+
+ ### Installation
+
+ 1. Create the conda environment:
+ ```
+ conda env create --name NAME --file=environment.yaml
+ ```
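+
+ 2. Activate the environment (replace `NAME` with the name used above):
+ ```
+ conda activate NAME
+ ```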
+
+ The project is built around several scripts that follow a typical machine learning workflow: data preparation, model training, evaluation, and inference. The `google/t5-small` model is fine-tuned on the Google sentence-compression dataset for `10` epochs. Inference is then run on the evaluation data, and the performance metrics and evaluation results are stored in the `results` subdirectory of the project directory.
+
+ I added a Makefile that can be used to run the Python scripts separately with the following bash commands.
+
+ ```bash
+ make data
+ make train
+ make eval
+ make inference
+ ```
+
+ `run` is a make target that runs the entire project end to end.
+
+ ```bash
+ make run
+ ```
+
+ `clean` is a make target that removes the outputs of previous runs.
+
+ ```bash
+ make clean
+ ```
+
+ Performance metrics are stored in the `performance.json` file inside the `results` directory.
+
+ ```json
+ {
+     "rouge1": 0.79689240266461,
+     "rouge2": 0.7606140631154827,
+     "rougeL": 0.7733855633904199,
+     "rougeLsum": 0.7734703253159519
+ }
+ ```
+
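+ For reference, these scores are computed with the `evaluate` library's ROUGE metric, as in `inference.py`. A minimal sketch with toy strings (the real predictions and references come from `eval_result.csv`):
+
+ ```python
+ import evaluate
+
+ # Toy strings for illustration only.
+ predictions = ["the cat sat on the mat"]
+ references = ["the cat sat on a mat"]
+
+ rouge = evaluate.load("rouge")
+ scores = rouge.compute(predictions=predictions, references=references)
+ print(scores)  # keys: rouge1, rouge2, rougeL, rougeLsum
+ ```
+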
+ The predictions on the evaluation data are saved to `eval_result.csv`:
+
+ | original | compressed | predictions |
+ |-----------|------------|-------------|
+ | sentence1 | compress1 | prediction1 |
+ | sentence2 | compress2 | prediction2 |
+ | : | : | : |
+
+ ### References:
+ 1. https://github.com/google-research-datasets/sentence-compression
+ 2. https://huggingface.co/docs/transformers/en/tasks/summarization
+
+ ### Note:
+ Download the trained checkpoint from the Google Drive link: [checkpoint](https://drive.google.com/drive/folders/1yrl0VtmM9BtT4aU2Z5vLs6doz35MMxvM?usp=drive_link)
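+
+ Once downloaded, the checkpoint folder can be used directly with the `transformers` summarization pipeline, mirroring what `inference.py` does. A minimal sketch (the checkpoint folder name below is a placeholder, not a real path from this repo):
+
+ ```python
+ from transformers import pipeline
+
+ # Placeholder path: point this at the downloaded checkpoint directory.
+ model_path = "checkpoints/checkpoint-XXXX"
+ summarizer = pipeline("summarization", model=model_path)
+
+ sentence = "The quick brown fox jumped over the lazy dog in the park on a sunny afternoon."
+ text = "summarize the following sentence: " + sentence
+ print(summarizer(text)[0]["summary_text"])
+ ```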
build_dataset.py ADDED
@@ -0,0 +1,81 @@
+ import os
+ import shutil
+ import glob
+ import json
+ import csv
+ import gzip
+ import yaml
+ from git import Repo
+
+
+ fieldnames = ['original', 'compressed']
+
+ def to_csv_record(writer, buffer):
+     # Each raw record is one JSON object; keep only the source sentence and its compression.
+     record = json.loads(buffer)
+     writer.writerow(dict(
+         original=record['graph']['sentence'],
+         compressed=record['compression']['text']))
+
+ def build_dataset(rawdata_dir, preprocessed_data_dir):
+     print("Data Preparation...")
+     os.makedirs(preprocessed_data_dir, exist_ok=True)
+     # Training split: records in the train files are separated by blank lines.
+     with open(os.path.join(preprocessed_data_dir, 'training_data.csv'), 'w') as csvfile:
+         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+         writer.writeheader()
+         for rawdata_file in glob.glob(f'{rawdata_dir}/data/**train**.json'):
+             with open(rawdata_file) as raw_contents:
+                 buffer = ''
+                 for line in raw_contents:
+                     if line.strip() == '':
+                         to_csv_record(writer, buffer)
+                         buffer = ''
+                     else:
+                         buffer += line
+                 if len(buffer) > 0:
+                     to_csv_record(writer, buffer)
+
+     # Evaluation split.
+     with open(os.path.join(preprocessed_data_dir, 'eval_data.csv'), 'w') as csvfile:
+         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+         writer.writeheader()
+         with open(f'{rawdata_dir}/data/comp-data.eval.json') as raw_contents:
+             buffer = ''
+             for line in raw_contents:
+                 if line.strip() == '':
+                     to_csv_record(writer, buffer)
+                     buffer = ''
+                 else:
+                     buffer += line
+             if len(buffer) > 0:
+                 to_csv_record(writer, buffer)
+
+ def decompressing_rawdata(rawdata_dir):
+     print("Decompression...")
+     compressed_files = glob.glob(rawdata_dir + "/data/*.json.gz")
+     for compressed_file_path in compressed_files:
+         # Strip the .gz suffix to get the output filename.
+         output_file_path = os.path.splitext(compressed_file_path)[0]
+         with gzip.open(compressed_file_path, 'rb') as comp_file:
+             compressed_content = comp_file.read()
+         with open(output_file_path, 'wb') as output_file:
+             output_file.write(compressed_content)
+         os.remove(compressed_file_path)
+
+ def download_rawdata(git_url, rawdata_dir):
+     os.makedirs(rawdata_dir, exist_ok=True)
+     print("Data Cloning...")
+     current_dir = os.getcwd()
+     try:
+         os.chdir(rawdata_dir)
+         Repo.clone_from(git_url, '.')
+     except Exception as e:
+         print("Error:", e)
+     finally:
+         os.chdir(current_dir)
+
+ if __name__ == "__main__":
+     config = yaml.safe_load(open("config.yaml", "r"))
+     PROJECT_DIR = eval(config["SENTENCE_COMPRESSION"]["PROJECT_DIR"])
+     rawdata_git = config["SENTENCE_COMPRESSION"]["DATA"]["RAW_DATA"]
+     preprocessed_data_dir = os.path.join(PROJECT_DIR, config["SENTENCE_COMPRESSION"]["DATA"]["CLEAN_DATA"])
+     rawdata_dir = os.path.join(PROJECT_DIR, config["SENTENCE_COMPRESSION"]["DATA"]["RAW_DIR"])
+     download_rawdata(rawdata_git, rawdata_dir)
+     decompressing_rawdata(rawdata_dir)
+     build_dataset(rawdata_dir, preprocessed_data_dir)
+     shutil.rmtree(rawdata_dir)
config.yaml ADDED
@@ -0,0 +1,11 @@
+ SENTENCE_COMPRESSION:
+   PROJECT_DIR: os.getcwd()
+   DATA:
+     RAW_DATA: https://github.com/google-research-datasets/sentence-compression.git
+     RAW_DIR: dataset/rawdata
+     CLEAN_DATA: dataset/preprocessed
+   TRAINING:
+   INFERENCE:
+     MODEL_PATH: checkpoints
+   OUTPUT:
+     RESULT: results
environment.yaml ADDED
@@ -0,0 +1,16 @@
+ name: compression
+ dependencies:
+   - python=3.10.10
+   - pip=23.3.1
+   - pip:
+       - transformers==4.37.2
+       - seaborn==0.13.2
+       - scikit-learn==1.4.0
+       - pandas==2.2.1
+       - GitPython==3.1.43
+       - torch==2.2.2
+       - evaluate==0.4.2
+       - accelerate==0.27.0
+       - absl-py==2.1.0
+       - nltk==3.8.1
+       - rouge_score==0.1.2
fine-tuning.py ADDED
@@ -0,0 +1,73 @@
+ import os
+ import yaml
+ import numpy as np
+ import pandas as pd
+ import evaluate
+ from sklearn.model_selection import train_test_split
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
+
+
+ checkpoint = "google-t5/t5-small"
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+
+ prefix = "summarize the following sentence: "
+
+ def preprocess_function(examples):
+     # Tokenize the source sentence (with the task prefix) and the compressed target.
+     inputs = prefix + examples["original"]
+     model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
+     labels = tokenizer(text_target=examples["compressed"], max_length=128, truncation=True)
+     model_inputs["labels"] = labels["input_ids"]
+     return model_inputs
+
+ def compute_metrics(eval_pred):
+     predictions, labels = eval_pred
+     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
+     # Replace -100 (ignored positions) with the pad token id before decoding the labels.
+     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+     rouge = evaluate.load("rouge")
+     result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
+     prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
+     result["gen_len"] = np.mean(prediction_lens)
+     return {k: round(v, 4) for k, v in result.items()}
+
+ def main():
+     print("Data Loading...")
+     config = yaml.safe_load(open("config.yaml", "r"))
+     PROJECT_DIR = eval(config["SENTENCE_COMPRESSION"]["PROJECT_DIR"])
+     data_dir = os.path.join(PROJECT_DIR, config["SENTENCE_COMPRESSION"]["DATA"]["CLEAN_DATA"])
+     data = pd.read_csv(os.path.join(data_dir, 'training_data.csv'))
+     print("Tokenization started...")
+     data_preprocessed = data.apply(preprocess_function, axis=1)
+     print("Test data preprocessing...")
+     train_tokenized, test_tokenized = train_test_split(data_preprocessed, test_size=0.2)
+     data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)
+     print("Model Loading...")
+     model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
+     training_args = Seq2SeqTrainingArguments(
+         output_dir="checkpoints",
+         evaluation_strategy="epoch",
+         learning_rate=2e-5,
+         per_device_train_batch_size=16,
+         per_device_eval_batch_size=4,
+         weight_decay=0.01,
+         save_total_limit=3,
+         num_train_epochs=10,
+         predict_with_generate=True,
+         fp16=True,
+         push_to_hub=False,
+     )
+     trainer = Seq2SeqTrainer(
+         model=model,
+         args=training_args,
+         train_dataset=train_tokenized.values,
+         eval_dataset=test_tokenized.values,
+         tokenizer=tokenizer,
+         data_collator=data_collator,
+         compute_metrics=compute_metrics,
+     )
+     trainer.train()
+
+ if __name__ == "__main__":
+     main()
inference.py ADDED
@@ -0,0 +1,57 @@
+ import os
+ import yaml
+ import json
+ import pandas as pd
+
+ import evaluate
+ from transformers import pipeline
+
+
+ def load_pipeline(model_path):
+     summarizer = pipeline("summarization", model=model_path, device=0)
+     return summarizer
+
+ def inference(summarizer, eval_data):
+     # Use the same task prefix as during fine-tuning.
+     prompt = "summarize the following sentence: "
+     sentences = eval_data['original'].tolist()
+     compressed = eval_data['compressed'].tolist()
+     predictions = []
+     for sent in sentences:
+         text = prompt + sent
+         out = summarizer(text)
+         predictions.append(out[0]['summary_text'])
+     return {"original": sentences, "compressed": compressed, "predictions": predictions}
+
+ def compute_performance(eval_data):
+     rouge = evaluate.load('rouge')
+     predictions = eval_data['predictions']
+     references = eval_data['compressed']
+     # Compute the ROUGE scores of the predictions against the reference compressions.
+     results = rouge.compute(predictions=predictions, references=references)
+     print(results)
+     return results
+
+ def get_latest_checkpoint(checkpoint_dir):
+     # Checkpoint folders are named "checkpoint-<step>"; pick the one with the highest step.
+     subdirs = [name for name in os.listdir(checkpoint_dir) if os.path.isdir(os.path.join(checkpoint_dir, name)) and name.startswith("checkpoint-")]
+     checkpoint_numbers = [int(subdir.split("-")[1]) for subdir in subdirs]
+     latest_checkpoint = "checkpoint-" + str(max(checkpoint_numbers))
+     return latest_checkpoint
+
+ if __name__ == "__main__":
+     config = yaml.safe_load(open("config.yaml", "r"))
+     PROJECT_DIR = eval(config["SENTENCE_COMPRESSION"]["PROJECT_DIR"])
+     data_dir = os.path.join(PROJECT_DIR, config["SENTENCE_COMPRESSION"]["DATA"]["CLEAN_DATA"])
+     model_checkpoint = config["SENTENCE_COMPRESSION"]["INFERENCE"]["MODEL_PATH"]
+     latest_checkpoint = get_latest_checkpoint(os.path.join(PROJECT_DIR, model_checkpoint))
+     model_path = os.path.join(PROJECT_DIR, model_checkpoint, latest_checkpoint)
+     summarizer = load_pipeline(model_path)
+     eval_data = pd.read_csv(os.path.join(data_dir, 'eval_data.csv'))
+     eval_data_res = inference(summarizer, eval_data)
+     output_dir = os.path.join(PROJECT_DIR, config["SENTENCE_COMPRESSION"]["OUTPUT"]["RESULT"])
+     os.makedirs(output_dir, exist_ok=True)
+     eval_res_df = pd.DataFrame(eval_data_res)
+     eval_res_df.to_csv(os.path.join(output_dir, "eval_result.csv"), index=False)
+     result = compute_performance(eval_data_res)
+     json.dump(result, open(os.path.join(output_dir, "performance.json"), "w"), indent=4)
utils.py ADDED
@@ -0,0 +1,57 @@
+ import os
+ import json
+ import pandas as pd
+ import yaml
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ from inference import get_latest_checkpoint
+
+
+ def process_loss(loss, final_loss):
+     # Collect the metrics that are present in this log entry; skip the missing ones.
+     epoch = int(loss["epoch"])
+     final_loss["epoch"].append(epoch)
+     for key in ["loss", "eval_loss", "eval_rouge1", "eval_rouge2"]:
+         try:
+             value = loss[key]
+             final_loss[key].append(value)
+         except KeyError:
+             pass
+
+ def loss_function(losses):
+     final_loss = {
+         "epoch": [],
+         "loss": [],
+         "eval_loss": [],
+         "eval_rouge1": [],
+         "eval_rouge2": []
+     }
+     # Keep only the log entries recorded at whole epochs.
+     for loss_steps in losses:
+         if float(loss_steps.get("epoch", 0)) % 1 == 0:
+             process_loss(loss_steps, final_loss)
+     # Deduplicate repeated epoch values and keep them in order.
+     final_loss["epoch"] = sorted(set(final_loss["epoch"]))
+     return final_loss
+
+ def plot_loss(data, output_dir):
+     df = pd.DataFrame(data)
+     df_melted = pd.melt(df, id_vars=['epoch'], var_name='metric', value_name='value')
+     plt.figure(figsize=(10, 6))
+     sns.lineplot(data=df_melted, x='epoch', y='value', hue='metric', marker='o')
+     plt.legend(title='Metric')
+     plt.xlabel('Epoch')
+     plt.ylabel('Value')
+     plt.title('Metrics vs Epoch')
+     plt.savefig(os.path.join(output_dir, 'metrics_vs_epoch.png'))
+
+
+ if __name__ == "__main__":
+     config = yaml.safe_load(open("config.yaml", "r"))
+     PROJECT_DIR = eval(config["SENTENCE_COMPRESSION"]["PROJECT_DIR"])
+     checkpoint_dir = config["SENTENCE_COMPRESSION"]["INFERENCE"]["MODEL_PATH"]
+     latest_checkpoint = get_latest_checkpoint(os.path.join(PROJECT_DIR, checkpoint_dir))
+     logfile_dir = os.path.join(PROJECT_DIR, checkpoint_dir, latest_checkpoint)
+     logfile_path = os.path.join(logfile_dir, "trainer_state.json")
+     logs = json.load(open(logfile_path))
+     final_loss = loss_function(logs["log_history"])
+     output_dir = config["SENTENCE_COMPRESSION"]["OUTPUT"]["RESULT"]
+     os.makedirs(output_dir, exist_ok=True)
+     plot_loss(final_loss, output_dir)