hoang-quoc-trung commited on
Commit
3d52ce7
1 Parent(s): 76f844e

Upload 8 files

Browse files
requirements.txt ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.0.0
2
+ accelerate==0.23.0
3
+ aiofiles==23.2.1
4
+ aiohttp==3.8.6
5
+ aiosignal==1.3.1
6
+ albumentations==1.3.1
7
+ altair==5.2.0
8
+ annotated-types==0.6.0
9
+ anyconfig==0.13.0
10
+ anyio==4.3.0
11
+ appdirs==1.4.4
12
+ asttokens==2.4.1
13
+ async-timeout==4.0.3
14
+ attrs==23.1.0
15
+ backcall==0.2.0
16
+ blinker==1.7.0
17
+ cachetools==5.3.2
18
+ certifi==2023.7.22
19
+ charset-normalizer==3.3.2
20
+ click==8.1.7
21
+ cmake==3.27.7
22
+ comm==0.2.0
23
+ contourpy==1.1.1
24
+ cycler==0.12.1
25
+ datasets==2.14.7
26
+ debugpy==1.8.0
27
+ decorator==5.1.1
28
+ dill==0.3.7
29
+ docker-pycreds==0.4.0
30
+ evaluate==0.4.1
31
+ exceptiongroup==1.2.0
32
+ executing==2.0.1
33
+ fastapi==0.110.0
34
+ ffmpy==0.3.2
35
+ filelock==3.13.1
36
+ fonttools==4.49.0
37
+ frozenlist==1.4.0
38
+ fsspec==2023.10.0
39
+ gitdb==4.0.11
40
+ GitPython==3.1.40
41
+ google-auth==2.23.4
42
+ google-auth-oauthlib==1.0.0
43
+ gradio==3.50.2
44
+ gradio_client==0.6.1
45
+ grpcio==1.59.2
46
+ h11==0.14.0
47
+ httpcore==1.0.4
48
+ httpx==0.27.0
49
+ huggingface-hub==0.20.3
50
+ idna==3.4
51
+ imageio==2.32.0
52
+ importlib-metadata==6.8.0
53
+ importlib_resources==6.1.2
54
+ ipykernel==6.25.2
55
+ ipython==8.12.3
56
+ jedi==0.19.1
57
+ Jinja2==3.1.2
58
+ jiwer==3.0.3
59
+ joblib==1.3.2
60
+ jsonschema==4.21.1
61
+ jsonschema-specifications==2023.12.1
62
+ jupyter_client==8.6.0
63
+ jupyter_core==5.5.0
64
+ kiwisolver==1.4.5
65
+ lazy_loader==0.3
66
+ Levenshtein==0.23.0
67
+ lightning==2.1.1
68
+ lightning-utilities==0.9.0
69
+ lion-pytorch==0.1.2
70
+ lit==17.0.5
71
+ Markdown==3.5.1
72
+ markdown-it-py==3.0.0
73
+ MarkupSafe==2.1.3
74
+ matplotlib==3.7.5
75
+ matplotlib-inline==0.1.6
76
+ mdurl==0.1.2
77
+ mpmath==1.3.0
78
+ multidict==6.0.4
79
+ multiprocess==0.70.15
80
+ munch==4.0.0
81
+ natsort==8.4.0
82
+ nest-asyncio==1.5.8
83
+ networkx==3.1
84
+ nltk==3.8.1
85
+ nougat-ocr==0.1.17
86
+ numpy==1.22.3
87
+ nvidia-cublas-cu11==11.10.3.66
88
+ nvidia-cuda-cupti-cu11==11.7.101
89
+ nvidia-cuda-nvrtc-cu11==11.7.99
90
+ nvidia-cuda-runtime-cu11==11.7.99
91
+ nvidia-cudnn-cu11==8.5.0.96
92
+ nvidia-cufft-cu11==10.9.0.58
93
+ nvidia-curand-cu11==10.2.10.91
94
+ nvidia-cusolver-cu11==11.4.0.1
95
+ nvidia-cusparse-cu11==11.7.4.91
96
+ nvidia-nccl-cu11==2.14.3
97
+ nvidia-nvtx-cu11==11.7.91
98
+ oauthlib==3.2.2
99
+ opencv-python-headless==4.8.1.78
100
+ orjson==3.9.10
101
+ packaging==23.2
102
+ pandas==2.0.3
103
+ parso==0.8.3
104
+ peft==0.8.2
105
+ pexpect==4.8.0
106
+ pickleshare==0.7.5
107
+ Pillow==10.0.1
108
+ pip==23.3.1
109
+ pkgutil_resolve_name==1.3.10
110
+ platformdirs==4.0.0
111
+ prompt-toolkit==3.0.41
112
+ protobuf==4.25.0
113
+ psutil==5.9.6
114
+ ptyprocess==0.7.0
115
+ pure-eval==0.2.2
116
+ pyarrow==14.0.1
117
+ pyarrow-hotfix==0.5
118
+ pyasn1==0.5.0
119
+ pyasn1-modules==0.3.0
120
+ pydantic==2.6.2
121
+ pydantic_core==2.16.3
122
+ pydeck==0.8.1b0
123
+ pydub==0.25.1
124
+ Pygments==2.16.1
125
+ pyparsing==3.1.1
126
+ pypdf==3.17.1
127
+ pypdfium2==4.24.0
128
+ python-dateutil==2.8.2
129
+ python-Levenshtein==0.23.0
130
+ python-multipart==0.0.9
131
+ pytorch-lightning==2.1.1
132
+ pytz==2023.3.post1
133
+ PyWavelets==1.4.1
134
+ PyYAML==6.0.1
135
+ pyzmq==25.1.1
136
+ qudida==0.0.4
137
+ rapidfuzz==3.5.2
138
+ referencing==0.33.0
139
+ regex==2023.10.3
140
+ requests==2.31.0
141
+ requests-oauthlib==1.3.1
142
+ responses==0.18.0
143
+ rich==13.7.1
144
+ rpds-py==0.18.0
145
+ rsa==4.9
146
+ ruamel.yaml==0.18.5
147
+ ruamel.yaml.clib==0.2.8
148
+ safetensors==0.4.0
149
+ scikit-image==0.21.0
150
+ scikit-learn==1.3.2
151
+ scipy==1.10.1
152
+ sconf==0.2.5
153
+ semantic-version==2.10.0
154
+ sentencepiece==0.1.99
155
+ sentry-sdk==1.37.0
156
+ setproctitle==1.3.3
157
+ setuptools==68.2.2
158
+ six==1.16.0
159
+ smmap==5.0.1
160
+ sniffio==1.3.1
161
+ stack-data==0.6.3
162
+ starlette==0.36.3
163
+ streamlit==1.33.0
164
+ sympy==1.12
165
+ tenacity==8.2.3
166
+ tensorboard==2.14.0
167
+ tensorboard-data-server==0.7.2
168
+ tensorboardX==2.6.2.2
169
+ threadpoolctl==3.2.0
170
+ tifffile==2023.7.10
171
+ timm==0.5.4
172
+ tokenizers==0.15.1
173
+ toml==0.10.2
174
+ toolz==0.12.1
175
+ torch==2.0.0
176
+ torchmetrics==1.2.0
177
+ torchvision==0.15.1
178
+ tornado==6.3.3
179
+ tqdm==4.66.1
180
+ traitlets==5.13.0
181
+ transformers==4.37.0
182
+ triton==2.0.0
183
+ typing_extensions==4.8.0
184
+ tzdata==2023.3
185
+ urllib3==2.1.0
186
+ uvicorn==0.27.1
187
+ wandb==0.16.0
188
+ watchdog==4.0.0
189
+ wcwidth==0.2.10
190
+ websockets==11.0.3
191
+ Werkzeug==3.0.1
192
+ wheel==0.41.3
193
+ xxhash==3.4.1
194
+ yarl==1.9.2
195
+ zipp==3.17.0
src/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
import os
import sys

# Make this package's directory importable as a top-level module path.
# (The original wrapped dirname() in a single-argument os.path.join, a no-op.)
module = os.path.dirname(os.path.abspath(__file__))
sys.path.append(module)
src/utils/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
import os
import sys

# Make this package's directory importable as a top-level module path.
# (The original wrapped dirname() in a single-argument os.path.join, a no-op.)
module = os.path.dirname(os.path.abspath(__file__))
sys.path.append(module)
src/utils/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (331 Bytes). View file
 
src/utils/__pycache__/common_utils.cpython-38.pyc ADDED
Binary file (3.09 kB). View file
 
src/utils/__pycache__/metrics.cpython-38.pyc ADDED
Binary file (1.79 kB). View file
 
src/utils/common_utils.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import csv
3
+ import torch
4
+ import numpy
5
+
6
+
7
def check_device(logger=None):
    """Select the torch device to run on.

    Prefers CUDA when available, otherwise falls back to the CPU.

    Args:
        logger: optional logger; when provided, device information is logged.
            (Previously the default ``logger=None`` crashed with
            AttributeError because ``logger.info`` was called unconditionally.)

    Returns:
        torch.device: ``cuda`` if available, else ``cpu``.
    """
    if torch.cuda.is_available():
        device = torch.device("cuda")
        if logger is not None:
            logger.info("There are {} GPU(s) available.".format(torch.cuda.device_count()))
            logger.info('We will use the GPU: {}'.format(torch.cuda.get_device_name(0)))
    else:
        if logger is not None:
            logger.info('No GPU available, using the CPU instead.')
        device = torch.device("cpu")
    return device
16
+
17
+
18
def print_trainable_parameters(model, logger):
    """Log a summary of the model's parameter counts.

    Reports the total parameter count (in millions and raw) together with
    the number and percentage of parameters that require gradients.

    Args:
        model: a torch module exposing ``named_parameters()``.
        logger: logger used to emit the summary line.
    """
    n_total = sum(p.numel() for _, p in model.named_parameters())
    n_trainable = sum(
        p.numel() for _, p in model.named_parameters() if p.requires_grad
    )
    logger.info(
        "Total params: {}M ({}) || Trainable params: {} || Trainable: {}%".format(
            round(n_total / 1000000),
            n_total,
            n_trainable,
            100 * n_trainable / n_total,
        )
    )
36
+
37
+
38
def save_log(
    loss: float,
    bleu: float,
    edit_distance: float,
    exact_match: float,
    wer: float,
    exprate: float,
    exprate_error_1: float,
    exprate_error_2: float,
    exprate_error_3: float,
    file_name="test_log.csv",
):
    """Append one row of evaluation metrics to ``log/<file_name>``.

    Creates the ``log`` directory on first use and writes the CSV header
    only when the target file is still empty, so repeated calls accumulate
    one data row each.
    """
    columns = [
        "loss",
        "bleu",
        "edit_distance",
        "exact_match",
        "wer",
        "exprate",
        "exprate_error_1",
        "exprate_error_2",
        "exprate_error_3"
    ]
    values = [
        loss,
        bleu,
        edit_distance,
        exact_match,
        wer,
        exprate,
        exprate_error_1,
        exprate_error_2,
        exprate_error_3,
    ]
    os.makedirs('log', exist_ok=True)
    target = os.path.join('log', file_name)
    with open(target, mode="a", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        # Header goes in only when the file is brand new (position 0).
        if handle.tell() == 0:
            writer.writeheader()
        writer.writerow(dict(zip(columns, values)))
83
+
84
+
85
def cmp_result(label, rec):
    """Levenshtein distance between two token sequences.

    Fills a ``(len(label)+1) x (len(rec)+1)`` DP table where cell ``(i, j)``
    holds the minimum number of edits turning ``label[:i]`` into ``rec[:j]``.

    Returns:
        tuple: ``(edit distance, len(label))``.
    """
    rows = len(label) + 1
    cols = len(rec) + 1
    table = numpy.zeros((rows, cols), dtype='int32')
    table[0, :] = range(cols)
    table[:, 0] = range(rows)
    for i in range(1, rows):
        for j in range(1, cols):
            # Substitution is free when the tokens already match.
            substitute = table[i - 1, j - 1] + (label[i - 1] != rec[j - 1])
            insert = table[i, j - 1] + 1
            delete = table[i - 1, j] + 1
            table[i, j] = min(substitute, insert, delete)
    return table[rows - 1, cols - 1], len(label)
97
+
98
+
99
def compute_exprate(predictions, references):
    """Compute the expression recognition rate (ExpRate) and its relaxations.

    ExpRate is the fraction of predictions whose token-level edit distance
    to the reference is exactly 0; ``error_n`` additionally accepts
    predictions that are at most ``n`` edits away.

    Args:
        predictions: list of predicted strings (tokens separated by spaces).
        references: list of ground-truth strings, same length as predictions.

    Returns:
        tuple: ``(exprate, error_1, error_2, error_3)``, each in [0, 1].
        Empty input yields all zeros instead of raising ZeroDivisionError.
    """
    total_line = len(references)
    if total_line == 0:
        # Guard: the original division crashed with ZeroDivisionError here.
        return 0.0, 0.0, 0.0, 0.0
    # within[d] counts pairs whose edit distance is exactly d (0..3).
    within = [0, 0, 0, 0]
    for pred, ref in zip(predictions, references):
        # cmp_result's signature is (label, rec): pass the reference as the
        # label. (The distance itself is symmetric, but the original swapped
        # the arguments and accumulated a dead `total_label` from it.)
        dist, _ = cmp_result(ref.split(), pred.split())
        if dist <= 3:
            within[dist] += 1
    exprate = within[0] / total_line
    error_1 = (within[0] + within[1]) / total_line
    error_2 = (within[0] + within[1] + within[2]) / total_line
    error_3 = (within[0] + within[1] + within[2] + within[3]) / total_line
    return exprate, error_1, error_2, error_3
src/utils/metrics.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ import evaluate
3
+ from nltk import edit_distance as compute_edit_distance
4
+ from src.utils.common_utils import compute_exprate
5
+
6
+
7
class Metrics:
    """Evaluation metrics for a seq2seq recognition model.

    Wraps HuggingFace ``evaluate`` metrics (BLEU, WER, exact match) plus a
    normalized edit-distance score and ExpRate, computed from a
    transformers-style prediction object.
    """

    def __init__(self, processor):
        self.processor = processor
        self.bleu = evaluate.load("bleu")
        self.wer = evaluate.load("wer")
        self.exact_match = evaluate.load("exact_match")

    def compute_metrics(self, pred):
        """Decode predictions and labels, then score them.

        Args:
            pred: object with ``predictions`` (token ids) and ``label_ids``
                arrays, as produced by a transformers Trainer.

        Returns:
            dict: metric name -> score on a 0-100 scale, rounded to 2 places.
        """
        tokenizer = self.processor.tokenizer
        label_ids = pred.label_ids
        pred_str = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)
        # -100 marks ignored label positions; restore the pad id before decoding.
        label_ids[label_ids == -100] = tokenizer.pad_token_id
        label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

        n = len(pred_str)
        sum_edit, sum_bleu, sum_exact = 0, 0, 0
        for hyp, ref in zip(pred_str, label_str):
            # Edit distance normalized by the longer of the two strings.
            sum_edit += compute_edit_distance(hyp, ref) / max(len(hyp), len(ref))

            try:
                one_bleu = self.bleu.compute(
                    predictions=[hyp],
                    references=[ref],
                    max_order=4  # Maximum n-gram order to use when computing BLEU score
                )
                sum_bleu += one_bleu['bleu']
            except ZeroDivisionError:
                # Hypothesis too short for any n-gram overlap; score it 0.
                sum_bleu += 0

            one_exact = self.exact_match.compute(
                predictions=[hyp],
                references=[ref],
                regexes_to_ignore=[' ']
            )
            sum_exact += one_exact['exact_match']

        bleu = sum_bleu / n
        exact_match = sum_exact / n
        # Flip the (minimizing) edit distance into a similarity score.
        edit_distance = 1 - (sum_edit / n)
        # Word error rate over the whole corpus at once.
        wer = self.wer.compute(predictions=pred_str, references=label_str)
        # Expression rate and its <=1/2/3-edit relaxations.
        exprate, error_1, error_2, error_3 = compute_exprate(
            predictions=pred_str,
            references=label_str
        )

        return {
            "bleu": round(bleu*100, 2),
            "maximun_edit_distance": round(edit_distance*100, 2),
            "exact_match": round(exact_match*100, 2),
            "wer": round(wer*100, 2),
            "exprate": round(exprate*100, 2),
            "exprate_error_1": round(error_1*100, 2),
            "exprate_error_2": round(error_2*100, 2),
            "exprate_error_3": round(error_3*100, 2),
        }