Jorge Fioranelli committed on
Commit
6f0a968
•
1 Parent(s): 839e6e2

Added zero-shot and distilbert models

.gitignore ADDED
@@ -0,0 +1,162 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ .DS_Store

all-models.py ADDED
@@ -0,0 +1,33 @@
+ import gradio as gr
+ from transformers import pipeline
+ import concurrent.futures
+ import ktrain
+
+ zero_shot = pipeline("zero-shot-classification")
+ distilbert = ktrain.load_predictor("models/distilbert-base-uncased-finetuned-internet-provider")
+
+ def zero_shot_predict(text):
+     labels = ["Slow Connectivity", "Billing", "Setup", "No Connectivity"]
+     preds = zero_shot(text, candidate_labels=labels)
+     return {label: float(pred) for label, pred in zip(preds["labels"], preds["scores"])}
+
+ def distilbert_predict(text):
+     labels = distilbert.get_classes()
+     preds = distilbert.predict_proba(text)
+     return {label: float(pred) for label, pred in zip(labels, preds)}
+
+ def predict(text):
+     with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+         zero_shot_future = executor.submit(zero_shot_predict, text)
+         distilbert_future = executor.submit(distilbert_predict, text)
+         concurrent.futures.wait([zero_shot_future, distilbert_future])
+         zero_shot_preds = zero_shot_future.result()
+         distilbert_preds = distilbert_future.result()
+     return zero_shot_preds, distilbert_preds
+
+ input = gr.inputs.Textbox(label="Customer Sentence")
+ outputs = [gr.outputs.Label(num_top_classes=4, label="Zero-Shot-Classification"), gr.outputs.Label(num_top_classes=4, label="DistilBERT")]
+ title = "Case Classification"
+ description = "Comparison of Zero-Shot-Classification and a fine-tuned DistilBERT."
+ gr.Interface(predict, input, outputs, live=True, title=title, analytics_enabled=False,
+              description=description, capture_session=True).launch()

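Note on all-models.py: it targets the legacy gr.inputs.Textbox / gr.outputs.Label API and the capture_session argument, which later Gradio releases removed, and its zero-shot label "Slow Connectivity" differs slightly from the trained category "Slow Connection" used during fine-tuning. Below is a minimal sketch (not part of the commit) of the same two-output interface on the Gradio 3.x+ component API; it assumes the predict() function defined above is in scope.

# Sketch only: Gradio 3.x+ equivalent of the interface above.
# Assumes predict() from all-models.py is defined/importable in this module.
import gradio as gr

demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(label="Customer Sentence"),
    outputs=[
        gr.Label(num_top_classes=4, label="Zero-Shot-Classification"),
        gr.Label(num_top_classes=4, label="DistilBERT"),
    ],
    title="Case Classification",
    description="Comparison of Zero-Shot-Classification and a fine-tuned DistilBERT.",
    live=True,
    analytics_enabled=False,
)
# capture_session no longer exists in Gradio 3.x+, so it is simply dropped here.
demo.launch()
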
app.py DELETED
@@ -1,29 +0,0 @@
- import gradio as gr
- import numpy as np
- import tensorflow as tf
- import urllib.request
-
- # mlp_model = tf.keras.models.load_model(
- #     "models/sketch_recognition/mlp.h5")
- cnn_model = tf.keras.models.load_model(
-     "models/sketch_recognition/cnn.h5")
-
- labels = urllib.request.urlopen("https://raw.githubusercontent.com/googlecreativelab/quickdraw-dataset/master/categories.txt")
- labels = labels.read()
- labels = labels.decode('utf-8').split("\n")[:-1]
-
-
- def predict(img):
-     img = tf.math.divide(img, 255)
-     preds = cnn_model.predict(img.numpy().reshape(-1, 28, 28, 1))[0]
-     return {label: float(pred) for label, pred in zip(labels, preds)}
-
- output = gr.outputs.Label(num_top_classes=3)
-
- title="Sketch Recognition"
- description="This Convolution Neural Network was trained on Google's " \
-     "QuickDraw dataset with 345 classes. Try it by drawing a " \
-     "lightbulb, radio, or anything you can think of!"
- thumbnail="https://github.com/gradio-app/machine-learning-experiments/raw/master/lightbulb.png?raw=true"
- gr.Interface(predict, "sketchpad", output, live=True, title=title, analytics_enabled=False,
-     description=description, thumbnail=thumbnail, capture_session=True).launch()

data/internet_provider.csv ADDED
The diff for this file is too large to render. See raw diff
 
distillbert-classification-finetuning.py ADDED
@@ -0,0 +1,54 @@
+ import ktrain
+ from ktrain import text
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+ import os
+
+ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+ data = pd.read_csv('data/internet_provider.csv')  # Replace 'data.csv' with your actual file name
+ categories = ['Slow Connection', 'Billing', 'Setup', 'No Connectivity']
+
+ train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42, shuffle=True)
+ val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42, shuffle=True)
+
+ model_name = "distilbert-base-uncased"
+
+ model = text.Transformer(model_name=model_name, maxlen=512, class_names=categories)
+
+ train_data = model.preprocess_train(train_data["Text"].tolist(), train_data["Category"].tolist())
+ val_data = model.preprocess_train(val_data["Text"].tolist(), val_data["Category"].tolist())
+ test_data = model.preprocess_train(test_data["Text"].tolist(), test_data["Category"].tolist())
+
+ classifier = model.get_classifier()
+
+ learner = ktrain.get_learner(classifier, train_data=train_data, val_data=val_data, batch_size=16)
+
+ learner.lr_find(show_plot=True, max_epochs=20)
+
+ learner.fit_onecycle(0.0001, 1)
+ learner.validate(class_names=categories)
+ learner.view_top_losses(n=5, preproc=model)
+
+ print(train_data.iloc[100])
+
+ predictor = ktrain.get_predictor(learner.model, preproc=model)
+
+ x = "I have issues with my internet connection"
+
+ prediction = predictor.predict(x)
+
+ print(f"prediction: {prediction}")
+ print(predictor.explain(x))
+
+ predictor.save("distilbest-model")
+
+ predictor = ktrain.load_predictor("distilbest-model")
+
+ x = "I have issues with my internet connection"
+
+ prediction = predictor.predict(x)
+
+ print(f"prediction: {prediction}")
+ print(predictor.explain(x))

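Note on distillbert-classification-finetuning.py: the held-out split is preprocessed with preprocess_train but never scored, and the raw DataFrames are overwritten by the preprocessed datasets, so the later print(train_data.iloc[100]) no longer refers to the original CSV rows. A minimal evaluation sketch (not part of the commit), assuming the same "Text"/"Category" columns and the "distilbest-model" directory written by predictor.save() above:

# Sketch only: scoring the saved predictor on the raw held-out split.
# Rebuilds the same split (random_state=42) as the training script above.
import ktrain
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

data = pd.read_csv("data/internet_provider.csv")
_, temp_data = train_test_split(data, test_size=0.2, random_state=42, shuffle=True)
_, test_data = train_test_split(temp_data, test_size=0.5, random_state=42, shuffle=True)

predictor = ktrain.load_predictor("distilbest-model")  # path used by predictor.save() above
preds = predictor.predict(test_data["Text"].tolist())  # list of texts in -> list of class names out

print(accuracy_score(test_data["Category"], preds))
print(classification_report(test_data["Category"], preds))
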
distillbert-classification-run.py ADDED
@@ -0,0 +1,21 @@
+ import ktrain
+ from ktrain import text
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+ import os
+
+ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+ predictor = ktrain.load_predictor("models/distilbert-base-uncased-finetuned-internet-provider")
+
+ x = "I have issues with my internet connection"
+
+ prediction = predictor.predict(x)
+
+ print(f"prediction: {prediction}")
+
+ labels = predictor.get_classes()
+ probs = predictor.predict_proba(x)
+ for i, label in enumerate(labels):
+     print(label, ":", probs[i])

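Note on distillbert-classification-run.py: ktrain.text, pandas, and train_test_split are imported but unused here; only the saved predictor is needed. For reference, the same class/probability lookup can be packaged as a label-to-score dict, which is the shape all-models.py feeds to the Gradio Label output (sketch, not part of the commit):

# Sketch only: wrap predict_proba output as {label: probability}.
import ktrain

predictor = ktrain.load_predictor("models/distilbert-base-uncased-finetuned-internet-provider")

def distilbert_scores(text):
    labels = predictor.get_classes()
    probs = predictor.predict_proba(text)
    return {label: float(p) for label, p in zip(labels, probs)}

print(distilbert_scores("I have issues with my internet connection"))
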
flagged/DistilBERT/tmpxkobdgs7.json ADDED
@@ -0,0 +1 @@
+ {}

flagged/Zero-Shot-Classification/tmp0kb5h3dj.json ADDED
@@ -0,0 +1 @@
+ {}

flagged/log.csv ADDED
@@ -0,0 +1,2 @@
+ Customer Sentence,Zero-Shot-Classification,DistilBERT,flag,username,timestamp
+ ,/Users/fioranel/Projects/Case-Classification/flagged/Zero-Shot-Classification/tmp0kb5h3dj.json,/Users/fioranel/Projects/Case-Classification/flagged/DistilBERT/tmpxkobdgs7.json,,,2023-07-19 01:56:06.921642

models/distilbert-base-uncased-finetuned-internet-provider/config.json ADDED
@@ -0,0 +1,35 @@
+ {
+   "_name_or_path": "/tmp/tmp1o3vy28s",
+   "activation": "gelu",
+   "architectures": [
+     "DistilBertForSequenceClassification"
+   ],
+   "attention_dropout": 0.1,
+   "dim": 768,
+   "dropout": 0.1,
+   "hidden_dim": 3072,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1",
+     "2": "LABEL_2",
+     "3": "LABEL_3"
+   },
+   "initializer_range": 0.02,
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1,
+     "LABEL_2": 2,
+     "LABEL_3": 3
+   },
+   "max_position_embeddings": 512,
+   "model_type": "distilbert",
+   "n_heads": 12,
+   "n_layers": 6,
+   "pad_token_id": 0,
+   "qa_dropout": 0.1,
+   "seq_classif_dropout": 0.2,
+   "sinusoidal_pos_embds": false,
+   "tie_weights_": true,
+   "transformers_version": "4.31.0",
+   "vocab_size": 30522
+ }

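Note on config.json: id2label/label2id still carry the generic LABEL_0 through LABEL_3 names; the human-readable categories only come from the ktrain preprocessor via predictor.get_classes(). If the bare transformers checkpoint should report category names too, the mapping could be patched as sketched below (not part of the commit; it assumes class index i follows the class_names order passed to text.Transformer in the training script).

# Sketch only: write the category names into id2label/label2id.
# Assumes label index i corresponds to categories[i] from the training script.
import json

categories = ["Slow Connection", "Billing", "Setup", "No Connectivity"]
path = "models/distilbert-base-uncased-finetuned-internet-provider/config.json"

with open(path) as f:
    config = json.load(f)

config["id2label"] = {str(i): name for i, name in enumerate(categories)}
config["label2id"] = {name: i for i, name in enumerate(categories)}

with open(path, "w") as f:
    json.dump(config, f, indent=2)
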
models/distilbert-base-uncased-finetuned-internet-provider/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }

models/{sketch_recognition/cnn.h5 → distilbert-base-uncased-finetuned-internet-provider/tf_model.h5} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:25b551a31a9ca980637231fef74428a4f7b2dcea199cbe3226fed89629741075
- size 2029392
+ oid sha256:6375d12296e96db5c916233c62535f7b20fbe8b85bb62260801c14450231ec80
+ size 267961288

models/distilbert-base-uncased-finetuned-internet-provider/tf_model.preproc ADDED
Binary file (2.76 kB).
 
models/distilbert-base-uncased-finetuned-internet-provider/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
models/distilbert-base-uncased-finetuned-internet-provider/tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_lower_case": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "DistilBertTokenizer",
+   "unk_token": "[UNK]"
+ }

models/distilbert-base-uncased-finetuned-internet-provider/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -1 +1,5 @@
- tensorflow
+ tensorflow
+ gradio
+ transformers
+ openai
+ ktrain

zero-shot-classification.py ADDED
@@ -0,0 +1,16 @@
+ import gradio as gr
+ from transformers import pipeline
+
+ classifier = pipeline("zero-shot-classification")
+ labels = ["Connectivity", "Billing", "Setup"]
+
+ def predict(text):
+     preds = classifier(text, candidate_labels=labels)
+     return {label: float(pred) for label, pred in zip(preds["labels"], preds["scores"])}
+
+ input = gr.inputs.Textbox(label="Customer Sentence")
+ output = gr.outputs.Label(num_top_classes=4, label="Zero-Shot-Classification")
+ title = "Case Classification"
+ description = "Zero-Shot-Classification test"
+ gr.Interface(predict, input, output, live=True, title=title, analytics_enabled=False,
+              description=description).launch()
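
Note on zero-shot-classification.py: like all-models.py, it uses the legacy gr.inputs/gr.outputs API, and num_top_classes=4 with only three candidate labels simply shows all three. For reference, a standalone sketch (not part of the commit) of what the zero-shot pipeline itself returns; multi_label=True is assumed to be available (recent transformers releases; older ones used multi_class).

# Sketch only: raw zero-shot pipeline output for one sentence.
from transformers import pipeline

classifier = pipeline("zero-shot-classification")
result = classifier(
    "My invoice is higher than expected and the connection keeps dropping",
    candidate_labels=["Connectivity", "Billing", "Setup"],
    multi_label=True,  # score each label independently
)
# result has "sequence", "labels" (sorted by score, descending), and "scores".
for label, score in zip(result["labels"], result["scores"]):
    print(f"{label}: {score:.3f}")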