commonlit-student-summaries

Sleeping

App Files Community

Steffen Slavetinsky committed on Aug 24, 2023

Commit

6f196c6

•

1 Parent(s): 3e94b6c

add dataset and install from latest commit

Browse files

Files changed (3) hide show

Dockerfile +1 -1
dataset.csv +0 -0
run.py +15 -66

Dockerfile CHANGED Viewed

@@ -6,7 +6,7 @@ ENV HOME=/code
 RUN apt install curl
 RUN pip install pip -U
-RUN pip install renumics-spotlight==1.3.0
 RUN pip install datasets
 COPY prepare.py .

 RUN apt install curl
 RUN pip install pip -U
+RUN pip install git+https://github.com/Renumics/spotlight.git@ac8ed08d9a179c15fccfa5f0fa4a0a71fbc3dbe7
 RUN pip install datasets
 COPY prepare.py .

dataset.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

run.py CHANGED Viewed

@@ -1,72 +1,21 @@
-import pickle
-from renumics import spotlight
-import os
-import requests
 import pandas as pd
 from renumics import spotlight
-from renumics.spotlight.analysis import DataIssue
 if __name__ == "__main__":
-    cache_file = "dataset_cache.pkl"
-    if os.path.exists(cache_file):
-        # Load dataset from cache
-        with open(cache_file, "rb") as file:
-            df = pickle.load(file)
-        print("Dataset loaded from cache.")
-        label_issue_rows = df[df["is_label_issue"]].sort_values("label_score").index.tolist()
-        label_issue = DataIssue(
-            severity="medium",
-            title="label-issue",
-            rows=label_issue_rows,
-            description="Label issue found by cleanlab - Review and correct if necessary",
-        )
-        outlier_issue_row = (
-            df[df["outlier_score"] < 0.6].sort_values("outlier_score").index.tolist()
-        )
-        outlier_issue = DataIssue(
-            severity="medium",
-            title="outlier-issue",
-            rows=outlier_issue_row,
-            description="Outlier score < 0.6 - Review and remove or collect more data",
         )
-        near_duplicate_issue_row = (
-            df[df["is_near_duplicate_issue"]].sort_values("near_duplicate_score").index.tolist()
-        )
-        near_duplicate_issue = DataIssue(
-            severity="medium",
-            title="near-duplicate-issue",
-            rows=near_duplicate_issue_row,
-            description="Near duplicate issue found by cleanlab - Review and remove if necessary",
-        )
-        while True:
-            dtypes = {
-                "image": spotlight.Image,
-                "image_full": spotlight.Image,
-                "embedding": spotlight.Embedding,
-                "embedding_reduced": spotlight.Embedding,
-                "probabilities": spotlight.Embedding,
-            }
-            view = spotlight.show(
-                df.rename(columns={"fine_label_str": "label", "fine_label_prediction_str":"pred"}),
-                dtype=dtypes,
-                issues=[label_issue,outlier_issue,near_duplicate_issue],
-                layout="layout.json",
-                port=7860,
-                host="0.0.0.0",
-                allow_filebrowsing=False,
-            )
-            view.close()
-    else:
-        print(f"Dataset {cache_file} not found. Please run prepare.py first.")

 import pandas as pd
 from renumics import spotlight
 if __name__ == "__main__":
+    df = pd.read_csv("dataset.csv")
+    while True:
+        dtypes = {
+            "text_len": float,
+            "text_embedding": spotlight.Embedding,
+        }
+        view = spotlight.show(
+            df,
+            dtype=dtypes,
+            layout="layout.json",
+            port=7860,
+            host="0.0.0.0",
+            allow_filebrowsing=False,
         )
+        view.close()