Spaces:

Hacker1337
/

article_classifier

Sleeping

Hacker1337 committed on Jun 15

Commit

2904d0e

1 Parent(s): fef1ab3

added loading of my model

Files changed (2) hide show

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import gradio as gr
 import numpy as np
 import random
@@ -7,13 +8,25 @@ from diffusers import DiffusionPipeline
 import torch
 device = "cuda" if torch.cuda.is_available() else "cpu"
-model_repo_id = "stabilityai/sdxl-turbo"  # Replace to the model you would like to use
 if torch.cuda.is_available():
     torch_dtype = torch.float16
 else:
     torch_dtype = torch.float32
 pipe = DiffusionPipeline.from_pretrained(model_repo_id, torch_dtype=torch_dtype)
 pipe = pipe.to(device)

+from transformers import AutoModelForSequenceClassification
 import gradio as gr
 import numpy as np
 import random
 import torch
 device = "cuda" if torch.cuda.is_available() else "cpu"
+model_repo_id = "stabilityai/sdxl-turbo"
 if torch.cuda.is_available():
     torch_dtype = torch.float16
 else:
     torch_dtype = torch.float32
+from article_classifier.dataset import labels, id2label, label2id, categorie2human
+model_path = "distilbert/distilbert-base-cased" # TODO: replace with hacker1337/article-classifier; until then the distilbert-base-cased checkpoint named here is what gets loaded
+model = AutoModelForSequenceClassification.from_pretrained(
+    model_path,
+    num_labels=len(id2label),  # one output per entry in the imported id2label table
+    id2label=id2label,
+    label2id=label2id,
+    problem_type="multi_label_classification",  # NOTE(review): presumably selects the multi-label (per-label sigmoid/BCE) head behaviour in transformers — confirm
+)
 pipe = DiffusionPipeline.from_pretrained(model_repo_id, torch_dtype=torch_dtype)
 pipe = pipe.to(device)

dataset.py ADDED Viewed

+labels = ["CV", "AI", "ML", "NE", "CL"]  # category codes; a code's position in this list is its integer class id
+id2label = {i: label for i, label in enumerate(labels)}  # class id -> category code
+label2id = {label: i for i, label in enumerate(labels)}  # category code -> class id
+categorie2human = {  # category code -> human-readable category name
+    "CV": "Computer Vision",
+    "AI": "Artificial Intelligence",
+    "ML": "Machine Learning",
+    "NE": "Neural and Evolutionary Computing",
+    "CL": "Computation and Language"
+}
+def load_arxiv_dataset():  # fetch the Kaggle arXiv-abstracts CSV and return it as a dataset whose "terms" field has been parsed
+    import kagglehub  # imported inside the function, so merely importing this module does not need these packages
+    import os
+    from datasets import load_dataset
+    # Download the latest version of the dataset; `path` is the local directory that holds its files
+    path = kagglehub.dataset_download("spsayakpaul/arxiv-paper-abstracts")
+    dataset = load_dataset(
+        "csv",
+        data_files=os.path.join(path, "arxiv_data.csv"),
+        encoding="utf-8",
+        split="train"  # ask for the "train" split directly
+    )
+    # Convert the "terms" field from its string form into a Python object (a list, presumably — TODO confirm against the CSV)
+    import ast
+    def parse_terms(example):  # per-row callback for dataset.map: replaces the row's "terms" string with its parsed value
+        example["terms"] = ast.literal_eval(example["terms"])  # literal_eval only evaluates Python literals, not arbitrary code
+        return example
+    dataset = dataset.map(parse_terms)
+    return dataset