Commit aec7db1
Parent(s): 1ebedb6
Update dependencies and add Dockerfile
Files changed:
- Dockerfile +22 -0
- README.md +2 -4
- app.py → main.py +35 -41
- requirements.in +2 -3
- requirements.txt +27 -178
Dockerfile ADDED
@@ -0,0 +1,22 @@
+FROM python:3.11
+# Set up a new user named "user" with user ID 1000
+RUN useradd -m -u 1000 user
+
+# Switch to the "user" user
+USER user
+
+# Set home to the user's home directory
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+
+# Set the working directory to the user's home directory
+WORKDIR $HOME/code
+WORKDIR /code
+
+COPY ./requirements.txt /code/requirements.txt
+
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+COPY . .
+
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
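Note: the CMD serves the FastAPI app with uvicorn on port 7860, the port a Docker-based Hugging Face Space is expected to listen on. A minimal local smoke test, assuming the image has been built and started first (e.g. docker build -t lang-detect . && docker run -p 7860:7860 lang-detect); the image tag and the dataset id "imdb" are only illustrative values, not part of this commit:

    # Sketch: hit the /items/{hub_id} route exposed by main.py on a locally running container.
    import httpx

    resp = httpx.get("http://localhost:7860/items/imdb", timeout=httpx.Timeout(60, read=120))
    resp.raise_for_status()
    print(resp.json())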
README.md CHANGED
@@ -1,11 +1,9 @@
 ---
 title: Dataset Language Detection
-emoji:
+emoji: 🌐
 colorFrom: purple
 colorTo: yellow
-sdk: gradio
-sdk_version: 4.15.0
-app_file: app.py
+sdk: docker
 pinned: false
 license: mit
 ---
app.py → main.py RENAMED
@@ -1,16 +1,18 @@
 import os
 import random
+from pathlib import Path
 from statistics import mean
-from typing import Iterator, Union
+from typing import Any, Iterator, Union
 
 import fasttext
-import gradio as gr
 from dotenv import load_dotenv
-from httpx import Client, Timeout
+from fastapi import FastAPI
+from httpx import AsyncClient, Client, Timeout
 from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import logging
 from toolz import concat, groupby, valmap
 
+app = FastAPI()
 logger = logging.get_logger(__name__)
 load_dotenv()
 HF_TOKEN = os.getenv("HF_TOKEN")
@@ -23,7 +25,7 @@ headers = {
 }
 timeout = Timeout(60, read=120)
 client = Client(headers=headers, timeout=timeout)
-
+async_client = AsyncClient(headers=headers, timeout=timeout)
 # non exhaustive list of columns that might contain text which can be used for language detection
 # we prefer to use columns in this order i.e. if there is a column named "text" we will use it first
 TARGET_COLUMN_NAMES = {
@@ -54,9 +56,9 @@ def datasets_server_valid_rows(hub_id: str):
         return False
 
 
-def get_first_config_and_split_name(hub_id: str):
+async def get_first_config_and_split_name(hub_id: str):
     try:
-        resp = client.get(
+        resp = await async_client.get(
             f"https://datasets-server.huggingface.co/splits?dataset={hub_id}"
         )
 
@@ -67,21 +69,21 @@ def get_first_config_and_split_name(hub_id: str):
         return None
 
 
-def get_dataset_info(hub_id: str, config: str | None = None):
+async def get_dataset_info(hub_id: str, config: str | None = None):
     if config is None:
         config = get_first_config_and_split_name(hub_id)
         if config is None:
             return None
         else:
             config = config[0]
-    resp = client.get(
+    resp = await async_client.get(
         f"{BASE_DATASETS_SERVER_URL}/info?dataset={hub_id}&config={config}"
     )
     resp.raise_for_status()
     return resp.json()
 
 
-def get_random_rows(
+async def get_random_rows(
     hub_id: str,
     total_length: int,
     number_of_rows: int,
@@ -99,7 +101,7 @@ def get_random_rows(
         url = f"https://datasets-server.huggingface.co/rows?dataset={hub_id}&config={config}&split={split}&offset={offset}&length={rows_per_call}"
         logger.info(f"Fetching {url}")
         print(url)
-        response = client.get(url)
+        response = await async_client.get(url)
         if response.status_code == 200:
             data = response.json()
             batch_rows = data.get("rows")
@@ -139,9 +141,15 @@ def yield_clean_rows(rows: Union[list[str], str], min_length: int = 3) -> Iterat
 FASTTEXT_PREFIX_LENGTH = 9  # fasttext labels are formatted like "__label__eng_Latn"
 
 # model = load_model(DEFAULT_FAST_TEXT_MODEL)
-
+Path("code/models").mkdir(parents=True, exist_ok=True)
 model = fasttext.load_model(
-    hf_hub_download("facebook/fasttext-language-identification", "model.bin")
+    hf_hub_download(
+        "facebook/fasttext-language-identification",
+        "model.bin",
+        cache_dir="code/models",
+        local_dir="code/models",
+        local_dir_use_symlinks=False,
+    )
 )
 
 
@@ -187,36 +195,43 @@ def predict_rows(rows, target_column, language_threshold_percent=0.2):
     }
 
 
-def predict_language(
+@app.get("/items/{hub_id}")
+async def predict_language(
     hub_id: str,
     config: str | None = None,
     split: str | None = None,
     max_request_calls: int = 10,
     number_of_rows: int = 1000,
-) -> dict[…]:
+) -> dict[Any, Any] | None:
     is_valid = datasets_server_valid_rows(hub_id)
     if not is_valid:
-        …
+        logger.error(f"Dataset {hub_id} is not accessible via the datasets server.")
+    if not config and not split:
+        config, split = await get_first_config_and_split_name(hub_id)
     if not config:
-        config, split = get_first_config_and_split_name(hub_id)
-    info = get_dataset_info(hub_id, config)
+        config, _ = await get_first_config_and_split_name(hub_id)
+    if not split:
+        _, split = await get_first_config_and_split_name(hub_id)
+    info = await get_dataset_info(hub_id, config)
     if info is None:
-        …
+        logger.error(f"Dataset {hub_id} is not accessible via the datasets server.")
+        return None
     if dataset_info := info.get("dataset_info"):
         total_rows_for_split = dataset_info.get("splits").get(split).get("num_examples")
         features = dataset_info.get("features")
         column_names = set(features.keys())
         logger.info(f"Column names: {column_names}")
         if not set(column_names).intersection(TARGET_COLUMN_NAMES):
-            …
+            logger.error(
                 f"Dataset {hub_id} {column_names} is not in any of the target columns {TARGET_COLUMN_NAMES}"
             )
+            return None
         for column in TARGET_COLUMN_NAMES:
            if column in column_names:
                 target_column = column
                 logger.info(f"Using column {target_column} for language detection")
                 break
-    random_rows = get_random_rows(
+    random_rows = await get_random_rows(
         hub_id,
         total_rows_for_split,
         number_of_rows,
@@ -230,24 +245,3 @@ def predict_language(
     predictions["config"] = config
     predictions["split"] = split
     return predictions
-
-
-app_title = "Dataset Language Detection"
-app_description = "Detect the language of a dataset on the Hub"
-inputs = [
-    gr.Text(label="dataset id"),
-    gr.Textbox(
-        None,
-        label="config",
-    ),
-    gr.Textbox(None, label="split"),
-]
-interface = gr.Interface(
-    predict_language,
-    inputs=inputs,
-    outputs="json",
-    title=app_title,
-    article=app_description,
-)
-interface.queue()
-interface.launch()
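Note on the rename: the Gradio UI at the bottom of app.py is gone, replaced by a single FastAPI route. A sketch of exercising it in-process with FastAPI's TestClient ("imdb" is an illustrative dataset id, not from this commit; the query params mirror the function signature). One caveat: since the route is declared as /items/{hub_id}, a namespaced id containing "/" (e.g. "user/dataset") would not match a single path segment:

    # Sketch: call the new endpoint in-process; TestClient handles the async route.
    from fastapi.testclient import TestClient

    from main import app

    client = TestClient(app)
    response = client.get("/items/imdb", params={"number_of_rows": 200})
    response.raise_for_status()
    print(response.json())  # includes "config" and "split" keys alongside the predictions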
requirements.in CHANGED
@@ -3,7 +3,6 @@ httpx
 huggingface_hub
 rich
 toolz
-gradio
 python-dotenv
-
-
+uvicorn[standard]
+fastapi
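The pinned requirements.txt below is generated from this file; its "# pip-compile" header indicates pip-tools. A sketch of regenerating the pins after an edit like this one (assumes pip-tools is installed; the plain CLI form is `pip-compile requirements.in`):

    # Sketch: refresh requirements.txt from requirements.in via pip-tools.
    import subprocess

    subprocess.run(["pip-compile", "requirements.in"], check=True)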
requirements.txt CHANGED
@@ -4,27 +4,13 @@
 #
 # pip-compile
 #
-aiofiles==23.2.1
-    # via gradio
-aiohttp==3.9.1
-    # via
-    #   datasets
-    #   fsspec
-aiosignal==1.3.1
-    # via aiohttp
-altair==5.2.0
-    # via gradio
 annotated-types==0.6.0
     # via pydantic
 anyio==4.2.0
     # via
     #   httpx
     #   starlette
+    #   watchfiles
-attrs==…
-    # via
-    #   aiohttp
-    #   jsonschema
-    #   referencing
 certifi==2023.11.17
     # via
     #   httpcore
@@ -33,223 +19,86 @@ certifi==2023.11.17
 charset-normalizer==3.3.2
     # via requests
 click==8.1.7
-    # via
-    #   typer
-    #   uvicorn
+    # via uvicorn
-colorama==0.4.6
-    # via typer
-contourpy==1.2.0
-    # via matplotlib
-cycler==0.12.1
-    # via matplotlib
-datasets==2.14.4
-    # via -r requirements.in
-dill==0.3.7
-    # via
-    #   datasets
-    #   multiprocess
 fastapi==0.109.0
-    # via gradio
+    # via -r requirements.in
 fasttext==0.9.2
     # via -r requirements.in
-ffmpy==0.3.1
-    # via gradio
 filelock==3.13.1
     # via huggingface-hub
-fonttools==…
-    # via matplotlib
-frozenlist==1.4.1
-    # via
-    #   aiohttp
-    #   aiosignal
-fsspec[http]==2023.12.2
-    # via
-    #   datasets
-    #   gradio-client
-    #   huggingface-hub
-gradio==4.15.0
-    # via -r requirements.in
-gradio-client==0.8.1
-    # via gradio
+fsspec==2023.12.2
+    # via huggingface-hub
 h11==0.14.0
     # via
     #   httpcore
     #   uvicorn
 httpcore==1.0.2
     # via httpx
+httptools==0.6.1
+    # via uvicorn
 httpx==0.26.0
-    # via
-    #   -r requirements.in
-    #   gradio
-    #   gradio-client
+    # via -r requirements.in
 huggingface-hub==0.20.3
-    # via
-    #   -r requirements.in
-    #   datasets
-    #   gradio
-    #   gradio-client
+    # via -r requirements.in
 idna==3.6
     # via
     #   anyio
     #   httpx
     #   requests
-    #   yarl
-importlib-resources==6.1.1
-    # via gradio
-iso639-lang==2.2.2
-    # via -r requirements.in
-jinja2==3.1.3
-    # via
-    #   altair
-    #   gradio
-jsonschema==4.21.1
-    # via altair
-jsonschema-specifications==2023.12.1
-    # via jsonschema
-kiwisolver==1.4.5
-    # via matplotlib
 markdown-it-py==3.0.0
     # via rich
-markupsafe==2.1.4
-    # via
-    #   gradio
-    #   jinja2
-matplotlib==3.8.2
-    # via gradio
 mdurl==0.1.2
     # via markdown-it-py
-multidict==6.0.4
-    # via
-    #   aiohttp
-    #   yarl
-multiprocess==0.70.15
-    # via datasets
 numpy==1.26.3
-    # via
-    #   altair
-    #   contourpy
-    #   datasets
-    #   fasttext
-    #   gradio
-    #   matplotlib
-    #   pandas
-    #   pyarrow
+    # via fasttext
-orjson==3.9.12
-    # via gradio
 packaging==23.2
-    # via
-    #   altair
-    #   datasets
-    #   gradio
-    #   gradio-client
-    #   huggingface-hub
-    #   matplotlib
+    # via huggingface-hub
-pandas==2.2.0
-    # via
-    #   altair
-    #   datasets
-    #   gradio
-pillow==10.2.0
-    # via
-    #   gradio
-    #   matplotlib
-pyarrow==15.0.0
-    # via datasets
 pybind11==2.11.1
     # via fasttext
 pydantic==2.5.3
-    # via
-    #   fastapi
-    #   gradio
+    # via fastapi
 pydantic-core==2.14.6
     # via pydantic
-pydub==0.25.1
-    # via gradio
 pygments==2.17.2
     # via rich
-pyparsing==3.1.1
-    # via matplotlib
-python-dateutil==2.8.2
-    # via
-    #   matplotlib
-    #   pandas
 python-dotenv==1.0.1
-    # via
-    #   -r requirements.in
-    #   …
+    # via
+    #   -r requirements.in
+    #   uvicorn
-pytz==2023.3.post1
-    # via pandas
 pyyaml==6.0.1
     # via
-    #   datasets
-    #   gradio
     #   huggingface-hub
+    #   uvicorn
-referencing==…
-    # via
-    #   jsonschema
-    #   jsonschema-specifications
 requests==2.31.0
-    # via
-    #   datasets
-    #   fsspec
-    #   huggingface-hub
+    # via huggingface-hub
 rich==13.7.0
-    # via
-    #   -r requirements.in
-    #   typer
+    # via -r requirements.in
-rpds-py==0.17.1
-    # via
-    #   jsonschema
-    #   referencing
-ruff==0.1.14
-    # via gradio
-semantic-version==2.10.0
-    # via gradio
-shellingham==1.5.4
-    # via typer
-six==1.16.0
-    # via python-dateutil
 sniffio==1.3.0
     # via
     #   anyio
     #   httpx
 starlette==0.35.1
     # via fastapi
-tomlkit==0.12.0
-    # via gradio
 toolz==0.12.0
-    # via
-    #   -r requirements.in
-    #   altair
+    # via -r requirements.in
 tqdm==4.66.1
-    # via
-    #   datasets
-    #   huggingface-hub
+    # via huggingface-hub
-typer[all]==0.9.0
-    # via
-    #   gradio
-    #   typer
 typing-extensions==4.9.0
     # via
     #   fastapi
-    #   gradio
-    #   gradio-client
     #   huggingface-hub
     #   pydantic
     #   pydantic-core
-    #   typer
-tzdata==2023.4
-    # via pandas
 urllib3==2.1.0
     # via requests
-uvicorn==0.27.0
-    # via gradio
+uvicorn[standard]==0.27.0
+    # via -r requirements.in
+uvloop==0.19.0
+    # via uvicorn
+watchfiles==0.21.0
+    # via uvicorn
 websockets==11.0.3
-    # via gradio-client
+    # via uvicorn
-xxhash==3.4.1
-    # via datasets
-yarl==1.9.4
-    # via aiohttp
 
 # The following packages are considered to be unsafe in a requirements file:
 # setuptools