davanstrien committed
Commit ef19caa
Parent: 9915c6f

Refactor app.py: Import modules, update function parameters, and improve logging

Files changed (1):
1. app.py (+24, -21)
app.py CHANGED
@@ -1,16 +1,15 @@
-import gradio as gr
-from httpx import Client
-import random
 import os
+import random
+from statistics import mean
+from typing import Iterator, Union
+
 import fasttext
-from huggingface_hub import hf_hub_download
-from typing import Union
-from typing import Iterator
+import gradio as gr
 from dotenv import load_dotenv
-from toolz import groupby, valmap, concat
-from statistics import mean
-from httpx import Timeout
+from httpx import Client, Timeout
+from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import logging
+from toolz import concat, groupby, valmap
 
 logger = logging.get_logger(__name__)
 load_dotenv()
@@ -24,6 +23,7 @@ headers = {
 }
 timeout = Timeout(60, read=120)
 client = Client(headers=headers, timeout=timeout)
+# async_client = AsyncClient(headers=headers, timeout=timeout)
 # non exhaustive list of columns that might contain text which can be used for language detection
 # we prefer to use columns in this order i.e. if there is a column named "text" we will use it first
 TARGET_COLUMN_NAMES = {
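The commented-out `async_client` line anticipates an async variant of the same setup; a minimal sketch of what that could look like with `httpx.AsyncClient` (an assumption, not code from this commit):

```python
import asyncio

from httpx import AsyncClient, Timeout


async def fetch_json(url: str) -> dict:
    timeout = Timeout(60, read=120)  # 60s default, but allow 120s to read the body
    async with AsyncClient(timeout=timeout) as async_client:
        response = await async_client.get(url)
        response.raise_for_status()
        return response.json()


# asyncio.run(fetch_json("https://datasets-server.huggingface.co/rows?dataset=..."))
```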
@@ -73,10 +73,10 @@ def get_dataset_info(hub_id: str, config: str | None = None):
 
 
 def get_random_rows(
-    hub_id,
-    total_length,
-    number_of_rows,
-    max_request_calls,
+    hub_id: str,
+    total_length: int,
+    number_of_rows: int,
+    max_request_calls: int,
     config="default",
     split="train",
 ):
@@ -88,8 +88,9 @@ def get_random_rows(
     for _ in range(min(max_request_calls, number_of_rows // rows_per_call)):
         offset = random.randint(0, total_length - rows_per_call)
         url = f"https://datasets-server.huggingface.co/rows?dataset={hub_id}&config={config}&split={split}&offset={offset}&length={rows_per_call}"
+        logger.info(f"Fetching {url}")
+        print(url)
         response = client.get(url)
-
         if response.status_code == 200:
             data = response.json()
             batch_rows = data.get("rows")
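For context, the loop above samples the datasets-server `/rows` endpoint at random offsets. A minimal standalone sketch of that pattern (the 100-row page size and the response shape are assumptions, not taken from this diff):

```python
import random

from httpx import Client

client = Client()


def sample_batch(hub_id: str, total_length: int, rows_per_call: int = 100) -> list[dict]:
    # pick a random window of rows_per_call rows somewhere in the split
    offset = random.randint(0, total_length - rows_per_call)
    url = (
        "https://datasets-server.huggingface.co/rows"
        f"?dataset={hub_id}&config=default&split=train"
        f"&offset={offset}&length={rows_per_call}"
    )
    response = client.get(url)
    response.raise_for_status()
    # assumed response shape: {"rows": [{"row_idx": ..., "row": {...}}, ...]}
    return [item["row"] for item in response.json().get("rows", [])]
```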
@@ -107,10 +108,6 @@ def load_model(repo_id: str) -> fasttext.FastText._FastText:
     return fasttext.load_model(model_path)
 
 
-# def predict_language_for_rows(rows: list[dict], target_column_names: list[str] | str):
-#     pass
-
-
 def yield_clean_rows(rows: Union[list[str], str], min_length: int = 3) -> Iterator[str]:
     for row in rows:
         if isinstance(row, str):
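`load_model` above pairs `hf_hub_download` with `fasttext.load_model`. A hedged end-to-end sketch of that flow (the repo id and filename are illustrative placeholders, not necessarily what the app loads):

```python
import fasttext
from huggingface_hub import hf_hub_download

# placeholder model repo; the repo id the app actually uses is not shown in this diff
model_path = hf_hub_download("facebook/fasttext-language-identification", "model.bin")
model = fasttext.load_model(model_path)

# fastText returns parallel sequences of labels and probabilities
labels, scores = model.predict("this is clearly an english sentence", k=1)
print(labels[0], float(scores[0]))  # e.g. __label__eng_Latn 0.98
```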
@@ -186,7 +183,8 @@ def predict_language(
     config: str | None = None,
     split: str | None = None,
     max_request_calls: int = 10,
-):
+    number_of_rows: int = 1000,
+) -> dict[str, float | str]:
     is_valid = datasets_server_valid_rows(hub_id)
     if not is_valid:
         gr.Error(f"Dataset {hub_id} is not accessible via the datasets server.")
@@ -202,7 +200,7 @@ def predict_language(
     logger.info(f"Column names: {column_names}")
     if not set(column_names).intersection(TARGET_COLUMN_NAMES):
         raise gr.Error(
-            f"Dataset {hub_id} does not contain any of the target columns {TARGET_COLUMN_NAMES}"
+            f"Dataset {hub_id} {column_names} is not in any of the target columns {TARGET_COLUMN_NAMES}"
         )
     for column in TARGET_COLUMN_NAMES:
         if column in column_names:
@@ -210,7 +208,12 @@ def predict_language(
             logger.info(f"Using column {target_column} for language detection")
             break
     random_rows = get_random_rows(
-        hub_id, total_rows_for_split, 1000, max_request_calls, config, split
+        hub_id,
+        total_rows_for_split,
+        number_of_rows,
+        max_request_calls,
+        config,
+        split,
     )
     logger.info(f"Predicting language for {len(random_rows)} rows")
     predictions = predict_rows(random_rows, target_column)
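With the refactor, the sample size is now a caller-controlled parameter. A hypothetical invocation of the new signature (the dataset id is a placeholder; the positional `hub_id` argument and keyword names follow the diff above):

```python
from app import predict_language  # the module this commit modifies

result = predict_language(
    "some-user/some-dataset",  # placeholder dataset id
    config="default",
    split="train",
    max_request_calls=10,
    number_of_rows=1000,  # new parameter introduced in this commit
)
# per the new return annotation, result is a dict[str, float | str]
print(result)
```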