from itertools import count, islice
from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, TypeVar

import gradio as gr
import requests
import pandas as pd
from datasets import Features
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from requests.adapters import HTTPAdapter, Retry

from analyze import run_dataspeech
# import spaces

MAX_ROWS = 100
T = TypeVar("T")
# Retry transient gateway errors when polling the datasets-server API. All
# requests below use HTTPS, so the retrying adapter is mounted on "https://".
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retries))


def stream_rows(dataset: str, config: str, split: str) -> Iterable[Dict[str, Any]]:
    """Yield rows from the datasets-server /rows API, paginating 100 rows at a time."""
    batch_size = 100
    for i in count():
        rows_resp = session.get(
            "https://datasets-server.huggingface.co/rows"
            f"?dataset={dataset}&config={config}&split={split}"
            f"&offset={i * batch_size}&length={batch_size}",
            timeout=10,
        ).json()
        if "error" in rows_resp:
            raise RuntimeError(rows_resp["error"])
        if not rows_resp["rows"]:
            break
        for row_item in rows_resp["rows"]:
            yield row_item["row"]
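
# A minimal usage sketch (the split name here is illustrative; real config and
# split names come from the /info endpoint queried in analyze_dataset below):
#
#   for row in islice(stream_rows("blabble-io/libritts_r", "clean", "dev.clean"), 3):
#       print(row.keys())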

class track_iter:
    """Wrap an iterable and count how many items have been consumed so far."""

    def __init__(self, it: Iterable[T]):
        self.it = it
        self.next_idx = 0

    def __iter__(self) -> Iterator[T]:
        for item in self.it:
            self.next_idx += 1
            yield item
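
# Because next_idx is incremented as items are consumed, analyze_dataset can
# report progress mid-stream without materializing the whole dataset:
#
#   it = track_iter(iter(range(5)))
#   _ = list(it)
#   assert it.next_idx == 5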


def report(next_row_idx: int, num_rows: int) -> str:
    if next_row_idx == num_rows:
        return f"Scan finished: {num_rows} samples analyzed"
    return f"Tagging in progress - {next_row_idx / num_rows * 100:.1f}% of rows analyzed..."


# @spaces.GPU(duration=80)
def analyze_dataset(dataset: str, audio_column_name: str, text_column_name: str, configuration_name: Optional[str] = None, split_name: Optional[str] = None) -> Iterable[Tuple[str, Any]]:
    info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
    if "error" in info_resp:
        yield "❌ " + info_resp["error"], gr.DataFrame()
        return

    if configuration_name in info_resp["dataset_info"]:
        config = configuration_name
    elif configuration_name:
        yield "❌ " + f"The configuration you've passed `{configuration_name}` was not found in the dataset configs: {', '.join(info_resp['dataset_info'].keys())}. Try again with the right config name.", gr.DataFrame()
        return
    else:
        config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))

    features = Features.from_dict(info_resp["dataset_info"][config]["features"])
    if split_name in info_resp["dataset_info"][config]["splits"]:
        split = split_name
    elif split_name:
        yield "❌ " + f"The split you've passed `{split_name}` was not found in the dataset splits: {', '.join(info_resp['dataset_info'][config]['splits'])}. Try again with the right split name.", gr.DataFrame()
        return
    else:
        split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(iter(info_resp["dataset_info"][config]["splits"]))
    num_rows = min(info_resp["dataset_info"][config]["splits"][split]["num_examples"], MAX_ROWS)
    rows = track_iter(islice(stream_rows(dataset, config, split), MAX_ROWS))
    
    if audio_column_name not in features:
        yield "❌ " + f"The audio column name you've passed `{audio_column_name}` was not found in the dataset columns: {', '.join(features.keys())}. Try again with the right column name.", gr.DataFrame()
        return
    if text_column_name not in features:
        yield "❌ " + f"The text column name you've passed `{text_column_name}` was not found in the dataset columns: {', '.join(features.keys())}. Try again with the right column name.", gr.DataFrame()
        return
    if "gender" in features:
        yield "Gender has been detected. We'll compute pitch.", pd.DataFrame()
    
    dataframe = []
    # Initialized up front so the final yield below is valid even if
    # run_dataspeech produces no batches.
    headers: List[str] = []
    datatype: List[str] = []
    for batch in run_dataspeech(
        rows, audio_column_name, text_column_name
    ):
        headers = list(batch[0].keys())
        dataframe.extend(list(sample.values()) for sample in batch)
        datatype = ["str" if col != audio_column_name else "markdown" for col in headers]
        yield (report(next_row_idx=rows.next_idx, num_rows=num_rows), gr.DataFrame(dataframe, headers=headers, datatype=datatype, wrap=True))
    yield (report(next_row_idx=rows.next_idx, num_rows=num_rows), gr.DataFrame(dataframe, headers=headers, datatype=datatype, wrap=True))
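
# Note: analyze_dataset is a generator; Gradio streams each yielded
# (status, dataframe) pair into the two output components declared below, so
# the results table fills in batch by batch rather than only at the end.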

with gr.Blocks() as demo:
    gr.Markdown("# Analyze speech dataset using Data-Speech")
    gr.Markdown("The space takes an HF dataset name as an input, as well as the audio column name to analyze, and returns the speaking rate, noise level, reverberation level, monotony level and pitch. Note that pitch is only computed if a `speaker_id` column and a `gender` column are found.")
    hub_search = HuggingfaceHubSearch(
            label="Hub Dataset ID",
            placeholder="Search for dataset id on Huggingface",
            search_type="dataset",
        )
    audio_column_name = gr.Textbox(
            value="audio",
            label="Audio column name.",
        )
    text_column_name = gr.Textbox(
            value="text",
            label="Transcription column name.",
        )
    
    with gr.Accordion("(Optional) Specify the configuration and split of the dataset to be analyzed", open=False):
        configuration_name = gr.Textbox(
            value=None,
            label="Configuration name.",
        )
        split_name = gr.Textbox(
            value=None,
            label="Split name.",
        )

    button = gr.Button("Run Data-Speech Scan")
    outputs = [
        gr.Label(show_label=False),
        gr.DataFrame(),
    ]    
    
    
    button.click(analyze_dataset, [hub_search, audio_column_name, text_column_name, configuration_name, split_name], outputs)
    gr.Examples(
        [
            ["blabble-io/libritts_r", "audio", "text_normalized", "clean"],
            ["blabble-io/libritts_r", "audio", "text_normalized", "other"],
            ["espnet/yodas", "audio", "text", "en000",],
            ["ylacombe/english_dialects", "audio", "text"]
        ],
        [hub_search, audio_column_name, text_column_name, configuration_name],
        outputs,
        fn=analyze_dataset,
        run_on_click=True,
        cache_examples=False,
    )

demo.launch(debug=False)