File size: 3,721 Bytes
7ab64d3
 
 
 
 
8b71023
7ab64d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b71023
7ab64d3
 
 
 
 
 
8b71023
7ab64d3
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import gradio as gr
from huggingface_hub import HfApi
import duckdb
from datasets import load_dataset
import pandas as pd
import os, time, sys, json, random

# Page-wide CSS handed to gr.Blocks(css=...): the universal selector gives
# every element the one-shot 3s "gow" keyframe animation, which scales it
# from 0.1 up to 1.0 and keeps the final frame (forwards).
custom_css="""
* { animation: gow 3s 1 forwards; } @keyframes gow { from { transform: scale(0.1); } to { transform: scale(1.0); } }
"""
# Script handed to gr.Blocks(head=...). It:
#   - keeps the latest mouse coordinates in the global `mouse` object;
#   - on touchstart, records the touch coordinates and sets the background of
#     every element to a single random HSLA colour;
#   - defines the global `bkd`, a CSS url(...) value for the Hugging Face logo,
#     which the iface.load JS handler later assigns to element backgrounds.
head_js="""
<script>var mouse = {x: undefined,y: undefined};var newX;var newY;window.addEventListener('mousemove',function (event) {mouse.x = event.x;mouse.y = event.y;});window.addEventListener('touchstart', function (event) {let touchtart = event.touches[0];event.preventDefault();mouse.x = touchtart.clientX;mouse.y = touchtart.clientY;newX = mouse.x;newY = mouse.y;var colr = 'hsla('+Math.floor(Math.random() * 360)+','+Math.floor(Math.random() * 100)+'%,'+Math.floor(Math.random() * 50)+'%,'+(Math.random() * 1)+')';document.querySelectorAll('*').forEach(item =>{ item.style.backgroundColor=colr; });}, false);var bkd = 'url('+String("https://huggingface.co/front/assets/huggingface_logo-noborder.svg")+')';</script>
"""

# Hub client shared by the initial listing below and the search callback.
api = HfApi()

# Initial checkbox choices: at most 100 ungated English datasets tagged with
# the text-generation task category.
datasets = api.list_datasets(
    filter="task_categories:text-generation",
    language="en",
    gated=False,
    limit=100,
)

# Module-level defaults: the CSV output path and an empty scratch list.
outf = './output.csv'
lst = []

def looky(value):
    """Search the Hub for ``value`` and return a refreshed checkbox group.

    Args:
        value: Search text; it is formatted into a string before being sent
            as the ``search`` argument.

    Returns:
        A ``gr.CheckboxGroup`` labelled "Select Datasets" whose choices are
        the ids of up to 100 ungated English datasets matching the query.
    """
    query = f"{value}"
    matches = api.list_datasets(
        search=query,
        language="en",
        gated=False,
        limit=100,
    )
    choice_ids = []
    for entry in matches:
        choice_ids.append(entry.id)
    return gr.CheckboxGroup(choice_ids, label="Select Datasets")
def preview(selected):
    """Build a preview table with the first 3 'train' rows of each dataset.

    Args:
        selected: Iterable of dataset ids to sample from.

    Returns:
        A ``gr.Dataframe`` whose value is a pandas DataFrame holding the
        streamed records of all selected datasets, in selection order.
    """
    # Stream each dataset so only the three previewed records are pulled.
    rows = [
        record
        for dataset_id in selected
        for record in load_dataset(dataset_id, split='train', streaming=True).take(3)
    ]
    frame = pd.DataFrame(rows)
    return gr.Dataframe(headers=["Dataset", "Sample"], value=frame)
    
def build_dataset(selected_datasets, num_samples):
    """Sample the selected datasets, combine the rows and export them as CSV.

    For every dataset id the first ``num_samples`` records of its 'train'
    split are streamed and appended to one combined table. The table is
    loaded into an in-memory DuckDB database, read back as rows and also
    written to ``./output.csv``.

    Args:
        selected_datasets: Iterable of dataset ids; ``None`` or an empty
            iterable means nothing is selected.
        num_samples: Number of records to take per dataset. Floats are
            truncated to ``int``; ``None`` and negative values count as 0.

    Returns:
        A ``(rows, csv_path)`` tuple. ``rows`` is the list of row tuples
        fetched from DuckDB and ``csv_path`` is the path of the written CSV.
        When no rows were collected, ``([], None)`` is returned and no file
        is written.
    """
    outf = './output.csv'

    # The UI number field may deliver a float (e.g. 10.0) or None when it is
    # cleared; the streaming ``take`` call needs a non-negative integer.
    sample_count = 0 if num_samples is None else max(int(num_samples), 0)

    combined_data = []
    for dataset in selected_datasets or []:
        data = load_dataset(dataset, split='train', streaming=True).take(sample_count)
        combined_data.extend(data)

    # A DataFrame without rows has no columns, which cannot back a table, so
    # report "nothing built" instead of failing inside DuckDB.
    if not combined_data:
        return [], None

    # NOTE: the SQL below refers to this local by name ("FROM df"); keep the
    # variable name and the query in sync.
    df = pd.DataFrame(combined_data)

    con = duckdb.connect(database=':memory:')
    try:
        con.execute("CREATE TABLE dataset AS SELECT * FROM df")
        result = con.execute("SELECT * FROM dataset").fetchall()
        # outf is a fixed local path, not user input, so interpolating it
        # into the statement is safe.
        con.execute(f"COPY (SELECT * FROM dataset) TO '{outf}' (HEADER, DELIMITER ',');")
    finally:
        # Always release the in-memory database, even if a statement fails.
        con.close()
    return result, outf

# Assemble the UI: a preview table, a Hub search box, the dataset multi-select,
# the per-dataset sample count, and the build button with its two outputs.
with gr.Blocks(head=head_js,css=custom_css) as iface:
    # Filled by `preview` whenever the dataset selection changes.
    frst_sample=gr.Dataframe(value=None,label="View 3 Samples per selected dataset")

    srchbx=gr.Textbox(label="Search datasets",placeholder="Search Datasets on the Hub. Type query..hit Enter.. this will update the dataset list below..")
    with gr.Accordion("Multi-Select Datasets", open=False):
        with gr.Row():
            # Initial choices come from the module-level `datasets` listing.
            dataset_selector = gr.CheckboxGroup([d.id for d in datasets], label="Multi-Select Datasets")
    num_samples_input = gr.Number(value=10, label="Number of Samples to retrieve per Dataset")
    # elem_id "moish" is looked up by the load-time JS below.
    build_button = gr.Button("Build Dataset", elem_id="moish")

    # Outputs of build_dataset: the generated CSV file and the combined rows.
    out_way = gr.File()
    output_display = gr.Dataframe(headers=["Dataset", "Sample"])
    build_button.click(fn=build_dataset,inputs=[dataset_selector, num_samples_input],outputs=[output_display,out_way])
    dataset_selector.change(preview,dataset_selector,frst_sample)
    # Search on Enter, as the placeholder text tells the user, instead of
    # issuing a Hub request on every edit of the textbox.
    srchbx.submit(looky,srchbx,dataset_selector)
    # Load-time JS: paints every element with one random RGBA colour, then
    # walks up from the build button assigning `bkd` (defined in head_js) as
    # the background of each ancestor.
    # NOTE(review): the parent walk appears to continue up to `document`, which
    # has no `.style`; confirm whether the resulting error matters.
    iface.load(None,None,None,js="""() =>{var colr = 'rgba('+Math.floor(Math.random() * 256)+','+Math.floor(Math.random() * 256)+','+Math.floor(Math.random() * 256)+','+(Math.random() * 1)+')'; document.querySelectorAll('*').forEach(item =>{ item.style.backgroundColor=colr; }); var tin = document.getElementById('moish'); var parents=[]; function getAllParentNodes(element) {while (element.parentNode) {element = element.parentNode; element.style.background = bkd; parents.push(element); }; }; getAllParentNodes(tin);}""",)

# Start the app; debug=True is passed through to Gradio's launch.
iface.launch(debug=True)