import time
from functools import partial
from typing import Iterator

import gradio as gr
import requests.exceptions
from huggingface_hub import InferenceClient

model_id = "microsoft/Phi-3-mini-4k-instruct"
client = InferenceClient(model_id)
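
# Prompt templates: the first asks the model for a numbered list of plausible
# dataset names with tags, the second asks for a short description plus a
# 5-row CSV preview for one of those datasets.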
GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY = (
    "A Machine Learning practitioner is looking for a dataset that matches '{search_query}'. "
    "Generate a list of 10 names of quality datasets that don't exist but sound plausible and would "
    "be helpful. Feel free to reuse words from the query '{search_query}' to name the datasets. "
    "Every dataset should be about '{search_query}' and have descriptive tags/keywords including the ML task name associated with the dataset (classification, regression, anomaly detection, etc.). Use the following format:\n1. DatasetName1 (tag1, tag2, tag3)\n2. DatasetName2 (tag1, tag2, tag3)"
)
GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS = (
    "An ML practitioner is looking for a dataset CSV for the query '{search_query}'. "
    "Generate the first 5 rows of a plausible and quality CSV for the dataset '{dataset_name}'. "
    "You can get inspiration from related keywords '{tags}' but most importantly the dataset should correspond to the query '{search_query}'. "
    "Focus on quality text content and use a 'label' or 'labels' column if it makes sense (invent labels, avoid reusing the keywords, be accurate while labelling texts). "
    "Reply using a short description of the dataset with title **Dataset Description:** followed by the CSV content in a code block and with title **CSV Content Preview:**."
)

default_query = "various datasets on many different subjects and topics, from classification to language modeling, from science to sport to finance to news"
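
# Stream tokens from the chat-completion endpoint, retrying up to 3 times on
# connection errors. A retry restarts the generation from scratch.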
def stream_response(msg: str, max_tokens=500) -> Iterator[str]:
    for _ in range(3):
        try:
            for message in client.chat_completion(
                messages=[{"role": "user", "content": msg}],
                max_tokens=max_tokens,
                stream=True,
            ):
                yield message.choices[0].delta.content or ""  # delta.content may be None on some chunks
        except requests.exceptions.ConnectionError as e:
            print(f"{e}\n\nRetrying in 1 sec")
            time.sleep(1)
            continue
        break
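
# Generate dataset names for a search query, yielding the accumulated text
# after every complete line so the UI can refresh line by line.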
def gen_datasets(search_query: str) -> Iterator[str]:
    search_query = search_query[:1000] if search_query.strip() else default_query
    generated_text = ""
    for token in stream_response(GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY.format(search_query=search_query)):
        generated_text += token
        if generated_text.endswith("\n"):
            yield generated_text.strip()
    yield generated_text.strip()
    print("-----\n\n" + generated_text)
def gen_dataset_content(search_query: str, dataset_name: str, tags: str) -> Iterator[str]:
    search_query = search_query[:1000] if search_query.strip() else default_query
    generated_text = ""
    for token in stream_response(GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS.format(
        search_query=search_query,
        dataset_name=dataset_name,
        tags=tags,
    ), max_tokens=1500):
        generated_text += token
        yield generated_text
    print("-----\n\n" + generated_text)
NB_ITEMS_PER_PAGE = 10

default_output = """
1. NewsEventsPredict (classification, media, trend)
2. FinancialForecast (economy, stocks, regression)
3. HealthMonitor (science, real-time, anomaly detection)
4. SportsAnalysis (classification, performance, player tracking)
5. SciLiteracyTools (language modeling, science literacy, text classification)
6. RetailSalesAnalyzer (consumer behavior, sales trend, segmentation)
7. SocialSentimentEcho (social media, emotion analysis, clustering)
8. NewsEventTracker (classification, public awareness, topical clustering)
9. HealthVitalSigns (anomaly detection, biometrics, prediction)
10. GameStockPredict (classification, finance, sports contingency)
""".strip().split("\n")
assert len(default_output) == NB_ITEMS_PER_PAGE
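
# CSS: left-aligned result buttons (dataset name on top, tags below) and a
# shimmer animation for loading placeholders (text is transparent and the
# animated gradient is clipped to the glyphs).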
| css = """ | |
| .datasetButton { | |
| justify-content: start; | |
| justify-content: left; | |
| } | |
| .tags { | |
| font-size: var(--button-small-text-size); | |
| color: var(--body-text-color-subdued); | |
| } | |
| a { | |
| color: var(--body-text-color); | |
| } | |
| .topButton { | |
| justify-content: start; | |
| justify-content: left; | |
| text-align: left; | |
| background: transparent; | |
| box-shadow: none; | |
| padding-bottom: 0; | |
| } | |
| .topButton::before { | |
| content: url("data:image/svg+xml,%3Csvg style='color: rgb(209 213 219)' xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink' aria-hidden='true' focusable='false' role='img' width='1em' height='1em' preserveAspectRatio='xMidYMid meet' viewBox='0 0 25 25'%3E%3Cellipse cx='12.5' cy='5' fill='currentColor' fill-opacity='0.25' rx='7.5' ry='2'%3E%3C/ellipse%3E%3Cpath d='M12.5 15C16.6421 15 20 14.1046 20 13V20C20 21.1046 16.6421 22 12.5 22C8.35786 22 5 21.1046 5 20V13C5 14.1046 8.35786 15 12.5 15Z' fill='currentColor' opacity='0.5'%3E%3C/path%3E%3Cpath d='M12.5 7C16.6421 7 20 6.10457 20 5V11.5C20 12.6046 16.6421 13.5 12.5 13.5C8.35786 13.5 5 12.6046 5 11.5V5C5 6.10457 8.35786 7 12.5 7Z' fill='currentColor' opacity='0.5'%3E%3C/path%3E%3Cpath d='M5.23628 12C5.08204 12.1598 5 12.8273 5 13C5 14.1046 8.35786 15 12.5 15C16.6421 15 20 14.1046 20 13C20 12.8273 19.918 12.1598 19.7637 12C18.9311 12.8626 15.9947 13.5 12.5 13.5C9.0053 13.5 6.06886 12.8626 5.23628 12Z' fill='currentColor'%3E%3C/path%3E%3C/svg%3E"); | |
| margin-right: .25rem; | |
| margin-left: -.125rem; | |
| margin-top: .25rem; | |
| } | |
| .bottomButton { | |
| justify-content: start; | |
| justify-content: left; | |
| text-align: left; | |
| background: transparent; | |
| box-shadow: none; | |
| font-size: var(--button-small-text-size); | |
| color: var(--body-text-color-subdued); | |
| padding-top: 0; | |
| align-items: baseline; | |
| } | |
| .bottomButton::before { | |
| content: 'tags:'; | |
| margin-right: .25rem; | |
| } | |
| .buttonsGroup { | |
| background: transparent; | |
| } | |
| .buttonsGroup:hover { | |
| background: var(--input-background-fill); | |
| } | |
| .buttonsGroup div { | |
| background: transparent; | |
| } | |
| @keyframes placeHolderShimmer{ | |
| 0%{ | |
| background-position: -468px 0 | |
| } | |
| 100%{ | |
| background-position: 468px 0 | |
| } | |
| } | |
| .linear-background { | |
| animation-duration: 1s; | |
| animation-fill-mode: forwards; | |
| animation-iteration-count: infinite; | |
| animation-name: placeHolderShimmer; | |
| animation-timing-function: linear; | |
| background-image: linear-gradient(to right, var(--body-text-color-subdued) 8%, #dddddd11 18%, var(--body-text-color-subdued) 33%); | |
| background-size: 1000px 104px; | |
| color: transparent; | |
| background-clip: text; | |
| } | |
| """ | |
def search_datasets(search_query):
    output_values = [
        gr.Button("⬛⬛⬛⬛⬛⬛", elem_classes="topButton linear-background"),
        gr.Button("████, ████, ████", elem_classes="bottomButton linear-background"),
    ] * NB_ITEMS_PER_PAGE
    for generated_text in gen_datasets(search_query):
        if "I'm sorry" in generated_text:
            raise gr.Error("Error: inappropriate content")
        lines = [line for line in generated_text.split("\n") if line and line.split(".", 1)[0].isnumeric()][:NB_ITEMS_PER_PAGE]
        for i, line in enumerate(lines):
            dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" (", 1)
            output_values[2 * i] = gr.Button(dataset_name, elem_classes="topButton")
            output_values[2 * i + 1] = gr.Button(tags, elem_classes="bottomButton")
        yield output_values
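
# Show the dataset page for the i-th result (i is bound with functools.partial
# for each button pair) and stream in the generated content.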
def show_dataset(search_query, *buttons_values, i):
    dataset_name, tags = buttons_values[2 * i : 2 * i + 2]
    dataset_title = f"# {dataset_name}\n\n tags: {tags}\n\n _Note: This is an AI-generated dataset so its content may be inaccurate or false_"
    yield gr.Column(visible=False), gr.Column(visible=True), dataset_title, ""
    for generated_text in gen_dataset_content(search_query=search_query, dataset_name=dataset_name, tags=tags):
        yield gr.Column(), gr.Column(), dataset_title, generated_text

def show_search_page():
    return gr.Column(visible=True), gr.Column(visible=False)

def generate_full_dataset():
    raise gr.Error("Not implemented yet, sorry! Give me some feedback in the Community tab in the meantime ;)")
with gr.Blocks(css=css) as demo:
    with gr.Row():
        with gr.Column(scale=4, min_width=0):
            pass
        with gr.Column(scale=10):
            gr.Markdown(
                "# 🤗 Infinite Dataset Hub ♾️\n\n"
                "An endless catalog of datasets, created just for you.\n\n"
            )
        with gr.Column(scale=4, min_width=0):
            pass
    with gr.Column() as search_page:
        with gr.Row():
            with gr.Column(scale=4, min_width=0):
                pass
            with gr.Column(scale=9):
                search_bar = gr.Textbox(max_lines=1, placeholder="Search datasets, get infinite results", show_label=False, container=False)
            with gr.Column(min_width=64):
                search_button = gr.Button("🔍", variant="primary")
            with gr.Column(scale=4, min_width=0):
                pass
        inputs = [search_bar]
        show_dataset_outputs = [search_page]
        with gr.Row():
            with gr.Column(scale=4, min_width=0):
                pass
            with gr.Column(scale=10):
                buttons = []
                for i in range(NB_ITEMS_PER_PAGE):
                    line = default_output[i]
                    dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" (", 1)
                    with gr.Group(elem_classes="buttonsGroup"):
                        top = gr.Button(dataset_name, elem_classes="topButton")
                        bottom = gr.Button(tags, elem_classes="bottomButton")
                    buttons += [top, bottom]
                    top.click(partial(show_dataset, i=i), inputs=inputs, outputs=show_dataset_outputs)
                    bottom.click(partial(show_dataset, i=i), inputs=inputs, outputs=show_dataset_outputs)
                inputs += buttons
                gr.Markdown(f"_powered by [{model_id}](https://huggingface.co/{model_id})_")
            with gr.Column(scale=4, min_width=0):
                pass
        search_bar.submit(search_datasets, inputs=search_bar, outputs=buttons)
        search_button.click(search_datasets, inputs=search_bar, outputs=buttons)
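
    # Dataset page: title, streamed content preview, a "Generate Full Dataset"
    # button (not implemented) and a back button to return to the search page.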
    with gr.Column(visible=False) as dataset_page:
        with gr.Row():
            with gr.Column(scale=4, min_width=0):
                pass
            with gr.Column(scale=10):
                dataset_title = gr.Markdown()
                dataset_content = gr.Markdown()
                with gr.Row():
                    with gr.Column(scale=4, min_width=0):
                        pass
                    with gr.Column():
                        generate_full_dataset_button = gr.Button("Generate Full Dataset", variant="primary")
                        generate_full_dataset_button.click(generate_full_dataset)
                        back_button = gr.Button("< Back", size="sm")
                        back_button.click(show_search_page, inputs=[], outputs=[search_page, dataset_page])
                    with gr.Column(scale=4, min_width=0):
                        pass
            with gr.Column(scale=4, min_width=0):
                pass
    show_dataset_outputs += [dataset_page, dataset_title, dataset_content]

demo.launch()