Spaces:

airtrain-ai
/

fineweb-edu-fortified-search-demo

Runtime error

App Files Files Community

josh-sematic committed on Aug 12, 2024

Commit

c736f27

1 Parent(s): e1e3da4

Add app

Browse files

Files changed (4) hide show

.gitignore +3 -0
README.md +25 -7
app.py +174 -0
requirements.txt +4 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+.venv
+.csv
+data

README.md CHANGED Viewed

@@ -1,13 +1,31 @@
 ---
-title: Fineweb Edu Fortified Search Demo
-emoji: 🏃
-colorFrom: indigo
-colorTo: yellow
 sdk: gradio
-sdk_version: 4.41.0
 app_file: app.py
 pinned: false
-license: apache-2.0
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Fineweb-edu-fortified Semantic Search Demo
+emoji: 📚
 sdk: gradio
+sdk_version: 4.31.5
 app_file: app.py
 pinned: false
+datasets:
+- airtrain-ai/fineweb-edu-fortified
+- HuggingFaceFW/fineweb-edu
+models:
+- TaylorAI/bge-micro
 ---
+# Semantic Search on Fineweb-edu-fortified sample
+This demo performs semantic search on one crawl ({{CRAWL_DUMP}}) from Fineweb-edu-fortified.
+It is intended to illustrate the contents of
+[fineweb-edu](https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu)
+and
+[fineweb-edu-fortified](https://huggingface.co/datasets/airtrain-ai/fineweb-edu-fortified).
+To explore Fineweb-edu-fortified further, you can view automatic clustering, embedding
+projections, and more for a 500k row sample using
+[this Airtrain dashboard](https://app.airtrain.ai/dataset/c232b33f-4f4a-49a7-ba55-8167a5f433da/null/1/0).
+The embeddings are the ones present in the dataset itself, and the same embedding model
+is used to embed your search phrase. The search returns the 15 rows whose embedding
+vectors are closest to the embedding of the search phrase.
+The search data is lazily loaded, so shortly after
+the space is launched it may not yet have the full corpus of text from that crawl available
+for search. Refer to 'Rows searched' to see how many rows were searched to retrieve the results.

app.py ADDED Viewed

	@@ -0,0 +1,174 @@

+import os
+import time
+from itertools import islice
+import shutil
+from threading import Thread
+import lancedb
+import gradio as gr
+import polars as pl
+from datasets import load_dataset
+from sentence_transformers import SentenceTransformer
+STYLE = """
+.gradio-container td span {
+    overflow: auto !important;
+}
+""".strip()
+#
+EMBEDDING_MODEL = SentenceTransformer("TaylorAI/bge-micro")
+MAX_N_ROWS = 3_000_000
+N_ROWS_BATCH = 5_000
+N_SEARCH_RESULTS = 15
+CRAWL_DUMP = "CC-MAIN-2020-05"
+DB = None
+DISPLAY_COLUMNS = [
+    "text",
+    "url",
+    "token_count",
+    "count",
+]
+DISPLAY_COLUMN_TYPES = [
+    "str",
+    "str",
+    "number",
+    "number",
+]
+DISPLAY_COLUMN_WIDTHS = [
+    "300px",
+    "100px",
+    "50px",
+    "25px",
+]
+def rename_embedding_column(row):
+    vector = row["embedding"]
+    row["vector"] = vector
+    del row["embedding"]
+    return row
+def read_header_markdown() -> str:
+    with open("./README.md", "r") as fp:
+        text = fp.read(-1)
+    # Get only the markdown following the HF metadata section.
+    text = text.split("\n---\n")[-1]
+    return text.replace("{{CRAWL_DUMP}}", CRAWL_DUMP)
+def db():
+    global DB
+    if DB is None:
+        DB = lancedb.connect("data")
+    return DB
+def load_data_sample():
+    time.sleep(5)
+    # remove any data that was already there; we want to replace it.
+    if os.path.exists("data"):
+        shutil.rmtree("data")
+    rows = load_dataset(
+        "airtrain-ai/fineweb-edu-fortified",
+        name=CRAWL_DUMP,
+        split="train",
+        streaming=True,
+    )
+    print("Loading data")
+    # at this point you could iterate over the rows.
+    # Here, we'll take a sample of rows with size
+    # MAX_N_ROWS. Using islice will load only the amount
+    # we asked for and no extras.
+    sample = islice(rows, MAX_N_ROWS)
+    table = None
+    n_rows_loaded = 0
+    while True:
+        batch = list(islice(sample, N_ROWS_BATCH))
+        if len(batch) == 0:
+            break
+        # We'll put it in a vector DB for easy vector search.
+        # rename "embedding" column to "vector"
+        data = [rename_embedding_column(row) for row in batch]
+        n_rows_loaded += len(data)
+        if table is None:
+            print("Creating table")
+            table = db().create_table("data", data=data)
+            # index the embedding column for fast search.
+            print("Indexing table")
+            table.create_index(num_sub_vectors=1)
+        else:
+            table.add(data)
+        print(f"Loaded {n_rows_loaded} rows")
+    print("Done loading data")
+def search(search_phrase: str) -> tuple[pl.DataFrame, int]:
+    while "data" not in db().table_names():
+        # Data is loaded asynchronously. Make sure there is at least
+        # some in the table before searching.
+        time.sleep(1)
+    # Create our search vector
+    embedding = EMBEDDING_MODEL.encode([search_phrase])[0]
+    # Search
+    table = db().open_table("data")
+    data_frame = table.search(embedding).limit(N_SEARCH_RESULTS).to_polars()
+    return (
+        # Return only what we want to display
+        data_frame.select(*[pl.col(c) for c in DISPLAY_COLUMNS]).to_pandas(),
+        table.count_rows(),
+    )
+with gr.Blocks(css=STYLE) as demo:
+    gr.HTML(f"<style>{STYLE}</style>")
+    with gr.Row():
+        gr.Markdown(read_header_markdown())
+    with gr.Row():
+        input_text = gr.Textbox(label="Search phrase", scale=100)
+        search_button = gr.Button("Search", scale=1, min_width=100)
+    with gr.Row():
+        rows_searched = gr.Number(
+            label="Rows searched",
+            show_label=True,
+        )
+    with gr.Row():
+        search_results = gr.DataFrame(
+            headers=DISPLAY_COLUMNS,
+            type="pandas",
+            datatype=DISPLAY_COLUMN_TYPES,
+            row_count=N_SEARCH_RESULTS,
+            col_count=(len(DISPLAY_COLUMNS), "fixed"),
+            column_widths=DISPLAY_COLUMN_WIDTHS,
+            elem_classes=".df-text-col",
+        )
+    search_button.click(
+        search,
+        [input_text],
+        [search_results, rows_searched],
+    )
+# Load data on another thread so the app can launch, and searches can run
+# against whatever has been ingested so far, before the whole sample is loaded.
+# daemon=True means the loader thread does not keep the process alive on exit.
+data_load_thread = Thread(target=load_data_sample, daemon=True)
+data_load_thread.start()
+print("Launching app")
+demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+datasets==2.20.0
+lancedb==0.12.0
+sentence-transformers==3.0.1
+polars==1.4.1