josh-sematic commited on
Commit
c736f27
Β·
1 Parent(s): e1e3da4
Files changed (4) hide show
  1. .gitignore +3 -0
  2. README.md +25 -7
  3. app.py +174 -0
  4. requirements.txt +4 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ .venv
2
+ *.csv
3
+ data
README.md CHANGED
@@ -1,13 +1,31 @@
1
  ---
2
- title: Fineweb Edu Fortified Search Demo
3
- emoji: πŸƒ
4
- colorFrom: indigo
5
- colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 4.41.0
8
  app_file: app.py
9
  pinned: false
10
- license: apache-2.0
 
 
 
 
11
  ---
 
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Fineweb-edu-fortified Semantic Search Demo
3
+ emoji: πŸ“š
 
 
4
  sdk: gradio
5
+ sdk_version: 4.31.5
6
  app_file: app.py
7
  pinned: false
8
+ datasets:
9
+ - airtrain-ai/fineweb-edu-fortified
10
+ - HuggingFaceFW/fineweb-edu
11
+ models:
12
+ - TaylorAI/bge-micro
13
  ---
14
+ # Semantic Search on Fineweb-edu-fortified sample
15
 
16
+ This demo performs semantic search over a single crawl ({{CRAWL_DUMP}}) from Fineweb-edu-fortified.
17
+ It is intended to illustrate the contents of
18
+ [fineweb-edu](https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu)
19
+ and
20
+ [fineweb-edu-fortified](https://huggingface.co/datasets/airtrain-ai/fineweb-edu-fortified).
21
+ To explore Fineweb-edu-fortified further, you can view automatic clustering, embedding
22
+ projections, and more for a 500k row sample using
23
+ [this Airtrain dashboard](https://app.airtrain.ai/dataset/c232b33f-4f4a-49a7-ba55-8167a5f433da/null/1/0).
24
+
25
+ The embeddings are the ones present in the dataset itself, and the same embedding model
26
+ is used to embed your search phrase. The search is performed using the 15 rows with the
27
+ closest embedding vectors to the embedding of the search phrase.
28
+
29
+ The search data is lazily loaded, so shortly after
30
+ the space is launched it may not yet have the full corpus of text from that crawl available
31
+ for search. Refer to 'Rows searched' to see how many rows were searched across to retrieve the results.
app.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ from itertools import islice
4
+ import shutil
5
+ from threading import Thread
6
+
7
+ import lancedb
8
+ import gradio as gr
9
+ import polars as pl
10
+ from datasets import load_dataset
11
+ from sentence_transformers import SentenceTransformer
12
+
13
+
14
# CSS tweak: let long text/URLs scroll inside result-table cells instead of
# overflowing the layout.
STYLE = """
.gradio-container td span {
overflow: auto !important;
}
""".strip()

# Model used to embed the user's search phrase. It must be the same model that
# produced the dataset's "embedding" column so query and corpus vectors are
# comparable.
EMBEDDING_MODEL = SentenceTransformer("TaylorAI/bge-micro")

# Upper bound on how many dataset rows are streamed into the vector store.
MAX_N_ROWS = 3_000_000
# Number of rows inserted into the vector DB per batch while loading.
N_ROWS_BATCH = 5_000
# Number of nearest-neighbor rows returned for each search.
N_SEARCH_RESULTS = 15
# Dataset config name: which Common Crawl dump to load and search.
CRAWL_DUMP = "CC-MAIN-2020-05"
# Lazily initialized LanceDB connection; access it via db().
DB = None
# Dataset columns shown in the results table, in display order.
DISPLAY_COLUMNS = [
    "text",
    "url",
    "token_count",
    "count",
]
# Gradio datatype for each entry of DISPLAY_COLUMNS (parallel list).
DISPLAY_COLUMN_TYPES = [
    "str",
    "str",
    "number",
    "number",
]
# Fixed pixel width for each entry of DISPLAY_COLUMNS (parallel list).
DISPLAY_COLUMN_WIDTHS = [
    "300px",
    "100px",
    "50px",
    "25px",
]
46
+
47
def rename_embedding_column(row):
    """Rename a row's "embedding" key to "vector", in place.

    LanceDB expects the vector column to be named "vector", while the dataset
    ships it as "embedding". Returns the same (mutated) dict for convenience.
    """
    row["vector"] = row.pop("embedding")
    return row
52
+
53
+
54
def read_header_markdown() -> str:
    """Return the README's markdown body with the crawl placeholder filled in.

    Hugging Face Spaces READMEs begin with a metadata block delimited by
    "---" lines; everything after the closing delimiter is the markdown we
    want to show as the app header.

    Returns:
        The README body with "{{CRAWL_DUMP}}" replaced by CRAWL_DUMP.
    """
    with open("./README.md", "r") as fp:
        text = fp.read()

    # Get only the markdown following the HF metadata section. Split on the
    # FIRST "\n---\n" only: the previous unbounded split + [-1] dropped body
    # content whenever the markdown itself contained a "---" horizontal rule.
    text = text.split("\n---\n", 1)[-1]
    return text.replace("{{CRAWL_DUMP}}", CRAWL_DUMP)
61
+
62
+
63
def db():
    """Return the shared LanceDB connection, creating it on first use.

    The handle is cached in the module-level DB global so the loader thread
    and the search handler use the same connection to the "data" directory.
    """
    global DB
    if DB is not None:
        return DB
    DB = lancedb.connect("data")
    return DB
68
+
69
def load_data_sample():
    """Stream a sample of the fineweb-edu-fortified crawl into LanceDB.

    Runs on a daemon thread (started at the bottom of the file) so the UI can
    serve searches against whatever has been loaded so far. Any previously
    persisted table under "data" is discarded and rebuilt from scratch.
    """
    # NOTE(review): grace period, presumably to let the Gradio app come up
    # before the heavy download starts — confirm it is actually needed.
    time.sleep(5)

    # remove any data that was already there; we want to replace it.
    if os.path.exists("data"):
        shutil.rmtree("data")

    # streaming=True yields rows incrementally instead of downloading the
    # whole config up front.
    rows = load_dataset(
        "airtrain-ai/fineweb-edu-fortified",
        name=CRAWL_DUMP,
        split="train",
        streaming=True,
    )

    print("Loading data")

    # at this point you could iterate over the rows.
    # Here, we'll take a sample of rows with size
    # MAX_N_ROWS. Using islice will load only the amount
    # we asked for and no extras.
    sample = islice(rows, MAX_N_ROWS)

    table = None
    n_rows_loaded = 0
    while True:
        # Pull the next batch off the stream; an empty batch means the crawl
        # (or the MAX_N_ROWS cap) is exhausted.
        batch = list(islice(sample, N_ROWS_BATCH))
        if len(batch) == 0:
            break

        # We'll put it in a vector DB for easy vector search.
        # rename "embedding" column to "vector"
        data = [rename_embedding_column(row) for row in batch]
        n_rows_loaded += len(data)

        if table is None:
            print("Creating table")
            table = db().create_table("data", data=data)

            # index the embedding column for fast search.
            # NOTE(review): the index is built from the first batch only and
            # later batches are appended via table.add() — confirm LanceDB
            # covers appended rows with this index as expected.
            print("Indexing table")
            table.create_index(num_sub_vectors=1)
        else:
            table.add(data)

        print(f"Loaded {n_rows_loaded} rows")
    print("Done loading data")
115
+
116
+
117
+
118
def search(search_phrase: str) -> tuple[pl.DataFrame, int]:
    """Return the dataset rows most similar to *search_phrase*.

    Blocks until the background loader has created the "data" table, then
    embeds the phrase with the same model used for the corpus and runs a
    nearest-neighbor query against the vector store.

    Returns:
        A pair of (results, n_rows): the display columns of the
        N_SEARCH_RESULTS closest rows, and the total number of rows that
        were available to search when the query ran.
    """
    # Data is loaded asynchronously on another thread; wait until the table
    # exists so there is at least something to search.
    while "data" not in db().table_names():
        time.sleep(1)

    # Embed the query phrase into the same vector space as the corpus.
    query_vector = EMBEDDING_MODEL.encode([search_phrase])[0]

    # Run the nearest-neighbor search.
    table = db().open_table("data")
    results = table.search(query_vector).limit(N_SEARCH_RESULTS).to_polars()

    # Keep only the columns we display; Gradio's DataFrame wants pandas.
    display = results.select(*[pl.col(c) for c in DISPLAY_COLUMNS]).to_pandas()
    return (display, table.count_rows())
136
+
137
+
138
+
139
# Build the Gradio UI: header markdown, search box + button, a counter for
# how many rows have been loaded so far, and the results table.
with gr.Blocks(css=STYLE) as demo:
    # NOTE(review): STYLE is applied both via css= above and this inline
    # <style> tag — presumably a workaround for cell styling; confirm both
    # are needed.
    gr.HTML(f"<style>{STYLE}</style>")
    with gr.Row():
        # App description rendered from the README body.
        gr.Markdown(read_header_markdown())
    with gr.Row():
        input_text = gr.Textbox(label="Search phrase", scale=100)
        search_button = gr.Button("Search", scale=1, min_width=100)
    with gr.Row():
        # Shows how much of the lazily loaded corpus was available when the
        # last search ran.
        rows_searched = gr.Number(
            label="Rows searched",
            show_label=True,
        )
    with gr.Row():
        search_results = gr.DataFrame(
            headers=DISPLAY_COLUMNS,
            type="pandas",
            datatype=DISPLAY_COLUMN_TYPES,
            row_count=N_SEARCH_RESULTS,
            col_count=(len(DISPLAY_COLUMNS), "fixed"),
            column_widths=DISPLAY_COLUMN_WIDTHS,
            elem_classes=".df-text-col",
        )
    # Wire the button: search(input_text) -> (search_results, rows_searched).
    search_button.click(
        search,
        [input_text],
        [search_results, rows_searched],
    )


# load data on another thread so we can start searching even before it's
# all loaded.
data_load_thread = Thread(target=load_data_sample, daemon=True)
data_load_thread.start()

print("Launching app")
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ datasets==2.20.0
2
+ lancedb==0.12.0
3
+ sentence-transformers==3.0.1
4
+ polars==1.4.1