Hu Xu
committed on
Commit
•
b94cb82
1
Parent(s):
51d9840
Add application file
Browse files- app.py +44 -0
- metaclip/__pycache__/balancing.cpython-310.pyc +0 -0
- metaclip/__pycache__/substr_matching.cpython-310.pyc +0 -0
- metaclip/balancing.py +17 -0
- metaclip/entry_counts_400m.json +0 -0
- metaclip/substr_matching.py +31 -0
- metadata.json +0 -0
app.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
|
3 |
+
|
4 |
+
# Per-entry counts as a NumPy array; stays None until init_demo() fills it.
entry_count = None
|
5 |
+
# Parsed content of metadata.json; stays None until init_demo() fills it.
metadata = None
|
6 |
+
|
7 |
+
def init_demo():
    """Load the metadata entries and their counts into module globals.

    Reads ``metadata.json`` and ``metaclip/entry_counts_400m.json`` (both
    relative to the current working directory) and sets:

    * ``metadata`` -- the parsed content of ``metadata.json``.
    * ``entry_count`` -- a ``uint64`` NumPy array holding one count per
      entry of ``metadata``, in iteration order.

    Raises:
        FileNotFoundError: if either JSON file does not exist.
        KeyError: if an entry of ``metadata`` has no count in the counts
            file (presumably a JSON object keyed by entry -- verify).
    """
    import json
    import numpy as np

    global metadata, entry_count

    # Decode explicitly as UTF-8 so that non-ASCII entries load the same way
    # regardless of the platform's default locale encoding.
    with open("metadata.json", encoding="utf-8") as f:
        metadata = json.load(f)

    # entry counts for our 1.6B(pool) -> 400M(curated); please check
    # balance_sampling:main and substr match and count on your own data.
    with open("metaclip/entry_counts_400m.json", encoding="utf-8") as f:
        entry_count_json = json.load(f)

    entry_count = np.array(
        [entry_count_json[entry] for entry in metadata],
        dtype=np.uint64,  # uint64 to be safe for scaling.
    )
|
20 |
+
|
21 |
+
|
22 |
+
def curation(text):
    """Run substring matching and one balanced-sampling draw on ``text``.

    Uses the module globals ``metadata`` and ``entry_count`` that
    ``init_demo()`` populates, so ``init_demo()`` must have run first.

    Args:
        text: the input string typed into the demo.

    Returns:
        A string of the form ``"curation_prob=<p>, curated=<bool>"`` where
        ``<p>`` is the sum of the matched entries' probabilities capped at
        1.0, and ``<bool>`` is the outcome of a single random
        ``balance_sampling`` draw (so it can differ between calls).
    """
    import sys

    # Make the local ``metaclip`` package importable, but register the path
    # only once instead of growing ``sys.path`` on every request.
    if "./" not in sys.path:
        sys.path.append("./")
    from metaclip.substr_matching import substr_matching
    from metaclip.balancing import balance_sampling

    t = 20000  # TODO: make this part of the UI

    # Clamp a copy: raising counts below ``t`` in the shared global would be
    # irreversible and would leak one request's threshold into the next
    # once ``t`` becomes configurable.
    clamped_count = entry_count.copy()
    clamped_count[clamped_count < t] = t
    entry_prob = t / clamped_count

    matched_entry_ids = substr_matching(text, metadata)
    # NOTE(review): the capped sum is an upper bound on the chance that
    # balance_sampling returns True, not the exact value -- confirm that
    # this is the quantity intended for display.
    curation_prob = min(entry_prob[matched_entry_ids].sum(), 1.0)
    curated = balance_sampling(matched_entry_ids, entry_prob)

    return f"curation_prob={curation_prob:.3f}, curated={curated}"
|
37 |
+
|
38 |
+
|
39 |
+
# Load metadata and entry counts once, when the module is imported, so that
# every call to `curation` finds the globals already populated.
init_demo()

# Text-in / text-out interface around `curation`.
demo = gr.Interface(
    fn=curation,
    inputs="text",
    outputs="text",
)

if __name__ == "__main__":
    # Only start the app when this file is executed as a script.
    demo.launch(show_api=False)
|
metaclip/__pycache__/balancing.cpython-310.pyc
ADDED
Binary file (445 Bytes). View file
|
|
metaclip/__pycache__/substr_matching.cpython-310.pyc
ADDED
Binary file (798 Bytes). View file
|
|
metaclip/balancing.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates
|
2 |
+
|
3 |
+
import json
|
4 |
+
import numpy as np
|
5 |
+
import os
|
6 |
+
import random
|
7 |
+
|
8 |
+
from tqdm import tqdm
|
9 |
+
|
10 |
+
|
11 |
+
def balance_sampling(matched_entry_ids, entry_prob):
    """Randomly decide whether a text with the given matches is kept.

    The matched ids are visited in order. Each one takes a fresh
    ``random.random()`` draw, and the text is kept as soon as a draw falls
    below ``entry_prob`` for that id; no further draws are made after that.

    Args:
        matched_entry_ids: iterable of ids used to index ``entry_prob``.
        entry_prob: indexable collection of per-entry probabilities.

    Returns:
        True if any matched entry was sampled, False otherwise (including
        when there are no matched entries).
    """
    # this can be placed in a pipeline or on-the-fly in a data loader.
    # see a numpy impl. at metaclip.indexing.balance_sampling.balance_sampling
    return any(
        random.random() < entry_prob[matched_id]
        for matched_id in matched_entry_ids
    )
|
metaclip/entry_counts_400m.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
metaclip/substr_matching.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates
|
2 |
+
|
3 |
+
|
4 |
+
# Cache of the metadata entries, each padded with one space on both sides;
# None until substr_matching() builds it.
spaced_metadata = None
|
5 |
+
|
6 |
+
def spacing(text):
    """Pad ``text`` so that punctuation is delimited by spaces.

    The text is wrapped in one leading and one trailing space, each of
    ``, . ; : ? ! ``` is surrounded by a space on both sides, and tabs,
    newlines and carriage returns are turned into single spaces.

    Args:
        text: the string to pad.

    Returns:
        The padded string.
    """
    # One translation table covers both rules. Every rule maps a single
    # character to text made of that character and spaces only, so a single
    # pass gives the same result as applying the rules one after another.
    table = {ord(punct): f" {punct} " for punct in ",.;:?!`"}
    table.update({ord(blank): " " for blank in "\t\n\r"})
    return f" {text} ".translate(table)
|
18 |
+
|
19 |
+
|
20 |
+
def substr_matching(text, metadata):
    """Return the positions of the metadata entries that occur in ``text``.

    Each entry is padded with one space on both sides and looked up as a
    substring of ``spacing(text)``, so an entry only matches when it is
    delimited by spaces in the padded text.

    The padded entries are cached in the module global ``spaced_metadata``.
    The cache is rebuilt whenever a different ``metadata`` object is passed;
    previously the entries from the very first call were reused for every
    later ``metadata``. Mutating the same ``metadata`` object in place is
    not detected.

    Args:
        text: the string to search in.
        metadata: iterable of entry strings.

    Returns:
        List of 0-based positions (in iteration order of ``metadata``) of
        the entries found in ``text``.
    """
    global spaced_metadata

    # Remember which object the cache was built from; keeping a reference
    # (rather than an id()) rules out false hits after garbage collection.
    cached_for = getattr(substr_matching, "_cached_for", None)
    if spaced_metadata is None or cached_for is not metadata:
        spaced_metadata = [f" {entry} " for entry in metadata]
        substr_matching._cached_for = metadata

    text = spacing(text)
    return [
        entry_id
        for entry_id, entry in enumerate(spaced_metadata)
        if entry in text
    ]
|
metadata.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|