Hu Xu committed on
Commit
b94cb82
1 Parent(s): 51d9840

Add application file

Browse files
app.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+
4
# Module-level state; both names are filled in by init_demo().
metadata = None  # parsed content of metadata.json
entry_count = None  # per-entry counts, in the same order as `metadata`
6
+
7
def init_demo():
    """Load the metadata entries and their counts into module globals.

    Reads ``metadata.json`` and ``metaclip/entry_counts_400m.json``
    (both relative to the current working directory) and sets:

    * ``metadata``: the parsed content of ``metadata.json``.
    * ``entry_count``: a ``uint64`` numpy array holding one count per
      metadata entry, in the same order as ``metadata``.

    Raises:
        OSError: if either file cannot be opened.
        KeyError: if a metadata entry has no count in the counts file.
    """
    import json
    import numpy as np

    global metadata, entry_count

    # JSON text is UTF-8; do not depend on the platform's locale encoding,
    # which would mis-decode non-ASCII entries on some systems.
    with open("metadata.json", encoding="utf-8") as f:
        metadata = json.load(f)

    # entry counts for our 1.6B(pool) -> 400M(curated); please check
    # balance_sampling:main and substr match and count on your own data.
    with open("metaclip/entry_counts_400m.json", encoding="utf-8") as f:
        entry_count_json = json.load(f)

    entry_count = np.array(
        [entry_count_json[entry] for entry in metadata],
        dtype=np.uint64,  # uint64 to be safe for scaling.
    )
20
+
21
+
22
def curation(text):
    """Report how the balancing step would treat ``text``.

    Matches ``text`` against the global ``metadata`` entries, turns the
    global ``entry_count`` into per-entry sampling probabilities
    (``t / max(count, t)``, so entries with at most ``t`` occurrences get
    probability 1.0), and runs one balance-sampling draw.

    Args:
        text: the input text to check.

    Returns:
        A string ``"curation_prob=<p>, curated=<bool>"`` where ``<p>`` is the
        sum of the matched entries' probabilities clipped to 1.0 and
        ``<bool>`` is the outcome of a single ``balance_sampling`` draw.
    """
    import sys

    # Make the local ``metaclip`` package importable. This handler runs on
    # every request, so only add the path once instead of growing sys.path.
    if "./" not in sys.path:
        sys.path.append("./")
    from metaclip.substr_matching import substr_matching
    from metaclip.balancing import balance_sampling

    t = 20000  # TODO: make this part of the UI
    # Clamp into a fresh array: the shared global counts must stay intact,
    # otherwise a larger ``t`` from an earlier call would stick forever.
    clamped_count = entry_count.clip(min=t)
    entry_prob = t / clamped_count

    matched_entry_ids = substr_matching(text, metadata)
    curation_prob = min(entry_prob[matched_entry_ids].sum(), 1.0)
    curated = balance_sampling(matched_entry_ids, entry_prob)

    return f"curation_prob={curation_prob:.3f}, curated={curated}"
37
+
38
+
39
# Populate the module-level metadata and counts before the UI is built.
init_demo()

# Text-in / text-out interface backed by the curation handler.
demo = gr.Interface(
    fn=curation,
    inputs="text",
    outputs="text",
)

if __name__ == "__main__":
    demo.launch(show_api=False)
metaclip/__pycache__/balancing.cpython-310.pyc ADDED
Binary file (445 Bytes). View file
 
metaclip/__pycache__/substr_matching.cpython-310.pyc ADDED
Binary file (798 Bytes). View file
 
metaclip/balancing.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+
3
+ import json
4
+ import numpy as np
5
+ import os
6
+ import random
7
+
8
+ from tqdm import tqdm
9
+
10
+
11
def balance_sampling(matched_entry_ids, entry_prob):
    """Decide by random draw whether a text with these matches is kept.

    One random number is drawn per matched entry, in order; the first draw
    that falls below that entry's probability ends the search.

    Args:
        matched_entry_ids: indices into ``entry_prob`` of the matched entries.
        entry_prob: per-entry sampling probabilities, indexable by entry id.

    Returns:
        True as soon as one draw succeeds, False if none does (including
        when there are no matched entries).
    """
    # this can be placed in a pipeline or on-the-fly in a data loader.
    # see a numpy impl. at metaclip.indexing.balance_sampling.balance_sampling
    return any(
        random.random() < entry_prob[matched_id]
        for matched_id in matched_entry_ids
    )
metaclip/entry_counts_400m.json ADDED
The diff for this file is too large to render. See raw diff
 
metaclip/substr_matching.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+
3
+
4
+ spaced_metadata = None
5
+
6
def spacing(text):
    """Pad ``text`` so words and punctuation are separated by spaces.

    The text is wrapped in one leading and one trailing space, each of
    ``, . ; : ? ! `` ` `` is surrounded by spaces, and every tab, newline
    and carriage return becomes a single space.

    Args:
        text: the text to pad.

    Returns:
        The padded string.
    """
    wrapped_puncts = ",.;:?!`"
    blank_chars = "\t\n\r"

    # One translation table covers both rules; the replacement text only
    # adds spaces, so a single pass gives the same result as doing the
    # substitutions one after another.
    table = {ord(punct): f" {punct} " for punct in wrapped_puncts}
    table.update({ord(blank): " " for blank in blank_chars})
    return f" {text} ".translate(table)
18
+
19
+
20
def substr_matching(text, metadata):
    """Return the ids of the metadata entries that occur in ``text``.

    An entry matches when it appears in the space-padded text (see
    ``spacing``) surrounded by spaces, i.e. ``f" {entry} "`` is a substring.

    The space-padded entries are cached in the module-level
    ``spaced_metadata``. The cache is rebuilt whenever a different
    ``metadata`` object is passed, so results never come from a stale list.
    In-place edits to the same list object are not detected.

    Args:
        text: the text to search.
        metadata: iterable of entry strings; an entry's id is its position.

    Returns:
        List of matching entry ids in ascending order.
    """
    global spaced_metadata
    # Remember which metadata object the cache was built from; the original
    # cache was built once and then reused for any later metadata argument.
    cached_source = getattr(substr_matching, "_cached_source", None)
    if spaced_metadata is None or cached_source is not metadata:
        spaced_metadata = [f" {entry} " for entry in metadata]
        substr_matching._cached_source = metadata

    text = spacing(text)
    return [
        entry_id
        for entry_id, entry in enumerate(spaced_metadata)
        if entry in text
    ]
metadata.json ADDED
The diff for this file is too large to render. See raw diff