Spaces:
Running
Running
DmitriiKhizbullin
committed on
Commit
·
b25fb44
1
Parent(s):
a59a803
Sync with the main repo
Browse files- apps/common/auto_zip.py +53 -0
- apps/data_explorer/data_explorer.py +16 -2
- apps/data_explorer/downloader.py +15 -1
- apps/data_explorer/loader.py +24 -35
- sync.sh +15 -0
apps/common/auto_zip.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
|
2 |
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
3 |
+
# you may not use this file except in compliance with the License.
|
4 |
+
# You may obtain a copy of the License at
|
5 |
+
#
|
6 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7 |
+
#
|
8 |
+
# Unless required by applicable law or agreed to in writing, software
|
9 |
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
10 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
11 |
+
# See the License for the specific language governing permissions and
|
12 |
+
# limitations under the License.
|
13 |
+
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
|
14 |
+
import json
|
15 |
+
import os
|
16 |
+
import zipfile
|
17 |
+
|
18 |
+
|
19 |
+
class AutoZip:
    """Read-only access to the members of a zip archive that share an extension.

    The archive is opened once in the constructor and stays open until
    ``close()`` is called (or the ``with`` block using this object exits).
    Members can be consumed in two ways:

    * iterating over the object yields every matching member parsed as JSON,
      in the order the members are listed in the archive;
    * ``as_dict()`` returns the raw decoded text of every matching member.

    The object is its own iterator: ``iter(obj)`` rewinds the single shared
    cursor and returns ``obj``, so two loops over the same instance cannot be
    interleaved.

    Args:
        zip_path: Path of the zip archive to open for reading.
        ext: Only members whose file name ends with this suffix are exposed.
    """

    def __init__(self, zip_path: str, ext: str = ".json"):
        self.zip_path = zip_path
        self.zip = zipfile.ZipFile(zip_path, "r")
        self.fl = [f for f in self.zip.filelist if f.filename.endswith(ext)]
        # Start the cursor at the first member so that next() also works on
        # an instance that iter() was never called on.
        self.index = 0

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying archive handle."""
        self.zip.close()

    def __next__(self):
        """Return the next matching member parsed as JSON.

        Raises:
            StopIteration: When every matching member has been returned.
        """
        if self.index >= len(self.fl):
            raise StopIteration
        finfo = self.fl[self.index]
        with self.zip.open(finfo) as f:
            raw_json = json.loads(f.read().decode("utf-8"))
        # Advance only after a successful read and parse.
        self.index += 1
        return raw_json

    def __len__(self):
        """Return the number of members that match the extension."""
        return len(self.fl)

    def __iter__(self):
        """Rewind to the first matching member and return this object."""
        self.index = 0
        return self

    def as_dict(self, include_zip_name: bool = False):
        """Return the decoded text of every matching member, keyed by name.

        Args:
            include_zip_name: When true, each key is prefixed with the base
                name of the archive and a ``/``; otherwise the key is the
                member's file name inside the archive.

        Returns:
            dict: Maps each key to the member's content decoded as UTF-8.
        """
        prefix = os.path.basename(self.zip_path) + "/" if include_zip_name else ""
        d = dict()
        for finfo in self.fl:
            with self.zip.open(finfo) as f:
                d[prefix + finfo.filename] = f.read().decode("utf-8")
        return d
|
apps/data_explorer/data_explorer.py
CHANGED
@@ -1,3 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
"""
|
2 |
Gradio-based web UI to explore the Camel dataset.
|
3 |
"""
|
@@ -39,7 +52,8 @@ def parse_arguments():
|
|
39 |
return args
|
40 |
|
41 |
|
42 |
-
def construct_ui(blocks, datasets: Datasets,
|
|
|
43 |
""" Build Gradio UI and populate with chat data from JSONs.
|
44 |
|
45 |
Args:
|
@@ -213,7 +227,7 @@ def construct_ui(blocks, datasets: Datasets, default_dataset: str = None):
|
|
213 |
Returns:
|
214 |
List[Tuple]: Chat history in chatbot UI element format.
|
215 |
"""
|
216 |
-
history = []
|
217 |
curr_qa = (None, None)
|
218 |
for k in sorted(messages.keys()):
|
219 |
msg = messages[k]
|
|
|
1 |
+
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
|
2 |
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
3 |
+
# you may not use this file except in compliance with the License.
|
4 |
+
# You may obtain a copy of the License at
|
5 |
+
#
|
6 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7 |
+
#
|
8 |
+
# Unless required by applicable law or agreed to in writing, software
|
9 |
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
10 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
11 |
+
# See the License for the specific language governing permissions and
|
12 |
+
# limitations under the License.
|
13 |
+
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
|
14 |
"""
|
15 |
Gradio-based web UI to explore the Camel dataset.
|
16 |
"""
|
|
|
52 |
return args
|
53 |
|
54 |
|
55 |
+
def construct_ui(blocks, datasets: Datasets,
|
56 |
+
default_dataset: Optional[str] = None):
|
57 |
""" Build Gradio UI and populate with chat data from JSONs.
|
58 |
|
59 |
Args:
|
|
|
227 |
Returns:
|
228 |
List[Tuple]: Chat history in chatbot UI element format.
|
229 |
"""
|
230 |
+
history: List[Tuple] = []
|
231 |
curr_qa = (None, None)
|
232 |
for k in sorted(messages.keys()):
|
233 |
msg = messages[k]
|
apps/data_explorer/downloader.py
CHANGED
@@ -1,7 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
import urllib.request
|
3 |
|
4 |
from huggingface_hub import hf_hub_download
|
|
|
5 |
|
6 |
REPO_ROOT = os.path.realpath(
|
7 |
os.path.join(os.path.dirname(os.path.abspath(__file__)), "../.."))
|
@@ -23,7 +37,7 @@ def download_data():
|
|
23 |
hf_hub_download(repo_id="camel-ai/code", repo_type="dataset",
|
24 |
filename="code_chat.zip", local_dir=data_dir,
|
25 |
local_dir_use_symlinks=False)
|
26 |
-
except:
|
27 |
for name in ("ai_society_chat.zip", "code_chat.zip"):
|
28 |
data_url = ("https://storage.googleapis.com/"
|
29 |
f"camel-bucket/datasets/private/{name}")
|
|
|
1 |
+
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
|
2 |
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
3 |
+
# you may not use this file except in compliance with the License.
|
4 |
+
# You may obtain a copy of the License at
|
5 |
+
#
|
6 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7 |
+
#
|
8 |
+
# Unless required by applicable law or agreed to in writing, software
|
9 |
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
10 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
11 |
+
# See the License for the specific language governing permissions and
|
12 |
+
# limitations under the License.
|
13 |
+
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
|
14 |
import os
|
15 |
import urllib.request
|
16 |
|
17 |
from huggingface_hub import hf_hub_download
|
18 |
+
from huggingface_hub.utils._errors import RepositoryNotFoundError
|
19 |
|
20 |
REPO_ROOT = os.path.realpath(
|
21 |
os.path.join(os.path.dirname(os.path.abspath(__file__)), "../.."))
|
|
|
37 |
hf_hub_download(repo_id="camel-ai/code", repo_type="dataset",
|
38 |
filename="code_chat.zip", local_dir=data_dir,
|
39 |
local_dir_use_symlinks=False)
|
40 |
+
except RepositoryNotFoundError:
|
41 |
for name in ("ai_society_chat.zip", "code_chat.zip"):
|
42 |
data_url = ("https://storage.googleapis.com/"
|
43 |
f"camel-bucket/datasets/private/{name}")
|
apps/data_explorer/loader.py
CHANGED
@@ -1,16 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
"""
|
2 |
Everything related to parsing the data JSONs into UI-compatible format.
|
3 |
"""
|
4 |
|
5 |
import glob
|
6 |
-
import json
|
7 |
import os
|
8 |
import re
|
9 |
-
import
|
10 |
-
from typing import Any, Dict, List, Optional, Tuple, Union
|
11 |
|
12 |
from tqdm import tqdm
|
13 |
|
|
|
|
|
14 |
ChatHistory = Dict[str, Any]
|
15 |
ParsedChatHistory = Dict[str, Any]
|
16 |
AllChats = Dict[str, Any]
|
@@ -20,30 +33,6 @@ REPO_ROOT = os.path.realpath(
|
|
20 |
os.path.join(os.path.dirname(os.path.abspath(__file__)), "../.."))
|
21 |
|
22 |
|
23 |
-
class AutoZip:
|
24 |
-
def __init__(self, zip_path: str, ext: str = ".json"):
|
25 |
-
self.zip_path = zip_path
|
26 |
-
self.zip = zipfile.ZipFile(zip_path, "r")
|
27 |
-
self.fl = [f for f in self.zip.filelist if f.filename.endswith(ext)]
|
28 |
-
|
29 |
-
def __next__(self):
|
30 |
-
if self.index >= len(self.fl):
|
31 |
-
raise StopIteration
|
32 |
-
else:
|
33 |
-
finfo = self.fl[self.index]
|
34 |
-
with self.zip.open(finfo) as f:
|
35 |
-
raw_json = json.loads(f.read().decode("utf-8"))
|
36 |
-
self.index += 1
|
37 |
-
return raw_json
|
38 |
-
|
39 |
-
def __len__(self):
|
40 |
-
return len(self.fl)
|
41 |
-
|
42 |
-
def __iter__(self):
|
43 |
-
self.index = 0
|
44 |
-
return self
|
45 |
-
|
46 |
-
|
47 |
def parse(raw_chat: ChatHistory) -> Union[ParsedChatHistory, None]:
|
48 |
""" Gets the JSON raw chat data, validates it and transforms
|
49 |
into an easy to work with form.
|
@@ -122,17 +111,17 @@ def load_zip(zip_path: str) -> AllChats:
|
|
122 |
continue
|
123 |
parsed_list.append(parsed)
|
124 |
|
125 |
-
|
126 |
-
|
127 |
for parsed in parsed_list:
|
128 |
-
|
129 |
-
|
130 |
-
assistant_roles = list(sorted(
|
131 |
-
user_roles = list(sorted(
|
132 |
-
matrix: Dict[Tuple[str, str],
|
133 |
for parsed in parsed_list:
|
134 |
key = (parsed['assistant_role'], parsed['user_role'])
|
135 |
-
original_task = parsed['original_task']
|
136 |
new_item = {
|
137 |
k: v
|
138 |
for k, v in parsed.items()
|
|
|
1 |
+
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
|
2 |
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
3 |
+
# you may not use this file except in compliance with the License.
|
4 |
+
# You may obtain a copy of the License at
|
5 |
+
#
|
6 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7 |
+
#
|
8 |
+
# Unless required by applicable law or agreed to in writing, software
|
9 |
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
10 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
11 |
+
# See the License for the specific language governing permissions and
|
12 |
+
# limitations under the License.
|
13 |
+
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
|
14 |
"""
|
15 |
Everything related to parsing the data JSONs into UI-compatible format.
|
16 |
"""
|
17 |
|
18 |
import glob
|
|
|
19 |
import os
|
20 |
import re
|
21 |
+
from typing import Any, Dict, Optional, Tuple, Union
|
|
|
22 |
|
23 |
from tqdm import tqdm
|
24 |
|
25 |
+
from apps.common.auto_zip import AutoZip
|
26 |
+
|
27 |
ChatHistory = Dict[str, Any]
|
28 |
ParsedChatHistory = Dict[str, Any]
|
29 |
AllChats = Dict[str, Any]
|
|
|
33 |
os.path.join(os.path.dirname(os.path.abspath(__file__)), "../.."))
|
34 |
|
35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
def parse(raw_chat: ChatHistory) -> Union[ParsedChatHistory, None]:
|
37 |
""" Gets the JSON raw chat data, validates it and transforms
|
38 |
into an easy to work with form.
|
|
|
111 |
continue
|
112 |
parsed_list.append(parsed)
|
113 |
|
114 |
+
assistant_roles_set = set()
|
115 |
+
user_roles_set = set()
|
116 |
for parsed in parsed_list:
|
117 |
+
assistant_roles_set.add(parsed['assistant_role'])
|
118 |
+
user_roles_set.add(parsed['user_role'])
|
119 |
+
assistant_roles = list(sorted(assistant_roles_set))
|
120 |
+
user_roles = list(sorted(user_roles_set))
|
121 |
+
matrix: Dict[Tuple[str, str], Dict[str, Dict]] = dict()
|
122 |
for parsed in parsed_list:
|
123 |
key = (parsed['assistant_role'], parsed['user_role'])
|
124 |
+
original_task: str = parsed['original_task']
|
125 |
new_item = {
|
126 |
k: v
|
127 |
for k, v in parsed.items()
|
sync.sh
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copy the Python sources of apps/data_explorer and apps/common from the
# hf_spaces_2 branch of the upstream camel repository into the current
# directory, preserving their relative paths. Paths containing "test" are
# skipped. Run this from the root of the target repository.

# Stop at the first failing command or unset variable, so a failed clone or
# cd can never be followed by a copy or a delete in the wrong place.
set -eu

TMP_DIR=/tmp/camel_hf_tmp
echo "$TMP_DIR"
HF_REPO_DIR="$(realpath .)"
echo "$HF_REPO_DIR"

# git refuses to clone into a non-empty directory, so clear anything a
# previously interrupted run left behind.
rm -rf "$TMP_DIR"
mkdir -p "$TMP_DIR"
# Remove the scratch checkout on every exit path, including failures.
trap 'rm -rf "$TMP_DIR"' EXIT

git clone -b hf_spaces_2 https://github.com/lightaime/camel.git "$TMP_DIR"
cd "$TMP_DIR"

# rsync -R recreates each file's relative path under the destination.
# NUL-delimited names keep paths with spaces intact.
for src_dir in apps/data_explorer apps/common; do
    find "$src_dir" -name "*.py" ! -path "*test*" -print0 |
        xargs -0 -I {} rsync -R {} "$HF_REPO_DIR"
done

echo Done
|