Update app.py
app.py
CHANGED
@@ -993,27 +993,75 @@ def refresh_data():
 
 
 target_datasets = {
-    "aiqtech/kolaw": "https://huggingface.co/datasets/aiqtech/kolaw"
-
-    "nlpai-lab/kowiki": "https://huggingface.co/datasets/nlpai-lab/kowiki",
-    "KETI-AIR/korpora": "https://huggingface.co/datasets/KETI-AIR/korpora",
-    "heegyu/korean-parallel-corpora": "https://huggingface.co/datasets/heegyu/korean-parallel-corpora",
-    "heegyu/korean-hate-speech": "https://huggingface.co/datasets/heegyu/korean-hate-speech",
-    "KETI-AIR/korean-parallel-corpora": "https://huggingface.co/datasets/KETI-AIR/korean-parallel-corpora",
-    "heegyu/korean-chatbot-data": "https://huggingface.co/datasets/heegyu/korean-chatbot-data",
-    "heegyu/korean-qa": "https://huggingface.co/datasets/heegyu/korean-qa",
-    "heegyu/korean-summarization": "https://huggingface.co/datasets/heegyu/korean-summarization",
-    "nlpai-lab/kullm-chat-v2": "https://huggingface.co/datasets/nlpai-lab/kullm-chat-v2",
-    "upstage/open-ko-llm-leaderboard": "https://huggingface.co/datasets/upstage/open-ko-llm-leaderboard"
+    "aiqtech/kolaw": "https://huggingface.co/datasets/aiqtech/kolaw"
+    # add more datasets here as needed
 }
 
+def get_korea_datasets():
+    """Search for Korea-related datasets."""
+    params = {
+        "search": "korea",
+        "full": "True",
+        "limit": 1000
+    }
+
+    try:
+        response = requests.get(
+            "https://huggingface.co/api/datasets",
+            headers={'Authorization': f'Bearer {HF_TOKEN}'},
+            params=params
+        )
+
+        if response.status_code == 200:
+            return response.json()
+        else:
+            print(f"Failed to fetch Korea datasets: {response.status_code}")
+            return []
+    except Exception as e:
+        print(f"Error fetching Korea datasets: {str(e)}")
+        return []
+
+def get_all_datasets(limit=3000):
+    """Fetch all datasets plus the Korea-related datasets."""
+    all_datasets = []
+    page_size = 1000
+
+    for offset in range(0, limit, page_size):
+        params = {
+            'limit': min(page_size, limit - offset),
+            'full': 'True',
+            'offset': offset
+        }
+
+        response = requests.get(
+            "https://huggingface.co/api/datasets",
+            headers={'Authorization': f'Bearer {HF_TOKEN}'},
+            params=params
+        )
+
+        if response.status_code == 200:
+            all_datasets.extend(response.json())
+            print(f"Fetched datasets {offset+1} to {offset+len(response.json())}")
+        else:
+            print(f"Failed to fetch datasets at offset {offset}: {response.status_code}")
+            break
+
+    # add the Korea search results, de-duplicating by dataset id
+    korea_datasets = get_korea_datasets()
+    existing_ids = {dataset.get('id', '') for dataset in all_datasets}
+
+    for korea_dataset in korea_datasets:
+        if korea_dataset.get('id', '') not in existing_ids:
+            all_datasets.append(korea_dataset)
+            existing_ids.add(korea_dataset.get('id', ''))
+
+    return all_datasets[:limit]
+
 def get_datasets_data(progress=gr.Progress()):
     def calculate_rank(dataset_id, all_global_datasets, korea_datasets):
-        # check the global rank
         global_rank = next((idx for idx, d in enumerate(all_global_datasets, 1)
                             if d.get('id', '').strip() == dataset_id.strip()), None)
 
-        # whether this is a Korea dataset
         is_korea = any(d.get('id', '').strip() == dataset_id.strip() for d in korea_datasets)
 
         if is_korea:
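Note on the ranking logic above: calculate_rank treats a dataset's 1-based position in the list returned by get_all_datasets as its global rank, so ranks are only meaningful relative to the ordering the /api/datasets endpoint returns and the 3000-item window being fetched. A minimal sketch of the idea, with made-up data that is not part of app.py:

    all_global = [{"id": "a/x"}, {"id": "aiqtech/kolaw"}, {"id": "b/y"}]
    korea = [{"id": "aiqtech/kolaw"}]

    dataset_id = "aiqtech/kolaw"
    rank = next((idx for idx, d in enumerate(all_global, 1)
                 if d.get('id', '').strip() == dataset_id.strip()), None)
    is_korea = any(d.get('id', '').strip() == dataset_id.strip() for d in korea)
    print(rank, is_korea)  # -> 2 True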
@@ -1039,37 +1087,9 @@ def get_datasets_data(progress=gr.Progress()):
         empty_df = pd.DataFrame(columns=['Global Rank', 'Dataset ID', 'Title', 'Downloads', 'Likes', 'Korea Search', 'URL'])
         return fig, error_html, empty_df
 
-
-        params = {
-            'limit': 3000,
-            'full': 'true'
-        }
-
-        all_datasets_response = requests.get(
-            "https://huggingface.co/api/datasets",
-            headers={'Authorization': f'Bearer {HF_TOKEN}'},
-            params=params
-        )
-
-        korea_params = {
-            'search': 'korea',
-            'limit': 3000,
-            'full': 'true'
-        }
+        all_global_datasets = get_all_datasets(limit=3000)
+        korea_datasets = get_korea_datasets()
 
-        korea_datasets_response = requests.get(
-            "https://huggingface.co/api/datasets",
-            headers={'Authorization': f'Bearer {HF_TOKEN}'},
-            params=korea_params
-        )
-
-        all_global_datasets = all_datasets_response.json()
-        korea_datasets = korea_datasets_response.json()
-
-        # create a Figure for the visualization
-        fig = go.Figure()
-
-        # collect rank information
         filtered_datasets = []
         for dataset_id in target_datasets.keys():
            try:
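The deleted block called .json() on the responses without checking response.status_code, so a failed or rate-limited request could raise on a non-JSON body; get_all_datasets and get_korea_datasets instead return [] on any failure, so the loop below always receives lists. The same guarded-fetch pattern in isolation (fetch_json is a hypothetical name used only in this sketch, and the timeout is added here for illustration):

    import requests

    def fetch_json(url, token, params):
        # parsed JSON on HTTP 200, empty list on any failure
        try:
            resp = requests.get(url, headers={'Authorization': f'Bearer {token}'},
                                params=params, timeout=30)
            return resp.json() if resp.status_code == 200 else []
        except requests.RequestException as err:
            print(f"Request failed: {err}")
            return []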
@@ -1092,8 +1112,6 @@ def get_datasets_data(progress=gr.Progress()):
                        'title': dataset_data.get('title', 'No Title'),
                        'is_korea': is_korea
                    })
-
-                    print(f"Dataset {dataset_id}: Rank={rank}, Is Korea={is_korea}")
                else:
                    filtered_datasets.append({
                        'id': dataset_id,
@@ -1107,12 +1125,13 @@ def get_datasets_data(progress=gr.Progress()):
                print(f"Error processing {dataset_id}: {str(e)}")
                continue
 
-        # sort by rank
        filtered_datasets.sort(key=lambda x: float('inf') if isinstance(x['global_rank'], str) else x['global_rank'])
 
        # prepare the visualization data
        valid_datasets = [d for d in filtered_datasets if isinstance(d['global_rank'], (int, float))]
 
+        fig = go.Figure()
+
        if valid_datasets:
            ids = [d['id'] for d in valid_datasets]
            ranks = [d['global_rank'] for d in valid_datasets]
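The sort key above pushes unranked entries to the end: any dataset whose global_rank is a string placeholder compares as float('inf') and therefore sorts after every numeric rank. A quick check (the placeholder text here is illustrative; the actual string is set elsewhere in app.py and is not visible in this hunk):

    items = [{'global_rank': 42}, {'global_rank': 'N/A'}, {'global_rank': 7}]
    items.sort(key=lambda x: float('inf')
               if isinstance(x['global_rank'], str) else x['global_rank'])
    print([i['global_rank'] for i in items])  # -> [7, 42, 'N/A']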
@@ -1120,10 +1139,12 @@ def get_datasets_data(progress=gr.Progress()):
            fig.add_trace(go.Bar(
                x=ids,
                y=[3001 - r for r in ranks],
-                text=[f"Rank: #{r}<br>
+                text=[f"Rank: #{r}<br>{'🇰🇷 Korea Dataset<br>' if d['is_korea'] else ''}"
+                      f"Downloads: {format(d['downloads'], ',')}<br>"
+                      f"Likes: {format(d['likes'], ',')}"
                      for r, d in zip(ranks, valid_datasets)],
                textposition='auto',
-                marker_color=['rgba(255,0,0,0.6)' if d['is_korea'] else 'rgba(0,0,255,0.6)'
+                marker_color=['rgba(255,0,0,0.6)' if d['is_korea'] else 'rgba(0,0,255,0.6)'
                              for d in valid_datasets],
                opacity=0.8
            ))
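The y values invert the rank so that better-ranked datasets get taller bars: with a 3000-dataset window, rank 1 maps to a height of 3000 and rank 3000 to a height of 1. A quick check of the transform:

    ranks = [1, 150, 3000]
    print([3001 - r for r in ranks])  # -> [3000, 2851, 1]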
@@ -1198,8 +1219,6 @@ def get_datasets_data(progress=gr.Progress()):
            } for d in filtered_datasets])
 
            progress(1.0, desc="Complete!")
-
-
            return fig, html_content, df
 
        except Exception as e: