Spaces:
Running
Running
David Pomerenke
committed on
Commit
·
0c05388
1
Parent(s):
7f54946
Use real population data in map
Browse files
app.py
CHANGED
|
@@ -251,9 +251,24 @@ def format_number(n):
|
|
| 251 |
return f"{n/1_000:.0f}K"
|
| 252 |
return str(n)
|
| 253 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
|
| 255 |
def create_world_map(results):
|
| 256 |
# Collect all country data
|
|
|
|
| 257 |
country_data = {}
|
| 258 |
for lang in results:
|
| 259 |
if "population" not in lang or lang["bleu"] is None:
|
|
@@ -270,6 +285,7 @@ def create_world_map(results):
|
|
| 270 |
if iso3_code not in country_data:
|
| 271 |
country_data[iso3_code] = {
|
| 272 |
"total_speakers": 0,
|
|
|
|
| 273 |
"weighted_bleu_sum": 0,
|
| 274 |
"languages": [],
|
| 275 |
}
|
|
@@ -324,7 +340,6 @@ def create_world_map(results):
|
|
| 324 |
|
| 325 |
# Sort languages by number of speakers
|
| 326 |
langs = sorted(data["languages"], key=lambda x: x["speakers"], reverse=True)
|
| 327 |
-
total_speakers = sum(lang["speakers"] for lang in langs)
|
| 328 |
|
| 329 |
# Take top 5 languages and summarize the rest
|
| 330 |
main_langs = langs[:5]
|
|
@@ -333,7 +348,7 @@ def create_world_map(results):
|
|
| 333 |
# Create language rows with bars
|
| 334 |
lang_rows = []
|
| 335 |
for lang in main_langs:
|
| 336 |
-
percentage = (lang["speakers"] /
|
| 337 |
speaker_bar = make_black_bar(percentage / 100)
|
| 338 |
bleu_bar = make_colored_bar((lang["bleu"] - 0.2) / 0.2)
|
| 339 |
|
|
@@ -346,7 +361,7 @@ def create_world_map(results):
|
|
| 346 |
# Add summary for other languages if any
|
| 347 |
if other_langs:
|
| 348 |
other_speakers = sum(lang["speakers"] for lang in other_langs)
|
| 349 |
-
other_percentage = (other_speakers /
|
| 350 |
other_avg_bleu = sum(lang["bleu"] for lang in other_langs) / len(
|
| 351 |
other_langs
|
| 352 |
)
|
|
@@ -360,15 +375,8 @@ def create_world_map(results):
|
|
| 360 |
f"{bleu_bar} {other_avg_bleu:.3f} BLEU<br>"
|
| 361 |
)
|
| 362 |
|
| 363 |
-
# Create overall BLEU visualization
|
| 364 |
-
bleu_percentage = (weighted_avg - 0.2) / 0.2 # Scale from 0.2-0.4 to 0-1
|
| 365 |
-
overall_bleu_bar = make_colored_bar(bleu_percentage)
|
| 366 |
-
|
| 367 |
hover_text = (
|
| 368 |
f"<b>{country_name}</b><br><br>"
|
| 369 |
-
f"{format_number(data['total_speakers'])} speakers*<br>"
|
| 370 |
-
f"{overall_bleu_bar} {weighted_avg:.3f} BLEU<br><br>"
|
| 371 |
-
f"<b>Languages:</b><br><br>"
|
| 372 |
f"{'<br>'.join(lang_rows)}"
|
| 373 |
)
|
| 374 |
|
|
|
|
| 251 |
return f"{n/1_000:.0f}K"
|
| 252 |
return str(n)
|
| 253 |
|
| 254 |
+
def get_population_data(filename=None):
    """Return a mapping of territory code to population.

    The data is read from the ``./territoryInfo/territory`` elements of a
    CLDR-style ``supplementalData.xml`` document, using each element's
    ``type`` attribute as the key and its ``population`` attribute
    (converted to ``float``) as the value.

    Args:
        filename: Optional path to the XML file. When omitted, the copy of
            ``supplementalData.xml`` located via
            ``language_data.util.data_filename`` is used.

    Returns:
        dict mapping territory code (str) to population (float). Territories
        that lack a ``type`` or ``population`` attribute are skipped.
    """
    import xml.etree.ElementTree as ET

    if filename is None:
        # Imported lazily so the parser is usable with an explicit path even
        # when the language_data package is not installed.
        from language_data.util import data_filename

        filename = data_filename("supplementalData.xml")

    # ET.parse opens the file in binary mode, honours the encoding given in
    # the XML declaration, and closes the handle when done.
    root = ET.parse(filename).getroot()

    data = {}
    for territory in root.findall("./territoryInfo/territory"):
        t_code = territory.get("type")
        t_population = territory.get("population")
        if t_code is None or t_population is None:
            # Incomplete entry: skip it rather than abort the whole load.
            continue
        data[t_code] = float(t_population)
    return data
|
| 268 |
|
| 269 |
def create_world_map(results):
|
| 270 |
# Collect all country data
|
| 271 |
+
population_data = get_population_data()
|
| 272 |
country_data = {}
|
| 273 |
for lang in results:
|
| 274 |
if "population" not in lang or lang["bleu"] is None:
|
|
|
|
| 285 |
if iso3_code not in country_data:
|
| 286 |
country_data[iso3_code] = {
|
| 287 |
"total_speakers": 0,
|
| 288 |
+
"population": population_data.get(country_code, 0),
|
| 289 |
"weighted_bleu_sum": 0,
|
| 290 |
"languages": [],
|
| 291 |
}
|
|
|
|
| 340 |
|
| 341 |
# Sort languages by number of speakers
|
| 342 |
langs = sorted(data["languages"], key=lambda x: x["speakers"], reverse=True)
|
|
|
|
| 343 |
|
| 344 |
# Take top 5 languages and summarize the rest
|
| 345 |
main_langs = langs[:5]
|
|
|
|
| 348 |
# Create language rows with bars
|
| 349 |
lang_rows = []
|
| 350 |
for lang in main_langs:
|
| 351 |
+
percentage = (lang["speakers"] / data["population"]) * 100
|
| 352 |
speaker_bar = make_black_bar(percentage / 100)
|
| 353 |
bleu_bar = make_colored_bar((lang["bleu"] - 0.2) / 0.2)
|
| 354 |
|
|
|
|
| 361 |
# Add summary for other languages if any
|
| 362 |
if other_langs:
|
| 363 |
other_speakers = sum(lang["speakers"] for lang in other_langs)
|
| 364 |
+
other_percentage = (other_speakers / data["population"]) * 100
|
| 365 |
other_avg_bleu = sum(lang["bleu"] for lang in other_langs) / len(
|
| 366 |
other_langs
|
| 367 |
)
|
|
|
|
| 375 |
f"{bleu_bar} {other_avg_bleu:.3f} BLEU<br>"
|
| 376 |
)
|
| 377 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
hover_text = (
|
| 379 |
f"<b>{country_name}</b><br><br>"
|
|
|
|
|
|
|
|
|
|
| 380 |
f"{'<br>'.join(lang_rows)}"
|
| 381 |
)
|
| 382 |
|