# Aeon-Avinash's picture
# Update lang_codes.py
# c96c4a2 verified
import requests
import json
def generate_lang_code_file(
        url='https://raw.githubusercontent.com/openlanguagedata/flores/main/README.md',
        file_path='flores_language_codes.json',
        timeout=30):
    """Download the Flores README and save its language table as JSON.

    The README text is scanned for pipe-delimited table rows that follow the
    first line containing "Language coverage". Each row becomes a dict with
    the keys ``code``, ``identifier`` and ``name``; the resulting list is
    written to ``file_path`` as indented JSON.

    Args:
        url: Location of the raw README text. Defaults to the Flores
            repository README.
        file_path: Destination of the generated JSON file.
        timeout: Seconds to wait for the HTTP server before giving up, so a
            stalled connection cannot hang the caller forever.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a README.
    response.raise_for_status()

    languages = _parse_flores_language_table(response.text)

    # Convert to JSON and save it to disk.
    json_data = json.dumps(languages, indent=4)
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(json_data)
    print(f"JSON data saved to {file_path}")


def _parse_flores_language_table(content):
    """Return the language rows found after the "Language coverage" line.

    Every line containing ``|`` that appears after the marker line is treated
    as a table row. The first two collected rows (column labels and the
    divider) are dropped from the result.
    """
    languages = []
    start_parsing = False
    for line in content.split('\n'):
        if "Language coverage" in line:
            start_parsing = True
            continue
        # Blank lines contain no '|', so this single check also skips them.
        if not start_parsing or '|' not in line:
            continue
        parts = line.split('|')
        # Columns 1..3 are read below, so at least four pieces are required;
        # shorter rows are skipped rather than raising IndexError.
        if len(parts) < 4:
            continue
        languages.append({
            # [1:-1] drops one wrapping character on each side of the cell
            # (presumably Markdown backticks -- confirm against the README).
            "code": parts[1].strip()[1:-1],
            "identifier": parts[2].strip()[1:-1],
            "name": parts[3].strip(),
        })
    # Omit the labels and divider rows.
    return languages[2:]
# To (re)create flores_language_codes.json, call generate_lang_code_file() once.
def get_language_code(language_name,
                      json_file_path='flores_language_codes.json'):
    """Look up the code of a language by its name, ignoring case.

    Args:
        language_name: Name of the language to search for.
        json_file_path: Path of the JSON file holding a list of objects
            with ``name`` and ``code`` keys.

    Returns:
        The ``code`` of the first entry whose ``name`` matches
        ``language_name`` case-insensitively, or ``None`` when no entry
        matches.
    """
    with open(json_file_path, 'r') as handle:
        entries = json.load(handle)

    # Lazy generator: evaluation stops at the first matching entry.
    matching_codes = (
        entry['code']
        for entry in entries
        if entry['name'].lower() == language_name.lower()
    )
    return next(matching_codes, None)
def get_language_list(json_file_path='flores_language_codes.json'):
    """Return the names of all languages stored in the JSON file.

    Args:
        json_file_path: Path of the JSON file holding a list of objects
            with a ``name`` key.

    Returns:
        A list with the ``name`` of every entry, in file order.
    """
    with open(json_file_path, 'r') as handle:
        entries = json.load(handle)

    # Collect the name of each entry, preserving the order of the file.
    names = []
    for entry in entries:
        names.append(entry['name'])
    return names