mirror of https://github.com/usememos/memos
feat: update Chinese translations in zh.json and zh-Hant.json using locale_updater.py (#1506)
update zh.json and zh-Hant.json using locale_updater.py
pull/1507/head
parent
66ed43cbcb
commit
0020498c10
@ -0,0 +1,202 @@
|
||||
# Author: Oaklight
|
||||
# GitHub profile: https://github.com/Oaklight
|
||||
# Date: April 9, 2023
|
||||
# Description: This script is used to patch missing translations in a locale file.
|
||||
|
||||
# The script uses 'en.json' as the reference file to find missing keys in other locale files.
|
||||
# It iterates through each field and its entries in 'en.json' and checks whether the same field/entry exists in the other files.
|
||||
# If a field/entry is missing, the script shows the source string and a reference Google translation, then asks you to confirm the reference or type a correction.
|
||||
# The resulting file is saved as './*.proposed.json', and you should review it before merging and uploading.
|
||||
|
||||
# usage: locale_updater.py [-h] ref_locale tgt_locale
|
||||
|
||||
# TODO: add other NMT systems to suit different preferences and accuracy needs
|
||||
|
||||
import json
|
||||
import requests
|
||||
|
||||
|
||||
def flatten_json(nested_json, parent_key="", sep=":"):
    """Collapse a nested dict into a one-level dict keyed by joined paths.

    Args:
        nested_json: Dict whose values may themselves be dicts.
        parent_key: Prefix placed in front of every produced key; an empty
            prefix means the keys start at the top level.
        sep: String inserted between the path components.

    Returns:
        A new dict mapping each joined path to its non-dict value. Nested
        dicts that contain no leaves (e.g. ``{}``) contribute no entries.
    """

    def walk(node, prefix):
        # Depth-first traversal in insertion order, yielding (path, leaf).
        for name, item in node.items():
            path = prefix + sep + name if prefix else name
            if isinstance(item, dict):
                yield from walk(item, path)
            else:
                yield path, item

    return dict(walk(nested_json, parent_key))
|
||||
|
||||
|
||||
def unflatten_json(flattened_dict, sep=":"):
    """Rebuild a nested dict from a dict keyed by ``sep``-joined paths.

    Args:
        flattened_dict: Mapping of joined path strings to leaf values.
        sep: Separator that was used to join the path components.

    Returns:
        A new nested dict in which every path component except the last
        becomes an intermediate dict and the last one holds the value.
    """
    tree = {}
    for path, leaf in flattened_dict.items():
        *branches, last = path.split(sep)
        node = tree
        # Descend along the path, creating intermediate dicts on demand.
        for name in branches:
            if name not in node:
                node[name] = {}
            node = node[name]
        node[last] = leaf
    return tree
|
||||
|
||||
|
||||
def sort_nested_json(nested_json):
    """Return a copy of a JSON-like value with all dict keys sorted.

    Dicts are rebuilt with their keys in sorted order, recursively. Lists
    keep their element order, but each element is processed recursively.
    Any other value is returned unchanged.
    """
    if isinstance(nested_json, list):
        return [sort_nested_json(element) for element in nested_json]
    if not isinstance(nested_json, dict):
        return nested_json
    return {
        name: sort_nested_json(nested_json[name])
        for name in sorted(nested_json)
    }
|
||||
|
||||
|
||||
def google_translate(
    source_text, source_language="en", target_language="zh-CN", timeout=10
):
    """Translate ``source_text`` through the Google Translate web endpoint.

    NOTE(review): ``translate_a/single?client=gtx`` looks like an
    undocumented endpoint; its response layout is assumed to be a list whose
    first element is a list of segments, each starting with the translated
    text - confirm before relying on it.

    Args:
        source_text: Text to translate. Windows line breaks (``\\r\\n``) are
            replaced by spaces before sending.
        source_language: Language code of ``source_text`` (``sl`` parameter).
        target_language: Language code to translate into (``tl`` parameter).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The translated text with all returned segments joined together, or
        an empty string when the response contains no segments.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-success HTTP status.
    """
    new_line = "\r\n"
    query = source_text.replace(new_line, " ")

    url = "https://translate.googleapis.com/translate_a/single"
    params = {
        "client": "gtx",
        "sl": source_language,
        "tl": target_language,
        "dt": "t",
    }
    # Passing dicts lets requests URL-encode the query string and the form
    # body, so characters such as '&', '+', '%' or '=' in the text survive.
    response = requests.post(
        url, params=params, data={"q": query}, timeout=timeout
    )
    response.raise_for_status()
    json_value = response.json()

    # Longer inputs come back split into several segments; join all of them
    # instead of keeping only the first one.
    segments = json_value[0] or []
    pieces = [segment[0] for segment in segments if segment and segment[0]]
    return "".join(piece.replace(new_line, "") for piece in pieces)
|
||||
|
||||
|
||||
def get_code_name(json_filename):
    """Derive a translation language code from a locale file name or path.

    The file name (without directories and without anything after the first
    dot) is split on ``_`` into a language part and an optional country part,
    e.g. ``zh.json``, ``./zh-Hant.json`` or ``locales/pt_BR.json``.

    Args:
        json_filename: Locale file name or path.

    Returns:
        The mapped code for the language part (for example ``zh`` ->
        ``zh-CN``). A language that is not in the map is returned as-is.
        When a country part is present it replaces any default region, so
        ``pt_BR`` gives ``pt-BR`` and ``zh_tw`` gives ``zh-TW``. An empty
        string is returned when no language part can be extracted.
    """
    import os

    # Strip directories first: splitting the whole path on "." would turn
    # "./en.json" into an empty language code.
    stem = os.path.basename(json_filename).split(".")[0]
    file_parts = stem.split("_")
    lang_code = file_parts[0]
    country_code = file_parts[1] if len(file_parts) > 1 else ""

    # Map language code to code name
    lang_map = {
        "de": "de",
        "en": "en",
        "es": "es",
        "fr": "fr",
        "it": "it",
        "ko": "ko",
        "nl": "nl",
        "pl": "pl",
        "pt": "pt-BR",
        "ru": "ru",
        "sl": "sl",
        "sv": "sv",
        "tr": "tr",
        "uk": "uk",
        "vi": "vi",
        "zh-Hant": "zh-TW",
        "zh": "zh-CN",
    }
    # Unknown languages fall back to the code taken from the file name.
    code_name = lang_map.get(lang_code, lang_code)

    # An explicit country overrides the default region of the mapping
    # instead of being appended to it (avoids results like "pt-BR-BR").
    if country_code and code_name:
        code_name = code_name.split("-")[0] + "-" + country_code.upper()

    return code_name
|
||||
|
||||
|
||||
def main():
    """Interactively fill in translations missing from a target locale file.

    Reads the reference and target locale JSON files named on the command
    line, lists every key present in the reference but absent from the
    target, shows a Google reference translation for each one, and asks for
    a translation (pressing Enter accepts the reference). The completed
    target is written to ``<target name>.proposed.json`` in the current
    directory; the input files are not modified.
    """
    import argparse
    import os

    parser = argparse.ArgumentParser()
    parser.add_argument("ref_locale", help="reference locale file")
    parser.add_argument("tgt_locale", help="target locale file")
    args = parser.parse_args()
    ref_locale = args.ref_locale
    tgt_locale = args.tgt_locale

    ref_codename = get_code_name(ref_locale)
    tgt_codename = get_code_name(tgt_locale)

    # Locale files hold non-ASCII text; read them as UTF-8 regardless of the
    # platform's default encoding.
    with open(ref_locale, "r", encoding="utf-8") as f:
        ref = json.load(f)

    with open(tgt_locale, "r", encoding="utf-8") as f:
        tgt = json.load(f)

    # Work on flattened copies so nested keys can be compared directly.
    ref_flat = flatten_json(ref)
    tgt_flat = flatten_json(tgt)

    # Sort the missing keys so the listing and the prompts always appear in
    # the same order.
    missing_keys = sorted(set(ref_flat) - set(tgt_flat))
    if not missing_keys:
        print("\033[92m All keys are present in the target locale \033[0m")
        return
    print(f"\033[91m Total missing keys: \033[0m {len(missing_keys)}")

    # Overview: missing key in red, reference string in green.
    for key in missing_keys:
        print(
            f"\033[91mMissing key: {key}\033[0m"
            f" | \033[92mEnglish: {ref_flat[key]}\033[0m"
        )
    print("=============================================")
    print(f"\033[91m Total missing keys: \033[0m {len(missing_keys)}")

    # For every missing key, show the source string and a google_translate
    # reference, then prompt for the translation to store in tgt_flat.
    for i, key in enumerate(missing_keys):
        print(
            f"============================================= {i + 1}/{len(missing_keys)}"
        )
        print(f"\033[91mMissing key: \033[0m{key}")
        print(f"\033[92m{ref_codename}: \033[0m{ref_flat[key]}")
        # Reference translation from Google Translate, label in blue.
        proposal_google = google_translate(ref_flat[key], ref_codename, tgt_codename)
        print(f"\033[94mReference {tgt_codename} translation: \033[0m{proposal_google}")
        # An empty answer accepts the reference translation.
        proposal = input("\033[92m" + "Enter translation: " + "\033[0m")
        if proposal == "":
            proposal = proposal_google
        tgt_flat[key] = proposal

    # Restore the nested layout and save it next to the working directory as
    # "<target file name>.proposed.json".
    tgt_unflat = unflatten_json(tgt_flat)
    tgt_locale_name = os.path.basename(tgt_locale).split(".")[0]
    # ensure_ascii=False writes raw non-ASCII characters, so the output file
    # must be opened as UTF-8 explicitly.
    with open(f"{tgt_locale_name}.proposed.json", "w", encoding="utf-8") as f:
        json.dump(tgt_unflat, f, indent=2, ensure_ascii=False)


if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue