[virt-tools-list] [libosinfo 7/8] rfc: Infer ISO language from label

Wed Dec 5 17:00:41 UTC 2012

On Mon, Dec 3, 2012 at 1:23 PM, Christophe Fergeau <cfergeau at redhat.com> wrote:
> Now that libosinfo has an osinfo_db_identify_media method which
> modifies the media it was passed, we can generate properties which
> needs information from the media stored in the OsinfoDB, and
> information from the actual media (ISO volume ID).
> This is useful to guess what languages are supported by a given
> Windows ISO: the end of the ISO volume ID has a language code, which
> we can translate to a locale identifier.
>
> This commit adds a lang-regex property to the OsinfoDB database to
> extract the language code from Windows ISO volume IDs, and
> then add mapping tables to turn it into a locale identifier.
> ---
>  data/oses/windows.xml.in   |   2 +
>  data/schemas/libosinfo.rng |   5 ++
>  osinfo/libosinfo.syms      |   4 +-
>  osinfo/osinfo_db.c         | 177 +++++++++++++++++++++++++++++++++++++++++++++
>  osinfo/osinfo_loader.c     |   4 +-
>  osinfo/osinfo_media.c      |  67 ++++++++++++++++-
>  osinfo/osinfo_media.h      |   3 +
>  7 files changed, 258 insertions(+), 4 deletions(-)
>
> diff --git a/data/oses/windows.xml.in b/data/oses/windows.xml.in
> index d09e873..e8c29f9 100644
> --- a/data/oses/windows.xml.in
> +++ b/data/oses/windows.xml.in
> @@ -739,12 +739,14 @@
>        <iso>
>          <volume-id>(HB1_CCPA_X86FRE|HRM_CCSA_X86FRE|HRM_CCSA_X86CHK|HRM_CCSNA_X86CHK|HRM_CCSNA_X86FRE|HRM_CENA_X86FREV|HRM_CENA_X86CHKV|HRM_CENNA_X86FREV|HRM_CENNA_X86CHKV|HRM_CPRA_X86FREV|HRM_CPRNA_X86FREV)_</volume-id>
>          <publisher-id>MICROSOFT CORPORATION</publisher-id>
> +        <lang-regex>[[:upper:][:digit:]_]*_([[:upper:]]*-[[:upper:]]*)</lang-regex>
>        </iso>
>      </media>
>      <media arch="x86_64">
>        <iso>
>          <volume-id>(HB1_CCPA_X64FRE|HRM_CCSA_X64FRE|HRM_CCSA_X64CHK|HRM_CCSNA_X64FRE|HRM_CCSNA_X64CHK|HRM_CENNA_X64FREV|HRM_CENNA_X64CHKV|HRM_CENA_X64FREV|HRM_CENA_X64CHKV|HRM_CPRA_X64FREV|HRM_CPRNA_X64FREV)_</volume-id>
>          <publisher-id>MICROSOFT CORPORATION</publisher-id>
> +        <lang-regex>[[:upper:][:digit:]_]*_([[:upper:]]*-[[:upper:]]*)</lang-regex>
>        </iso>
>      </media>
>
> diff --git a/data/schemas/libosinfo.rng b/data/schemas/libosinfo.rng
> index 87635dd..36fa1a1 100644
> --- a/data/schemas/libosinfo.rng
> +++ b/data/schemas/libosinfo.rng
> @@ -281,6 +281,11 @@
>              <text/>
>            </element>
>          </optional>
> +        <optional>
> +          <element name='lang-regex'>
> +            <text/>
> +          </element>
> +        </optional>
>        </interleave>
>      </element>
>    </define>
> diff --git a/osinfo/libosinfo.syms b/osinfo/libosinfo.syms
> index d45e58e..7c3efe1 100644
> --- a/osinfo/libosinfo.syms
> +++ b/osinfo/libosinfo.syms
> @@ -341,11 +341,11 @@ LIBOSINFO_0.2.2 {
>         osinfo_install_config_set_target_disk;
>         osinfo_install_config_get_script_disk;
>         osinfo_install_config_set_script_disk;
> -
>         osinfo_install_script_get_avatar_format;
>         osinfo_install_script_get_path_format;
> -
>         osinfo_install_script_get_product_key_format;
> +
> +       osinfo_media_get_languages;
>  } LIBOSINFO_0.2.1;
>
>  /* Symbols in next release...
> diff --git a/osinfo/osinfo_db.c b/osinfo/osinfo_db.c
> index 46101d6..2c2eb5a 100644
> --- a/osinfo/osinfo_db.c
> +++ b/osinfo/osinfo_db.c
> @@ -38,6 +38,177 @@ G_DEFINE_TYPE (OsinfoDb, osinfo_db, G_TYPE_OBJECT);
>       (((str) != NULL) &&                                                \
>        g_regex_match_simple((pattern), (str), 0, 0)))
>
> +static gchar *get_raw_lang(const char *volume_id, const gchar *regex_str)
> +{
> +    GRegex *regex;
> +    GMatchInfo *match;
> +    gboolean matched;
> +    gchar *raw_lang = NULL;
> +
> +    regex = g_regex_new(regex_str, G_REGEX_ANCHORED,
> +                        G_REGEX_MATCH_ANCHORED, NULL);
> +    if (regex == NULL)
> +        return NULL;
> +
> +    matched = g_regex_match(regex, volume_id, G_REGEX_MATCH_ANCHORED, &match);
> +    if (!matched || !g_match_info_matches(match))
> +        goto end;
> +    raw_lang = g_match_info_fetch(match, 1);
> +    if (raw_lang == NULL)
> +        goto end;
> +
> +end:
> +    g_match_info_unref(match);
> +    g_regex_unref(regex);
> +
> +    return raw_lang;
> +}
> +
> +struct LanguageMapping {
> +    const char *iso_label_lang;
> +    const char *gettext_lang;
> +};
> +
> +static GHashTable *init_win_lang_map(void)
> +{
> +    GHashTable *lang_map;
> +    const struct LanguageMapping lang_table[] = {
> +        /* ISO label strings up to Windows 7 */
> +        { "EN", "en_US" },
> +        { "AR", "ar_SA" },
> +        { "BG", "bg_BG" },
> +        { "HK", "zh_HK" },
> +        { "CN", "zh_CN" },
> +        { "TW", "zh_TW" },
> +        { "HR", "hr_HR" },
> +        { "CS", "cs_CZ" },
> +        { "DA", "da_DK" },
> +        { "NL", "nl_NL" },
> +        { "ET", "et_EE" },
> +        { "FI", "fi_FI" },
> +        { "FR", "fr_FR" },
> +        { "DE", "de_DE" },
> +        { "EL", "el_GR" },
> +        { "HE", "he_IL" },
> +        { "HU", "hu_HU" },
> +        { "IT", "it_IT" },
> +        { "JA", "ja_JP" },
> +        { "KO", "ko_KR" },
> +        { "LV", "lv_LV" },
> +        { "LT", "lt_LT" },
> +        { "NO", "nb_NO" },
> +        { "PL", "pl_PL" },
> +        { "BR", "pt_BR" },
> +        { "PT", "pt_PT" },
> +        { "RO", "ro_RO" },
> +        { "RU", "ru_RU" },
> +        { "SRL", "sr_RS at latin" },
> +        { "SK", "sk_SK" },
> +        { "SL", "sl_SI" },
> +        { "ES", "es_ES" },
> +        { "SV", "sv_SE" },
> +        { "TH", "th_TH" },
> +        { "TR", "tr_TR" },
> +        { "UK", "uk_UA" },
> +
> +        /* starting from Windows 8, the ISO label contains both
> +         * language and country code */
> +        { "EN-US", "en_US" },
> +        { "EN-GB", "en_GB" },
> +        { "AR-SA", "ar_SA" },
> +        { "BG-BG", "bg_BG" },
> +        { "ZH-HK", "zh_HK" },
> +        { "ZH-CN", "zh_CN" },
> +        { "ZH-TW", "zh_TW" },
> +        { "HR-HR", "hr_HR" },
> +        { "CS-CZ", "cs_CZ" },
> +        { "DA-DK", "da_DK" },
> +        { "NL-NL", "nl_NL" },
> +        { "ET-EE", "et_EE" },
> +        { "FI-FI", "fi_FI" },
> +        { "FR-FR", "fr_FR" },
> +        { "DE-DE", "de_DE" },
> +        { "EL-GR", "el_GR" },
> +        { "HE-IL", "he_IL" },
> +        { "HU-HU", "hu_HU" },
> +        { "IT-IT", "it_IT" },
> +        { "JA-JP", "ja_JP" },
> +        { "KO-KR", "ko_KR" },
> +        { "LV-LV", "lv_LV" },
> +        { "LT-LT", "lt_LT" },
> +        { "NB-NO", "nb_NO" },
> +        { "PL-PL", "pl_PL" },
> +        { "PT-BR", "pt_BR" },
> +        { "PT-PT", "pt_PT" },
> +        { "RO-RO", "ro_RO" },
> +        { "RU-RU", "ru_RU" },
> +        { "SR-LATN-CS", "sr_RS at latin" },
> +        { "SK-SK", "sk_SK" },
> +        { "SL-SI", "sl_SI" },
> +        { "ES-ES", "es_ES" },
> +        { "SV-SE", "sv_SE" },
> +        { "TH-TH", "th_TH" },
> +        { "TR-TR", "tr_TR" },
> +        { "UK-UA", "uk_UA" },
> +
> +        { "EU-ES", "eu_ES" }, //language pack
> +        { "CA-ES", "ca_ES" }, //language pack
> +        { "GL-ES", "gl_ES" }, //language pack
> +        { "KY-KG", "ky_KG" }, //language pack
> +
> +        { NULL, NULL }
> +    };

It seems that all of the Windows 8-style entries except one (SR-LATN-CS)
can be derived with a simple 's/-/_/' conversion plus lower-casing the
language part, so they do not need to be hard-coded.

The rest of the patch looks good as a first implementation. We can
switch to the datamaps API here once that API is available.

-- 
Regards,

Zeeshan Ali (Khattak)
FSF member#5124