From b25ec6b732e94d0f8ee3582328ca2b1b34a4cc4a Mon Sep 17 00:00:00 2001 From: Marc Cromme Date: Tue, 1 May 2007 13:16:09 +0000 Subject: [PATCH] Added sorting test for ICU - only used in test_icu_I18N.c so far. English and German sorting tests perform fine (including German special characters), but sorting of Danish special characters fails. Very suspect. Needs more investigation! See test_icu_I18N_sortmap() in test_icu_I18N.c for details --- src/icu_I18N.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++-- src/icu_I18N.h | 19 ++++++++++- src/test_icu_I18N.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 194 insertions(+), 7 deletions(-) diff --git a/src/icu_I18N.c b/src/icu_I18N.c index 7044160..25951fe 100644 --- a/src/icu_I18N.c +++ b/src/icu_I18N.c @@ -1,4 +1,4 @@ -/* $Id: icu_I18N.c,v 1.2 2007-05-01 08:17:05 marc Exp $ +/* $Id: icu_I18N.c,v 1.3 2007-05-01 13:16:09 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -42,7 +42,7 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA //#include //#include /* Basic ICU data types */ -//#include +#include //#include /* C Converter API */ //#include //#include @@ -77,7 +77,7 @@ int32_t icu_utf16_casemap(UChar *dest16, int32_t dest16_cap, const char *locale, char action); -// source code +// source code of all functions int icu_check_status (UErrorCode status) { @@ -208,6 +208,93 @@ char * icu_casemap(NMEM nmem, char *buf, size_t buf_cap, } +struct icu_termmap * icu_termmap_create(NMEM nmem) +{ + struct icu_termmap *itmp = nmem_malloc(nmem, sizeof(*itmp)); + itmp->sort_len = 0; + itmp->sort_key = 0; + itmp->norm_term = 0; + itmp->disp_term = 0; + return itmp; +}; + +int icu_termmap_cmp(const void *vp1, const void *vp2) +{ + struct icu_termmap *itmp1 = *(struct icu_termmap **) vp1; + struct icu_termmap *itmp2 = *(struct icu_termmap **) vp2; + int cmp = 0; + +#if 0 + size_t len = itmp1->sort_len; + // minimum sortkey length + if (itmp2->sort_len < 
len) + len = itmp2->sort_len; + + cmp = strncmp(itmp1->sort_key, itmp2->sort_key, len); + + if (cmp == 0 && (itmp1->sort_len < itmp2->sort_len)) + cmp = -1; + + if (cmp == 0 && (itmp1->sort_len > itmp2->sort_len)) + cmp = 1; +#else + cmp = strcmp(itmp1->sort_key, itmp2->sort_key); +#endif + return cmp; +} + + + +char * icu_sortmap(NMEM nmem, char *buf, size_t buf_cap, + size_t *dest8_len, const char *src8, + const char *locale) +{ + size_t src8_len = strlen(src8); + int32_t buf_len = 0; + char * dest8 = 0; + + if (dest8_len) + *dest8_len = 0; + + if (!buf || !(buf_cap > 0) || !src8_len) + return 0; + + // converting buf to utf16 + buf = (char *)icu_utf16_from_utf8n((UChar *) buf, + (int32_t) buf_cap, &buf_len, + src8, src8_len); + + // sort mapping + //buf_len = (size_t) icu_utf16_casemap((UChar *)buf, (int32_t) buf_cap, + // (const UChar *)buf, (int32_t) buf_len, + // locale, action); + + + { + UErrorCode status = U_ZERO_ERROR; + + UCollator * coll = ucol_open (locale, &status); + if (U_ZERO_ERROR != icu_check_status(status)) + buf_len = 0; + + ucol_getSortKey(coll, (const UChar *) buf, (int32_t) buf_len, + (uint8_t *) buf, (int32_t) buf_cap); + + ucol_close(coll); + } + + + // copying out to nmem + buf[buf_len] = '\0'; + + if(dest8_len) + *dest8_len = buf_len; + + dest8 = nmem_strdup(nmem, buf); + return dest8; +} + + #endif // HAVE_ICU diff --git a/src/icu_I18N.h b/src/icu_I18N.h index 971b0ca..299058d 100644 --- a/src/icu_I18N.h +++ b/src/icu_I18N.h @@ -1,4 +1,4 @@ -/* $Id: icu_I18N.h,v 1.2 2007-05-01 08:17:05 marc Exp $ +/* $Id: icu_I18N.h,v 1.3 2007-05-01 13:16:09 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -39,10 +39,27 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA //#include + +struct icu_termmap +{ + size_t sort_len; // needed, because + char * sort_key; // no guarantee on '\0' termination ??? 
+ char * norm_term; // standard C utf-8 string + char * disp_term; // standard C utf-8 string +}; + +struct icu_termmap * icu_termmap_create(NMEM nmem); + +int icu_termmap_cmp(const void *vp1, const void *vp2); + char * icu_casemap(NMEM nmem, char *buf, size_t buf_cap, size_t *dest8_len, const char *src8, const char *locale, char action); +char * icu_sortmap(NMEM nmem, char *buf, size_t buf_cap, + size_t *dest8_len, const char *src8, + const char *locale); + #endif // HAVE_ICU #endif // ICU_I18NL_H diff --git a/src/test_icu_I18N.c b/src/test_icu_I18N.c index c8862f6..c4f3067 100644 --- a/src/test_icu_I18N.c +++ b/src/test_icu_I18N.c @@ -1,4 +1,4 @@ -/* $Id: test_icu_I18N.c,v 1.4 2007-05-01 08:10:26 marc Exp $ +/* $Id: test_icu_I18N.c,v 1.5 2007-05-01 13:16:09 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -37,7 +37,8 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #ifdef HAVE_ICU #include "icu_I18N.h" -#include "string.h" +#include +#include // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 @@ -140,6 +141,8 @@ void test_icu_I18N_casemap(int argc, char **argv) } +// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 + void test_icu_I18N_casemap_failures(int argc, char **argv) { @@ -190,6 +193,85 @@ void test_icu_I18N_casemap_failures(int argc, char **argv) nmem_destroy(nmem); } +// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 + +int test_icu_sortmap(const char * locale, size_t list_len, + const char ** src8_list, const char ** check8_list) +{ + int sucess = 1; + + size_t i = 0; + + + NMEM nmem = nmem_create(); + size_t buf_cap = 128; + char buf[buf_cap]; + struct icu_termmap ** dest8_list + = nmem_malloc(nmem, sizeof(struct icu_termmap *) * list_len); + //size_t dest8_len = 0; + //size_t src8_len = strlen(src8); + + // initializing icu_termmap + for (i = 0; i < list_len; i++){ + dest8_list[i] = icu_termmap_create(nmem); + dest8_list[i]->norm_term = nmem_strdup(nmem, 
+ src8_list[i]); + dest8_list[i]->disp_term = nmem_strdup(nmem, src8_list[i]); + //dest8_list[i]->sort_key = nmem_strdup(nmem, src8_list[i]); + //dest8_list[i]->sort_len = strlen(src8_list[i]); + dest8_list[i]->sort_key + = icu_sortmap(nmem, buf, buf_cap, &(dest8_list[i]->sort_len), + src8_list[i], locale); + } + + // do the sorting + qsort(dest8_list, list_len, + sizeof(struct icu_termmap *), icu_termmap_cmp); + + // checking correct sorting + for (i = 0; i < list_len; i++){ + if (0 != strcmp(dest8_list[i]->disp_term, check8_list[i])){ + sucess = 0; + } + } + + if (!sucess) + for (i = 0; i < list_len; i++){ + printf("icu_sortmap '%s': '%s' '%s'\n", locale, + dest8_list[i]->disp_term, check8_list[i]); + } + + nmem_destroy(nmem); + + return sucess; +} + + +// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 + +void test_icu_I18N_sortmap(int argc, char **argv) +{ + + // successful tests + size_t en_1_len = 6; + const char * en_1_src[6] = {"z", "K", "a", "A", "Z", "k"}; + const char * en_1_cck[6] = {"a", "A", "K", "k", "z", "Z"}; + YAZ_CHECK(test_icu_sortmap("en", en_1_len, en_1_src, en_1_cck)); + + // successful tests - this one fails and should not!!! 
+ size_t da_1_len = 6; + const char * da_1_src[6] = {"z", "å", "o", "æ", "a", "ø"}; + const char * da_1_cck[6] = {"a", "o", "z", "æ", "ø", "å"}; + YAZ_CHECK(0 == test_icu_sortmap("da", da_1_len, da_1_src, da_1_cck)); + + // successful tests + size_t de_1_len = 9; + const char * de_1_src[9] = {"u", "ä", "o", "t", "s", "ß", "ü", "ö", "a"}; + const char * de_1_cck[9] = {"ä", "a", "o", "ö", "s", "ß", "t", "u", "ü"}; + YAZ_CHECK(test_icu_sortmap("de", de_1_len, de_1_src, de_1_cck)); + +} + + #endif // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 @@ -204,6 +286,7 @@ int main(int argc, char **argv) test_icu_I18N_casemap_failures(argc, argv); test_icu_I18N_casemap(argc, argv); + test_icu_I18N_sortmap(argc, argv); #else @@ -217,7 +300,7 @@ int main(int argc, char **argv) } - +// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 /* * Local variables: -- 1.7.10.4