-/* $Id: icu_I18N.c,v 1.2 2007-05-01 08:17:05 marc Exp $
+/* $Id: icu_I18N.c,v 1.3 2007-05-01 13:16:09 marc Exp $
Copyright (c) 2006-2007, Index Data.
This file is part of Pazpar2.
//#include <unicode/ustdio.h>
//#include <unicode/utypes.h> /* Basic ICU data types */
-//#include <unicode/ucol.h>
+#include <unicode/ucol.h>
//#include <unicode/ucnv.h> /* C Converter API */
//#include <unicode/uloc.h>
//#include <unicode/ubrk.h>
const char *locale, char action);
-// source code
+// source code of all functions
int icu_check_status (UErrorCode status)
{
}
+struct icu_termmap * icu_termmap_create(NMEM nmem)
+{
+ struct icu_termmap *itmp = nmem_malloc(nmem, sizeof(*itmp));
+ itmp->sort_len = 0;
+ itmp->sort_key = 0;
+ itmp->norm_term = 0;
+ itmp->disp_term = 0;
+ return itmp;
+};
+
+int icu_termmap_cmp(const void *vp1, const void *vp2)
+{
+ struct icu_termmap *itmp1 = *(struct icu_termmap **) vp1;
+ struct icu_termmap *itmp2 = *(struct icu_termmap **) vp2;
+ int cmp = 0;
+
+#if 0
+ size_t len = itmp1->sort_len;
+ // minimum sortkey length
+ if (itmp2->sort_len < len)
+ len = itmp2->sort_len;
+
+ cmp = strncmp(itmp1->sort_key, itmp2->sort_key, len);
+
+ if (cmp == 0 && (itmp1->sort_len < itmp2->sort_len))
+ cmp = -1;
+
+ if (cmp == 0 && (itmp1->sort_len > itmp2->sort_len))
+ cmp = 1;
+#else
+ cmp = strcmp(itmp1->sort_key, itmp2->sort_key);
+#endif
+ return cmp;
+}
+
+
+
+char * icu_sortmap(NMEM nmem, char *buf, size_t buf_cap,
+ size_t *dest8_len, const char *src8,
+ const char *locale)
+{
+ size_t src8_len = strlen(src8);
+ int32_t buf_len = 0;
+ char * dest8 = 0;
+
+ if (dest8_len)
+ *dest8_len = 0;
+
+ if (!buf || !(buf_cap > 0) || !src8_len)
+ return 0;
+
+ // converting buf to utf16
+ buf = (char *)icu_utf16_from_utf8n((UChar *) buf,
+ (int32_t) buf_cap, &buf_len,
+ src8, src8_len);
+
+ // sort mapping
+ //buf_len = (size_t) icu_utf16_casemap((UChar *)buf, (int32_t) buf_cap,
+ // (const UChar *)buf, (int32_t) buf_len,
+ // locale, action);
+
+
+ {
+ UErrorCode status = U_ZERO_ERROR;
+
+ UCollator * coll = ucol_open (locale, &status);
+ if (U_ZERO_ERROR != icu_check_status(status))
+ buf_len = 0;
+
+ ucol_getSortKey(coll, (const UChar *) buf, (int32_t) buf_len,
+ (uint8_t *) buf, (int32_t) buf_cap);
+
+ ucol_close(coll);
+ }
+
+
+ // copying out to nmem
+ buf[buf_len] = '\0';
+
+ if(dest8_len)
+ *dest8_len = buf_len;
+
+ dest8 = nmem_strdup(nmem, buf);
+ return dest8;
+}
+
+
#endif // HAVE_ICU
-/* $Id: icu_I18N.h,v 1.2 2007-05-01 08:17:05 marc Exp $
+/* $Id: icu_I18N.h,v 1.3 2007-05-01 13:16:09 marc Exp $
Copyright (c) 2006-2007, Index Data.
This file is part of Pazpar2.
//#include <unicode/unistr.h>
+
+struct icu_termmap
+{
+ size_t sort_len; // needed, because
+ char * sort_key; // no guarantee on '\0' termination ???
+ char * norm_term; // standard C utf-8 string
+ char * disp_term; // standard C utf-8 string
+};
+
+struct icu_termmap * icu_termmap_create(NMEM nmem);
+
+int icu_termmap_cmp(const void *vp1, const void *vp2);
+
char * icu_casemap(NMEM nmem, char *buf, size_t buf_cap,
size_t *dest8_len, const char *src8,
const char *locale, char action);
+char * icu_sortmap(NMEM nmem, char *buf, size_t buf_cap,
+ size_t *dest8_len, const char *src8,
+ const char *locale);
+
#endif // HAVE_ICU
#endif // ICU_I18NL_H
-/* $Id: test_icu_I18N.c,v 1.4 2007-05-01 08:10:26 marc Exp $
+/* $Id: test_icu_I18N.c,v 1.5 2007-05-01 13:16:09 marc Exp $
Copyright (c) 2006-2007, Index Data.
This file is part of Pazpar2.
#ifdef HAVE_ICU
#include "icu_I18N.h"
-#include "string.h"
+#include <string.h>
+#include <stdlib.h>
// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
}
+// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
+
void test_icu_I18N_casemap_failures(int argc, char **argv)
{
nmem_destroy(nmem);
}
+// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
+
+int test_icu_sortmap(const char * locale, size_t list_len,
+ const char ** src8_list, const char ** check8_list)
+{
+ int sucess = 1;
+
+ size_t i = 0;
+
+
+ NMEM nmem = nmem_create();
+ size_t buf_cap = 128;
+ char buf[buf_cap];
+ struct icu_termmap ** dest8_list
+ = nmem_malloc(nmem, sizeof(struct icu_termmap *) * list_len);
+ //size_t dest8_len = 0;
+ //size_t src8_len = strlen(src8);
+
+ // initializing icu_termmap
+ for (i = 0; i < list_len; i++){
+ dest8_list[i] = icu_termmap_create(nmem);
+ dest8_list[i]->norm_term = nmem_strdup(nmem, src8_list[i]);
+ dest8_list[i]->disp_term = nmem_strdup(nmem, src8_list[i]);
+ //dest8_list[i]->sort_key = nmem_strdup(nmem, src8_list[i]);
+ //dest8_list[i]->sort_len = strlen(src8_list[i]);
+ dest8_list[i]->sort_key
+ = icu_sortmap(nmem, buf, buf_cap, &(dest8_list[i]->sort_len),
+ src8_list[i], locale);
+ }
+
+ // do the sorting
+ qsort(dest8_list, list_len,
+ sizeof(struct icu_termmap *), icu_termmap_cmp);
+
+ // checking correct sorting
+ for (i = 0; i < list_len; i++){
+ if (0 != strcmp(dest8_list[i]->disp_term, check8_list[i])){
+ sucess = 0;
+ }
+ }
+
+ if (!sucess)
+ for (i = 0; i < list_len; i++){
+ printf("icu_sortmap '%s': '%s' '%s'\n", locale,
+ dest8_list[i]->disp_term, check8_list[i]);
+ }
+
+ nmem_destroy(nmem);
+
+ return sucess;
+}
+
+
+// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
+
+void test_icu_I18N_sortmap(int argc, char **argv)
+{
+
+ // sucessful tests
+ size_t en_1_len = 6;
+ const char * en_1_src[6] = {"z", "K", "a", "A", "Z", "k"};
+ const char * en_1_cck[6] = {"a", "A", "K", "k", "z", "Z"};
+ YAZ_CHECK(test_icu_sortmap("en", en_1_len, en_1_src, en_1_cck));
+
+ // sucessful tests - this one fails and should not!!!
+ size_t da_1_len = 6;
+ const char * da_1_src[6] = {"z", "å", "o", "æ", "a", "ø"};
+ const char * da_1_cck[6] = {"a", "o", "z", "æ", "ø", "å"};
+ YAZ_CHECK(0 == test_icu_sortmap("da", da_1_len, da_1_src, da_1_cck));
+
+ // sucessful tests
+ size_t de_1_len = 9;
+ const char * de_1_src[9] = {"u", "ä", "o", "t", "s", "ß", "ü", "ö", "a"};
+ const char * de_1_cck[9] = {"ä", "a", "o", "ö", "s", "ß", "t", "u", "ü"};
+ YAZ_CHECK(test_icu_sortmap("de", de_1_len, de_1_src, de_1_cck));
+
+}
+
+
#endif
// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
test_icu_I18N_casemap_failures(argc, argv);
test_icu_I18N_casemap(argc, argv);
+ test_icu_I18N_sortmap(argc, argv);
#else
}
-
+// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
/*
* Local variables: