From: Marc Cromme Date: Fri, 11 May 2007 08:27:29 +0000 (+0000) Subject: added first examples of ICU transliterator token normalization X-Git-Tag: PAZPAR2.1.0.0~158 X-Git-Url: http://sru.miketaylor.org.uk/?a=commitdiff_plain;h=ae2621373444129f49c4063980554c5aed6cb57f;p=pazpar2-moved-to-github.git added first examples of ICU transliterator token normalization --- diff --git a/src/icu_I18N.h b/src/icu_I18N.h index df6cd2d..2461801 100644 --- a/src/icu_I18N.h +++ b/src/icu_I18N.h @@ -1,4 +1,4 @@ -/* $Id: icu_I18N.h,v 1.8 2007-05-09 14:01:21 marc Exp $ +/* $Id: icu_I18N.h,v 1.9 2007-05-11 08:27:29 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -37,6 +37,8 @@ //#include #include //#include +#include + // forward declarations diff --git a/src/test_icu_I18N.c b/src/test_icu_I18N.c index a588b05..5b707ac 100644 --- a/src/test_icu_I18N.c +++ b/src/test_icu_I18N.c @@ -1,4 +1,4 @@ -/* $Id: test_icu_I18N.c,v 1.15 2007-05-11 06:48:32 adam Exp $ +/* $Id: test_icu_I18N.c,v 1.16 2007-05-11 08:27:29 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -316,26 +316,79 @@ void test_icu_I18N_sortmap(int argc, char **argv) void test_icu_I18N_transliterator(int argc, char **argv) { + + UErrorCode status = U_ZERO_ERROR; + UParseError parse_error[256]; - /* setting up transliterator */ + const char * rules8cstr = "[:Punctuation:] Any-Remove"; + //const char * rules8cstr = "[:Control:] Any-Remove"; + //const char * rules8cstr = "[:Decimal_Number:] Any-Remove\n"; + //const char * rules8cstr = "[:Final_Punctuation:] Any-Remove"; + //const char * rules8cstr = "Lower; [:^Letter:] Remove"; + //const char * rules8cstr = "[:^Number:] Remove"; + //const char * rules8cstr = "Lower;[[:WhiteSpace:][:Punctuation:]] Remove"; + //const char * rules8cstr = "NFD; [:Nonspacing Mark:] Remove; NFC"; + + + + + const char * src8cstr = "Genesis 1\n" + "The Beginning\n" + "1 In the beginning God created the heavens and the earth.\n" + "2 Now the earth was formless and empty, darkness was over " + "the surface of the deep, and the Spirit of God was hovering " + "over the waters.\n" + "3 And God said, Let there be light, and there was light.\n"; -#if 0 - UErrorCode status = U_ZERO_ERROR; - UParseError parse_error[256]; - int32_t id_cap = 256; - UChar id[256]; - id[0] = 0; + struct icu_buf_utf16 * rules16 = icu_buf_utf16_create(0); + struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0); + struct icu_buf_utf16 * dest16 = icu_buf_utf16_create(0); + struct icu_buf_utf8 * dest8 = icu_buf_utf8_create(0); - trans = utrans_openU(id, id_len, UTRANS_FORWARD, - 0, 0, parse_error, &status); - - + icu_utf16_from_utf8_cstr(rules16, rules8cstr, &status); + icu_check_status(status); + + icu_utf16_from_utf8_cstr(src16, src8cstr, &status); + icu_check_status(status); + + UTransliterator * trans + = utrans_openU(rules16->utf16, rules16->utf16_len, + UTRANS_FORWARD, + 0, 0, + parse_error, &status); + + //= utrans_openU(0, 0, UTRANS_FORWARD, + // rules16->utf16, rules16->utf16_len, + // parse_error, &status); + + icu_check_status(status); if(U_FAILURE(status)) { - printf("Parse Error: line %d offset %d \n", - parse_error->line, parse_error->offset); + printf("Parse Error: \n line %d offset %d \n '%s'\n", + parse_error->line, parse_error->offset, + rules8cstr); } + + utrans_transUChars (trans, src16->utf16, &(src16->utf16_len), + src16->utf16_cap, + 0, &(src16->utf16_len), &status); + + icu_utf16_to_utf8(dest8, src16, &status); + icu_check_status(status); + + printf("Transliterator:\n%s\n", dest8->utf8); + + + utrans_close (trans); + icu_buf_utf16_destroy(rules16); + icu_buf_utf16_destroy(src16); + icu_buf_utf16_destroy(dest16); + icu_buf_utf8_destroy(dest8); + + +#if 0 + icu_check_status(status); @@ -394,6 +447,8 @@ void test_icu_I18N_transliterator(int argc, char **argv) } +// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 + int test_icu_tokenizer(const char * locale, char action, const char * src8cstr, int count) { @@ -523,67 +578,6 @@ int main(int argc, char **argv) -// CRAP to follow -#if 0 - -// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 - -void test_icu_I18N_casemap_failures(int argc, char **argv) -{ - - size_t buf_cap = 128; - char buf[buf_cap]; - size_t dest8_len = 0; - NMEM nmem = nmem_create(); - char * dest8 = 0; - - const char * src8 = "A ReD fOx hunTS sQUirriLs"; - //size_t src8_len = strlen(src8); - - //printf("original string: '%s' (%d)\n", src8, (int) src8_len); - - // some calling error needs investigation - dest8 = icu_casemap(nmem, buf, buf_cap, &dest8_len, - src8, "en", 't'); - YAZ_CHECK(0 == dest8_len); - //printf("icu_casemap 'en:t' '%s' (%d)\n", dest8, (int) dest8_len); - - - // attention: does not fail even if no locale 'xy_zz' defined - // it seems to default to english locale - dest8 = icu_casemap(nmem, buf, buf_cap, &dest8_len, - src8, "zz_abc", 'l'); - YAZ_CHECK(dest8_len); - //printf("icu_casemap 'zz:l' '%s' (%d)\n", dest8, (int) dest8_len); - - - // shall fail - no buf buffer defined - dest8 = icu_casemap(nmem, 0, buf_cap, &dest8_len, - src8, "en", 'l'); - YAZ_CHECK(0 == dest8_len); - //printf("icu_casemap 'en:l' '%s' (%d)\n", dest8, (int) dest8_len); - - // shall fail - no buf_cap defined - dest8 = icu_casemap(nmem, buf, 0, &dest8_len, - src8, "en", 'l'); - YAZ_CHECK(0 == dest8_len); - //printf("icu_casemap 'en:l' '%s' (%d)\n", dest8, (int) dest8_len); - - // shall fail - no action 'x' defined - dest8 = icu_casemap(nmem, buf, buf_cap, &dest8_len, - src8, "en", 'x'); - YAZ_CHECK(0 == dest8_len); - //printf("icu_casemap 'en:x' '%s' (%d)\n", dest8, (int) dest8_len); - - nmem_destroy(nmem); -} - - - -#endif - - - /* * Local variables: * c-basic-offset: 4