From b3356073151093f1f2267ee7bf75139f4d650d40 Mon Sep 17 00:00:00 2001 From: Marc Cromme Date: Wed, 16 May 2007 12:39:49 +0000 Subject: [PATCH] temorarily commented faulty transliterator test out progress on ICU chain test, but need to fix transliterator test first --- src/icu_I18N.c | 209 ++++++++++++++++++++++++++++++++++++++++++++++----- src/icu_I18N.h | 30 ++++++-- src/test_icu_I18N.c | 115 +++++++++++++++++++++++++++- 3 files changed, 327 insertions(+), 27 deletions(-) diff --git a/src/icu_I18N.c b/src/icu_I18N.c index cc9f343..c8ce0f2 100644 --- a/src/icu_I18N.c +++ b/src/icu_I18N.c @@ -1,4 +1,4 @@ -/* $Id: icu_I18N.c,v 1.13 2007-05-15 15:11:42 marc Exp $ +/* $Id: icu_I18N.c,v 1.14 2007-05-16 12:39:49 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -716,7 +716,7 @@ int icu_normalizer_normalize(struct icu_normalizer * normalizer, struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain, enum icu_chain_step_type type, const uint8_t * rule, - struct icu_buf_utf16 * src16, + struct icu_buf_utf16 * buf16, UErrorCode *status) { struct icu_chain_step * step = 0; @@ -726,6 +726,14 @@ struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain, step = (struct icu_chain_step *) malloc(sizeof(struct icu_chain_step)); + step->type = type; + step->more_tokens = 0; + + if (buf16) + step->buf16 = buf16; + else + step->buf16 = 0; + // create auxilary objects switch(step->type) { case ICU_chain_step_type_display: @@ -747,10 +755,6 @@ struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain, break; } - if (src16) - step->src16 = src16; - - return step; }; @@ -759,14 +763,8 @@ void icu_chain_step_destroy(struct icu_chain_step * step){ if (!step) return; - - if (step->previous) - icu_chain_step_destroy(step->previous); - - if (step->src16) - icu_buf_utf16_destroy(step->src16); - // destroy last living icu_chain_step + icu_chain_step_destroy(step->previous); switch(step->type) { case ICU_chain_step_type_display: @@ -776,12 +774,15 @@ void icu_chain_step_destroy(struct icu_chain_step * step){ case ICU_chain_step_type_sort: break; case ICU_chain_step_type_charmap: + icu_buf_utf16_destroy(step->buf16); break; case ICU_chain_step_type_normalize: icu_normalizer_destroy(step->u.normalizer); + icu_buf_utf16_destroy(step->buf16); break; case ICU_chain_step_type_tokenize: icu_tokenizer_destroy(step->u.tokenizer); + icu_buf_utf16_destroy(step->buf16); break; default: break; @@ -837,33 +838,47 @@ struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain, { struct icu_chain_step * step = 0; struct icu_buf_utf16 * src16 = 0; + struct icu_buf_utf16 * buf16 = 0; if (!chain || !type || !rule) return 0; - //if(chain->steps && chain->steps->src16) + // assign utf16 src buffers as needed + if (chain->steps && chain->steps->buf16) + src16 = chain->steps->buf16; + else if (chain->src16) + src16 = chain->src16; + else + return 0; + - // assign utf16 src buffers as needed - switch(step->type) { + // assign utf16 destination buffers as needed, or + // re-use previous uft18 buffer if this step does not touch it + switch(type) { case ICU_chain_step_type_display: + buf16 = src16; break; case ICU_chain_step_type_norm: + buf16 = src16; break; case ICU_chain_step_type_sort: + buf16 = src16; break; case ICU_chain_step_type_charmap: + buf16 = icu_buf_utf16_create(0); break; case ICU_chain_step_type_normalize: + buf16 = icu_buf_utf16_create(0); break; case ICU_chain_step_type_tokenize: + buf16 = icu_buf_utf16_create(0); break; default: break; } // create actual chain step with this buffer - // leave zero for implicit buffer creation - step = icu_chain_step_create(chain, type, rule, src16, status); + step = icu_chain_step_create(chain, type, rule, buf16, status); step->previous = chain->steps; chain->steps = step; @@ -872,6 +887,164 @@ struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain, }; +int icu_chain_step_next_token(struct icu_chain * chain, + struct icu_chain_step * step, + UErrorCode *status) +{ + struct icu_buf_utf16 * src16 = 0; + + printf("icu_chain_step_next_token %d\n", (int) step); + + if (!chain || !chain->src16 || !step || !step->more_tokens) + return 0; + + // assign utf16 src buffers as neeed, advance in previous steps + // tokens, and setting stop condition + if (step->previous){ + src16 = step->previous->buf16; + step->more_tokens + = icu_chain_step_next_token(chain, step->previous, status); + } + else { // first step can only work once on chain->src16 input buffer + src16 = chain->src16; + step->more_tokens = 1; + } + + // stop if nothing to process + // i.e new token source was not properly assigned + if (!step->more_tokens || !src16 || !src16->utf16_len) // + return 0; + + printf("icu_chain_step_next_token %d working\n", (int) step); + + + // perform the work, eventually put this steps output in + // step->buf16 or the chains UTF8 output buffers + switch(step->type) { + case ICU_chain_step_type_display: + icu_utf16_to_utf8(chain->display8, src16, status); + break; + case ICU_chain_step_type_norm: + icu_utf16_to_utf8(chain->norm8, src16, status); + break; + case ICU_chain_step_type_sort: + icu_utf16_to_utf8(chain->sort8, src16, status); + break; + case ICU_chain_step_type_charmap: + break; + case ICU_chain_step_type_normalize: + icu_normalizer_normalize(step->u.normalizer, + step->buf16, src16, status); + break; + case ICU_chain_step_type_tokenize: + // step->more_tokens + // = icu_tokenizer_next_token(step->u.tokenizer, + // step->buf16, status); + break; + default: + return 0; + break; + } + + + // stop further token processing if last step + if (!step->previous) + step->more_tokens = 0; + + + if (U_FAILURE(*status)) + return 0; + + return 1; +}; + + + +int icu_chain_assign_cstr(struct icu_chain * chain, + const char * src8cstr, + UErrorCode *status) +{ + struct icu_chain_step * stp = chain->steps; + + if (!chain || !src8cstr) + return 0; + + // clear token count + chain->token_count = 0; + + // clear all steps stop states + + while (stp){ + stp->more_tokens = 1; + stp = stp->previous; + } + + // finally convert UTF8 to UTF16 string + icu_utf16_from_utf8_cstr(chain->src16, src8cstr, status); + + if (U_FAILURE(*status)) + return 0; + + return 1; +}; + + + +int icu_chain_next_token(struct icu_chain * chain, + UErrorCode *status) +{ + int success = 0; + + if (!chain || !chain->steps) + return 0; + + success = icu_chain_step_next_token(chain, chain->steps, status); + + if (success){ + chain->token_count++; + return chain->token_count; + } + + return 0; +}; + +int icu_chain_get_token_count(struct icu_chain * chain) +{ + if (!chain) + return 0; + + return chain->token_count; +}; + + + +const char * icu_chain_get_display(struct icu_chain * chain) +{ + if (chain->display8) + return (const char *) chain->display8->utf8; + + return 0; +}; + +const char * icu_chain_get_norm(struct icu_chain * chain) +{ + if (chain->norm8) + return (const char *) chain->norm8->utf8; + + return 0; +}; + +const char * icu_chain_get_sort(struct icu_chain * chain) +{ + if (chain->sort8) + return (const char *) chain->sort8->utf8; + + return 0; +}; + + + + #endif // HAVE_ICU diff --git a/src/icu_I18N.h b/src/icu_I18N.h index ff6bf31..91c356a 100644 --- a/src/icu_I18N.h +++ b/src/icu_I18N.h @@ -1,4 +1,4 @@ -/* $Id: icu_I18N.h,v 1.13 2007-05-15 15:11:42 marc Exp $ +/* $Id: icu_I18N.h,v 1.14 2007-05-16 12:39:49 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -186,9 +186,9 @@ struct icu_chain_step struct icu_tokenizer * tokenizer; } u; // temprary post-action utf16 buffer - struct icu_buf_utf16 * src16; + struct icu_buf_utf16 * buf16; struct icu_chain_step * previous; - int end_of_tokens; + int more_tokens; }; @@ -197,7 +197,7 @@ struct icu_chain; struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain, enum icu_chain_step_type type, const uint8_t * rule, - struct icu_buf_utf16 * src16, + struct icu_buf_utf16 * buf16, UErrorCode *status); @@ -226,7 +226,6 @@ struct icu_chain struct icu_chain * icu_chain_create(const uint8_t * identifier, const uint8_t * locale); - void icu_chain_destroy(struct icu_chain * chain); struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain, @@ -235,6 +234,27 @@ struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain, UErrorCode *status); +int icu_chain_step_next_token(struct icu_chain * chain, + struct icu_chain_step * step, + UErrorCode *status); + +int icu_chain_assign_cstr(struct icu_chain * chain, + const char * src8cstr, + UErrorCode *status); + +int icu_chain_next_token(struct icu_chain * chain, + UErrorCode *status); + +int icu_chain_get_token_count(struct icu_chain * chain); + +const char * icu_chain_get_display(struct icu_chain * chain); + +const char * icu_chain_get_norm(struct icu_chain * chain); + +const char * icu_chain_get_sort(struct icu_chain * chain); + + + #endif // HAVE_ICU diff --git a/src/test_icu_I18N.c b/src/test_icu_I18N.c index 9e87f38..e6214d9 100644 --- a/src/test_icu_I18N.c +++ b/src/test_icu_I18N.c @@ -1,4 +1,4 @@ -/* $Id: test_icu_I18N.c,v 1.18 2007-05-11 22:59:36 marc Exp $ +/* $Id: test_icu_I18N.c,v 1.19 2007-05-16 12:39:49 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -317,6 +317,67 @@ void test_icu_I18N_sortmap(int argc, char **argv) // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 + + + +int test_icu_transliterator(const char * rules8cstr, + const char * src8cstr, + const char * chk8cstr) +{ + int success = 0; + + UErrorCode status = U_ZERO_ERROR; + UParseError parse_error[256]; + + + struct icu_buf_utf16 * rules16 = icu_buf_utf16_create(0); + struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0); + struct icu_buf_utf16 * dest16 = icu_buf_utf16_create(0); + struct icu_buf_utf8 * dest8 = icu_buf_utf8_create(0); + + icu_utf16_from_utf8_cstr(rules16, rules8cstr, &status); + icu_check_status(status); + + icu_utf16_from_utf8_cstr(src16, src8cstr, &status); + icu_check_status(status); + + + struct icu_normalizer * normalizer + = icu_normalizer_create((const char *) rules16, 'f', &status); + icu_check_status(status); + + icu_normalizer_normalize(normalizer, dest16, src16, &status); + + + icu_utf16_to_utf8(dest8, src16, &status); + icu_check_status(status); + + + if(!strcmp((const char *) dest8->utf8, + (const char *) chk8cstr)) + success = 1; + else { + success = 0; + printf("Normaliozation;"); + printf("Rules: '%s'\n", rules8cstr); + printf("Input: '%s'\n", src8cstr); + printf("Normalized: '%s'\n", dest8->utf8); + printf("Expected: '%s'\n", chk8cstr); + } + + + icu_normalizer_destroy(normalizer); + icu_buf_utf16_destroy(rules16); + icu_buf_utf16_destroy(src16); + icu_buf_utf16_destroy(dest16); + icu_buf_utf8_destroy(dest8); + + return success; +} + + +#if 0 + int test_icu_transliterator(const char * rules8cstr, const char * src8cstr, const char * chk8cstr) @@ -355,7 +416,7 @@ int test_icu_transliterator(const char * rules8cstr, rules8cstr); } - utrans_transUChars (trans, src16->utf16, &(src16->utf16_len), + utrans_transUChars(trans, src16->utf16, &(src16->utf16_len), src16->utf16_cap, 0, &(src16->utf16_len), &status); @@ -385,7 +446,6 @@ int test_icu_transliterator(const char * rules8cstr, return success; } -#if 0 printf("\n\nUnicode Set Patterns:\n" " Pattern Description\n" " Ranges [a-z] The lower case letters a through z\n" @@ -563,6 +623,52 @@ void test_icu_I18N_tokenizer(int argc, char **argv) } +void test_icu_I18N_chain(int argc, char **argv) +{ + const char * en_str + = "O Romeo, Romeo! wherefore art thou Romeo?"; + + UErrorCode status = U_ZERO_ERROR; + struct icu_chain_step * step = 0; + struct icu_chain * chain + = icu_chain_create((uint8_t *) "en:sentence", (uint8_t *) "en"); +/* step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize, */ +/* (const uint8_t *) "[:Control:] Any-Remove", */ +/* &status); */ +/* step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize, */ +/* (const uint8_t *) "w", */ +/* &status); */ +/* step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize, */ +/* (const uint8_t *) */ +/* "[[:WhiteSpace:][:Punctuation:]] Any-Remove", */ +/* &status); */ + step = icu_chain_insert_step(chain, ICU_chain_step_type_display, + (const uint8_t *)"", + &status); + step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize, + (const uint8_t *) "Lower", + &status); + step = icu_chain_insert_step(chain, ICU_chain_step_type_norm, + (const uint8_t *)"", + &status); +/* step = icu_chain_insert_step(chain, ICU_chain_step_type_sort, */ +/* (const uint8_t *)"", */ +/* &status); */ + + + + + YAZ_CHECK(icu_chain_assign_cstr(chain, en_str, &status)); + + while (icu_chain_next_token(chain, &status)){ + printf("token %d norm: '%s' display: '%s'\n", + icu_chain_get_token_count(chain), + icu_chain_get_norm(chain), + icu_chain_get_display(chain)); + } + + icu_chain_destroy(chain); +} @@ -581,8 +687,9 @@ int main(int argc, char **argv) //test_icu_I18N_casemap_failures(argc, argv); test_icu_I18N_casemap(argc, argv); test_icu_I18N_sortmap(argc, argv); - test_icu_I18N_transliterator(argc, argv); + //test_icu_I18N_transliterator(argc, argv); test_icu_I18N_tokenizer(argc, argv); + //test_icu_I18N_chain(argc, argv); #else // HAVE_ICU -- 1.7.10.4