-/* $Id: icu_I18N.h,v 1.1 2007-10-22 12:21:39 adam Exp $
- Copyright (c) 2006-2007, Index Data.
-
- This file is part of Pazpar2.
-
- Pazpar2 is free software; you can redistribute it and/or modify it under
- the terms of the GNU General Public License as published by the Free
- Software Foundation; either version 2, or (at your option) any later
- version.
-
- Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
- WARRANTY; without even the implied warranty of MERCHANTABILITY or
- FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- for more details.
-
- You should have received a copy of the GNU General Public License
- along with Pazpar2; see the file LICENSE. If not, write to the
- Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
- 02111-1307, USA.
-*/
+/*
+ * Copyright (c) 1995-2007, Index Data
+ * All rights reserved.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Index Data nor the names of its contributors
+ * may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
#ifndef ICU_I18NL_H
#define ICU_I18NL_H
#include <unicode/utypes.h> /* Basic ICU data types */
#include <unicode/uchar.h> /* char names */
-//#include <unicode/ustdio.h>
#include <unicode/ucol.h>
-//#include <unicode/ucnv.h> /* C Converter API */
-//#include <unicode/ustring.h> /* some more string fcns*/
-//#include <unicode/uloc.h>
#include <unicode/ubrk.h>
-//#include <unicode/unistr.h>
#include <unicode/utrans.h>
-// declared structs and functions
+/* declared structs and functions */
int icu_check_status (UErrorCode status);
struct icu_buf_utf8
{
- uint8_t * utf8;
- int32_t utf8_len;
- int32_t utf8_cap;
+ uint8_t * utf8;
+ int32_t utf8_len;
+ int32_t utf8_cap;
};
struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity);
struct icu_casemap
{
- char locale[16];
- char action;
+ char locale[16];
+ char action;
};
struct icu_casemap * icu_casemap_create(const char *locale, char action,
struct icu_tokenizer
{
- char locale[16];
- char action;
- UBreakIterator* bi;
- struct icu_buf_utf16 * buf16;
- int32_t token_count;
- int32_t token_id;
- int32_t token_start;
- int32_t token_end;
- // keep always invariant
- // 0 <= token_start
- // <= token_end
- // <= buf16->utf16_len
- // and invariant
- // 0 <= token_id <= token_count
+ char locale[16];
+ char action;
+ UBreakIterator* bi;
+ struct icu_buf_utf16 * buf16;
+ int32_t token_count;
+ int32_t token_id;
+ int32_t token_start;
+ int32_t token_end;
+/*
+ keep always invariant
+ 0 <= token_start
+ <= token_end
+ <= buf16->utf16_len
+ and invariant
+ 0 <= token_id <= token_count
+*/
};
struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
struct icu_normalizer
{
- char action;
- struct icu_buf_utf16 * rules16;
- UParseError parse_error[256];
- UTransliterator * trans;
+ char action;
+ struct icu_buf_utf16 * rules16;
+ UParseError parse_error[256];
+ UTransliterator * trans;
};
struct icu_normalizer * icu_normalizer_create(const char *rules, char action,
struct icu_buf_utf16 * src16,
UErrorCode *status);
-
-#if 0
-struct icu_token
-{
- int32_t token_id;
- uint8_t * display8;
- uint8_t * norm8;
- uint8_t * sort8;
-}
-#endif
-
-
enum icu_chain_step_type {
- ICU_chain_step_type_none, //
- ICU_chain_step_type_display, // convert to utf8 display format
- ICU_chain_step_type_index, // convert to utf8 index format
- ICU_chain_step_type_sortkey, // convert to utf8 sortkey format
- ICU_chain_step_type_casemap, // apply utf16 charmap
- ICU_chain_step_type_normalize, // apply utf16 normalization
- ICU_chain_step_type_tokenize // apply utf16 tokenization
+ ICU_chain_step_type_none,
+ ICU_chain_step_type_display, /* convert to utf8 display format */
+ ICU_chain_step_type_index, /* convert to utf8 index format */
+ ICU_chain_step_type_sortkey, /* convert to utf8 sortkey format */
+ ICU_chain_step_type_casemap, /* apply utf16 charmap */
+ ICU_chain_step_type_normalize, /* apply utf16 normalization */
+ ICU_chain_step_type_tokenize /* apply utf16 tokenization */
};
struct icu_chain_step
{
- // type and action object
- enum icu_chain_step_type type;
- union {
- struct icu_casemap * casemap;
- struct icu_normalizer * normalizer;
- struct icu_tokenizer * tokenizer;
- } u;
- // temprary post-action utf16 buffer
- struct icu_buf_utf16 * buf16;
- struct icu_chain_step * previous;
- int more_tokens;
- int need_new_token;
+ /* type and action object */
+ enum icu_chain_step_type type;
+ union {
+ struct icu_casemap * casemap;
+ struct icu_normalizer * normalizer;
+ struct icu_tokenizer * tokenizer;
+ } u;
+ /* temporary post-action utf16 buffer */
+ struct icu_buf_utf16 * buf16;
+ struct icu_chain_step * previous;
+ int more_tokens;
+ int need_new_token;
};
struct icu_chain
{
- uint8_t identifier[128];
- uint8_t locale[16];
-
- // number of tokens returned so far
- int32_t token_count;
-
- // utf8 output buffers
- struct icu_buf_utf8 * display8;
- struct icu_buf_utf8 * norm8;
- struct icu_buf_utf8 * sort8;
-
- // utf16 source buffer
- struct icu_buf_utf16 * src16;
-
- // linked list of chain steps
- struct icu_chain_step * steps;
+ uint8_t identifier[128];
+ uint8_t locale[16];
+
+ /* number of tokens returned so far */
+ int32_t token_count;
+
+ /* utf8 output buffers */
+ struct icu_buf_utf8 * display8;
+ struct icu_buf_utf8 * norm8;
+ struct icu_buf_utf8 * sort8;
+
+ /* utf16 source buffer */
+ struct icu_buf_utf16 * src16;
+
+ /* linked list of chain steps */
+ struct icu_chain_step * steps;
};
struct icu_chain * icu_chain_create(const uint8_t * identifier,
const char * icu_chain_get_sort(struct icu_chain * chain);
+#endif /* ICU_I18NL_H */
-
-
-
-#endif // ICU_I18NL_H
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ * vim: shiftwidth=4 tabstop=8 expandtab
+ */
-/* $Id: icu_I18N.c,v 1.1 2007-10-22 12:21:39 adam Exp $
- Copyright (c) 2006-2007, Index Data.
-
- This file is part of Pazpar2.
-
- Pazpar2 is free software; you can redistribute it and/or modify it under
- the terms of the GNU General Public License as published by the Free
- Software Foundation; either version 2, or (at your option) any later
- version.
-
- Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
- WARRANTY; without even the implied warranty of MERCHANTABILITY or
- FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- for more details.
-
- You should have received a copy of the GNU General Public License
- along with Pazpar2; see the file LICENSE. If not, write to the
- Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
- 02111-1307, USA.
-*/
+/*
+ * Copyright (C) 1995-2007, Index Data ApS
+ * See the file LICENSE for details.
+ *
+ * $Id: icu_I18N.c,v 1.2 2007-10-22 17:32:07 adam Exp $
+ */
#if HAVE_CONFIG_H
-#include "cconfig.h"
+#include "config.h"
#endif
#define USE_TIMING 0
#endif
-#ifdef HAVE_ICU
+#if HAVE_ICU
#include <yaz/icu_I18N.h>
#include <yaz/log.h>
#include <unicode/uchar.h> /* char names */
-//#include <unicode/ustdio.h>
-//#include <unicode/utypes.h> /* Basic ICU data types */
#include <unicode/ucol.h>
-//#include <unicode/ucnv.h> /* C Converter API */
-//#include <unicode/uloc.h>
-//#include <unicode/ubrk.h>
-/* #include <unicode/unistr.h> */
-
-
int icu_check_status (UErrorCode status)
buf16->utf16_cap = capacity;
}
return buf16;
-};
+}
struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
size_t capacity)
}
return buf16;
-};
+}
struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
dest16->utf16_len = src16->utf16_len;
return dest16;
-};
+}
void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16)
free(buf16->utf16);
free(buf16);
}
-};
+}
buf8->utf8_cap = capacity;
}
return buf8;
-};
+}
}
return buf8;
-};
+}
struct icu_buf_utf8 * icu_buf_utf8_copy(struct icu_buf_utf8 * dest8,
strncpy((char*) dest8->utf8, (char*) src8->utf8, src8->utf8_len);
return dest8;
-};
+}
const char *icu_buf_utf8_to_cstr(struct icu_buf_utf8 *src8)
free(buf8->utf8);
free(buf8);
}
-};
+}
&utf16_len,
(const char *) src8->utf8, src8->utf8_len, status);
- // check for buffer overflow, resize and retry
- if (*status == U_BUFFER_OVERFLOW_ERROR
- //|| dest16->utf16_len > dest16->utf16_cap
- ){
+ /* check for buffer overflow, resize and retry */
+ if (*status == U_BUFFER_OVERFLOW_ERROR)
+ {
icu_buf_utf16_resize(dest16, utf16_len * 2);
*status = U_ZERO_ERROR;
u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
(const char *) src8->utf8, src8->utf8_len, status);
}
- //if (*status != U_BUFFER_OVERFLOW_ERROR
if (U_SUCCESS(*status)
&& utf16_len <= dest16->utf16_cap)
dest16->utf16_len = utf16_len;
}
return *status;
-};
+}
&utf16_len,
src8cstr, src8cstr_len, status);
- // check for buffer overflow, resize and retry
- if (*status == U_BUFFER_OVERFLOW_ERROR
- //|| dest16->utf16_len > dest16->utf16_cap
- ){
+ /* check for buffer overflow, resize and retry */
+ if (*status == U_BUFFER_OVERFLOW_ERROR)
+ {
icu_buf_utf16_resize(dest16, utf16_len * 2);
*status = U_ZERO_ERROR;
u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
src8cstr, src8cstr_len, status);
}
- // if (*status != U_BUFFER_OVERFLOW_ERROR
if (U_SUCCESS(*status)
&& utf16_len <= dest16->utf16_cap)
dest16->utf16_len = utf16_len;
}
return *status;
-};
+}
&utf8_len,
src16->utf16, src16->utf16_len, status);
- // check for buffer overflow, resize and retry
- if (*status == U_BUFFER_OVERFLOW_ERROR
- //|| dest8->utf8_len > dest8->utf8_cap
- ){
+ /* check for buffer overflow, resize and retry */
+ if (*status == U_BUFFER_OVERFLOW_ERROR)
+ {
icu_buf_utf8_resize(dest8, utf8_len * 2);
*status = U_ZERO_ERROR;
u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
}
- //if (*status != U_BUFFER_OVERFLOW_ERROR
if (U_SUCCESS(*status)
&& utf8_len <= dest8->utf8_cap)
dest8->utf8_len = utf8_len;
}
return *status;
-};
+}
}
return casemap;
-};
+}
void icu_casemap_destroy(struct icu_casemap * casemap)
{
if (casemap)
free(casemap);
-};
+}
int icu_casemap_casemap(struct icu_casemap * casemap,
return icu_utf16_casemap(dest16, src16,
casemap->locale, casemap->action, status);
-};
+}
int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
break;
}
- // check for buffer overflow, resize and retry
+ /* check for buffer overflow, resize and retry */
if (*status == U_BUFFER_OVERFLOW_ERROR
- && dest16 != src16 // do not resize if in-place conversion
- //|| dest16_len > dest16->utf16_cap
+ && dest16 != src16 /* do not resize if in-place conversion */
){
icu_buf_utf16_resize(dest16, dest16_len * 2);
*status = U_ZERO_ERROR;
}
return *status;
-};
+}
sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
dest8->utf8, dest8->utf8_cap);
- // check for buffer overflow, resize and retry
+ /* check for buffer overflow, resize and retry */
if (sortkey_len > dest8->utf8_cap) {
icu_buf_utf8_resize(dest8, sortkey_len * 2);
sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
}
return sortkey_len;
-};
+}
break;
}
- // ICU error stuff is a very funny business
+ /* ICU error stuff is a very funny business */
if (U_SUCCESS(*status))
return tokenizer;
- // freeing if failed
+ /* freeing if failed */
icu_tokenizer_destroy(tokenizer);
return 0;
-};
+}
void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer)
{
ubrk_close(tokenizer->bi);
free(tokenizer);
}
-};
+}
int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
struct icu_buf_utf16 * src16,
|| !tokenizer->buf16 || !tokenizer->buf16->utf16_len)
return 0;
- // never change tokenizer->buf16 and keep always invariant
- // 0 <= tokenizer->token_start
- // <= tokenizer->token_end
- // <= tokenizer->buf16->utf16_len
- // returns length of token
+ /*
+ never change tokenizer->buf16 and keep always invariant
+ 0 <= tokenizer->token_start
+ <= tokenizer->token_end
+ <= tokenizer->buf16->utf16_len
+ returns length of token
+ */
- if (0 == tokenizer->token_end) // first call
+ if (0 == tokenizer->token_end) /* first call */
tkn_start = ubrk_first(tokenizer->bi);
- else //successive calls
+ else /* successive calls */
tkn_start = tokenizer->token_end;
- // get next position
+ /* get next position */
tkn_end = ubrk_next(tokenizer->bi);
- // repairing invariant at end of ubrk, which is UBRK_DONE = -1
+ /* repairing invariant at end of ubrk, which is UBRK_DONE = -1 */
if (UBRK_DONE == tkn_end)
tkn_end = tokenizer->buf16->utf16_len;
- // copy out if everything is well
+ /* copy out if everything is well */
if(U_FAILURE(*status))
return 0;
- // everything OK, now update internal state
+ /* everything OK, now update internal state */
tkn_len = tkn_end - tkn_start;
if (0 < tkn_len){
tokenizer->token_end = tkn_end;
- // copying into token buffer if it exists
+ /* copying into token buffer if it exists */
if (tkn16){
if (tkn16->utf16_cap < tkn_len)
icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2);
int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer)
{
return tokenizer->token_id;
-};
+}
int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer)
{
return tokenizer->token_start;
-};
+}
int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer)
{
return tokenizer->token_end;
-};
+}
int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer)
{
return (tokenizer->token_end - tokenizer->token_start);
-};
+}
int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer)
{
return tokenizer->token_count;
-};
+}
UTRANS_FORWARD,
0, 0,
normalizer->parse_error, status);
- // yaz_log(YLOG_LOG, "utrans_open %p", normalizer->trans);
break;
case 'r':
normalizer->trans
UTRANS_REVERSE ,
0, 0,
normalizer->parse_error, status);
- // yaz_log(YLOG_LOG, "utrans_open %p", normalizer->trans);
break;
default:
*status = U_UNSUPPORTED_ERROR;
if (U_SUCCESS(*status))
return normalizer;
- // freeing if failed
+ /* freeing if failed */
icu_normalizer_destroy(normalizer);
return 0;
-};
+}
void icu_normalizer_destroy(struct icu_normalizer * normalizer){
icu_buf_utf16_destroy(normalizer->rules16);
if (normalizer->trans)
{
- // yaz_log(YLOG_LOG, "utrans_close %p", normalizer->trans);
utrans_close(normalizer->trans);
}
free(normalizer);
}
-};
+}
step->buf16 = buf16;
- // create auxilary objects
+ /* create auxiliary objects */
switch(step->type) {
case ICU_chain_step_type_display:
break;
}
return step;
-};
+}
void icu_chain_step_destroy(struct icu_chain_step * step){
break;
}
free(step);
-};
+}
chain->steps = 0;
return chain;
-};
+}
void icu_chain_destroy(struct icu_chain * chain)
icu_chain_step_destroy(chain->steps);
free(chain);
}
-};
+}
|| strcmp((const char *) xml_node->name, "icu_chain"))
return 0;
-
- xmlChar *xml_id = xmlGetProp(xml_node, (xmlChar *) "id");
- xmlChar *xml_locale = xmlGetProp(xml_node, (xmlChar *) "locale");
-
- if (!xml_id || !strlen((const char *) xml_id)
- || !xml_locale || !strlen((const char *) xml_locale))
- return 0;
- chain = icu_chain_create((const uint8_t *) xml_id,
- (const uint8_t *) xml_locale);
-
- xmlFree(xml_id);
- xmlFree(xml_locale);
+ {
+ xmlChar *xml_id = xmlGetProp(xml_node, (xmlChar *) "id");
+ xmlChar *xml_locale = xmlGetProp(xml_node, (xmlChar *) "locale");
+
+ if (!xml_id || !strlen((const char *) xml_id)
+ || !xml_locale || !strlen((const char *) xml_locale))
+ return 0;
+
+ chain = icu_chain_create((const uint8_t *) xml_id,
+ (const uint8_t *) xml_locale);
+
+ xmlFree(xml_id);
+ xmlFree(xml_locale);
+ }
if (!chain)
return 0;
for (node = xml_node->children; node; node = node->next)
{
+ xmlChar *xml_rule;
+ struct icu_chain_step * step = 0;
+
if (node->type != XML_ELEMENT_NODE)
continue;
- xmlChar *xml_rule = xmlGetProp(node, (xmlChar *) "rule");
- struct icu_chain_step * step = 0;
+ xml_rule = xmlGetProp(node, (xmlChar *) "rule");
if (!strcmp((const char *) node->name,
(const char *) "casemap")){
}
return chain;
-};
+}
if (!chain || !type || !rule)
return 0;
- // assign utf16 src buffers as needed
+ /* assign utf16 src buffers as needed */
if (chain->steps && chain->steps->buf16)
src16 = chain->steps->buf16;
else if (chain->src16)
return 0;
- // create utf16 destination buffers as needed, or
+ /* create utf16 destination buffers as needed, or */
switch(type) {
case ICU_chain_step_type_display:
buf16 = src16;
break;
}
- // create actual chain step with this buffer
+ /* create actual chain step with this buffer */
step = icu_chain_step_create(chain, type, rule, buf16, status);
step->previous = chain->steps;
chain->steps = step;
return step;
-};
+}
int icu_chain_step_next_token(struct icu_chain * chain,
{
struct icu_buf_utf16 * src16 = 0;
- //printf("icu_chain_step_next_token %d\n", (int) step);
-
if (!chain || !chain->src16 || !step || !step->more_tokens)
return 0;
- // assign utf16 src buffers as neeed, advance in previous steps
- // tokens until non-zero token met, and setting stop condition
+ /* assign utf16 src buffers as needed, advance in previous steps
+ tokens until non-zero token met, and setting stop condition
+ */
if (step->previous){
src16 = step->previous->buf16;
if (step->need_new_token)
- //while (step->more_tokens && !src16->utf16_len)
- step->more_tokens
- = icu_chain_step_next_token(chain, step->previous, status);
+ step->more_tokens
+ = icu_chain_step_next_token(chain, step->previous, status);
}
- else { // first step can only work once on chain->src16 input buffer
+ else { /* first step can only work once on chain->src16 input buffer */
src16 = chain->src16;
step->more_tokens = 1;
}
- // stop if nothing to process
- // i.e new token source was not properly assigned
- if (!step->more_tokens || !src16) // || !src16->utf16_len
+ /* stop if nothing to process
+ i.e new token source was not properly assigned
+ */
+ if (!step->more_tokens || !src16)
return 0;
- //printf("icu_chain_step_next_token %d working\n", (int) step);
-
-
- // perform the work, eventually put this steps output in
- // step->buf16 or the chains UTF8 output buffers
+ /* perform the work, eventually put this step's output in
+ step->buf16 or the chain's UTF8 output buffers */
switch(step->type) {
case ICU_chain_step_type_display:
icu_utf16_to_utf8(chain->display8, src16, status);
step->buf16, src16, status);
break;
case ICU_chain_step_type_tokenize:
- // attach to new src16 token only first time during splitting
+ /* attach to new src16 token only first time during splitting */
if (step->need_new_token){
icu_tokenizer_attach(step->u.tokenizer, src16, status);
step->need_new_token = 0;
}
- // splitting one src16 token into multiple buf16 tokens
+ /* splitting one src16 token into multiple buf16 tokens */
step->more_tokens
= icu_tokenizer_next_token(step->u.tokenizer,
step->buf16, status);
- // make sure to get new previous token if this one had been used up
+ /* make sure to get new previous token if this one had been used up */
if (step->previous && !step->more_tokens){
if (icu_chain_step_next_token(chain, step->previous, status)){
icu_tokenizer_attach(step->u.tokenizer, src16, status);
- // stop further token processing if last step and
- // new tokens are needed from previous (non-existing) step
+ /* stop further token processing if last step and
+ new tokens are needed from previous (non-existing) step
+ */
if (!step->previous && step->need_new_token)
step->more_tokens = 0;
- //printf("%d %d %d\n",
- // step->more_tokens, src16->utf16_len, step->buf16->utf16_len);
-
-
if (U_FAILURE(*status))
return 0;
return 1;
-};
+}
stp = chain->steps;
- // clear token count
+ /* clear token count */
chain->token_count = 0;
- // clear all steps stop states
-
+ /* clear all steps stop states */
while (stp){
stp->more_tokens = 1;
stp->need_new_token = 1;
stp = stp->previous;
}
- // finally convert UTF8 to UTF16 string
+ /* finally convert UTF8 to UTF16 string */
icu_utf16_from_utf8_cstr(chain->src16, src8cstr, status);
if (U_FAILURE(*status))
return 0;
return 1;
-};
+}
}
return 0;
-};
+}
int icu_chain_get_token_count(struct icu_chain * chain)
{
return 0;
return chain->token_count;
-};
+}
return icu_buf_utf8_to_cstr(chain->display8);
return 0;
-};
+}
const char * icu_chain_get_norm(struct icu_chain * chain)
{
return icu_buf_utf8_to_cstr(chain->norm8);
return 0;
-};
+}
const char * icu_chain_get_sort(struct icu_chain * chain)
{
return icu_buf_utf8_to_cstr(chain->sort8);
return 0;
-};
-
-
+}
-#endif // HAVE_ICU
+#endif /* HAVE_ICU */
-/* $Id: tst_icu_I18N.c,v 1.1 2007-10-22 12:21:39 adam Exp $
+/* $Id: tst_icu_I18N.c,v 1.2 2007-10-22 17:32:07 adam Exp $
Copyright (c) 2006-2007, Index Data.
This file is part of Pazpar2.
02111-1307, USA.
*/
-// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
+/* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */
#if HAVE_CONFIG_H
-#include "cconfig.h"
+#include "config.h"
#endif
#define USE_TIMING 0
#include <yaz/test.h>
-
-
-#ifdef HAVE_ICU
+#if HAVE_ICU
#include <yaz/icu_I18N.h>
#include <string.h>
#include <stdlib.h>
-//#include <unicode/ustring.h>
-// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
+/* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */
#define MAX_KEY_SIZE 256
struct icu_termmap
{
- uint8_t sort_key[MAX_KEY_SIZE]; // standard C string '\0' terminated
- char disp_term[MAX_KEY_SIZE]; // standard C utf-8 string
+ uint8_t sort_key[MAX_KEY_SIZE]; /* standard C string '\0' terminated */
+ char disp_term[MAX_KEY_SIZE]; /* standard C utf-8 string */
};
int src8cstr_len = strlen(src8cstr);
int chk8cstr_len = strlen(chk8cstr);
- // converting to UTF16
+ /* converting to UTF16 */
icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
- // perform case mapping
+ /* perform case mapping */
icu_utf16_casemap(dest16, src16, locale, action, &status);
- // converting to UTF8
+ /* converting to UTF8 */
icu_utf16_to_utf8(dest8, dest16, &status);
- // determine success
+ /* determine success */
if (dest8->utf8
&& (dest8->utf8_len == strlen(chk8cstr))
&& !strcmp(chk8cstr, (const char *) dest8->utf8))
else
success = 0;
- // report failures
+ /* report failures */
if (!success){
printf("\nERROR\n");
printf("original string: '%s' (%d)\n", src8cstr, src8cstr_len);
printf("expected string: '%s' (%d)\n", chk8cstr, chk8cstr_len);
}
- // clean the buffers
+ /* clean the buffers */
icu_buf_utf8_destroy(src8);
icu_buf_utf8_destroy(dest8);
icu_buf_utf16_destroy(src16);
-// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
+/* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */
void test_icu_I18N_casemap(int argc, char **argv)
{
- // Locale 'en'
+ /* Locale 'en' */
- // sucessful tests
+ /* successful tests */
YAZ_CHECK(test_icu_casemap("en", 'l',
"A ReD fOx hunTS sQUirriLs",
"a red fox hunts squirrils"));
"A Red Fox Hunts Squirrils"));
- // Locale 'da'
+ /* Locale 'da' */
- // sucess expected
+ /* success expected */
YAZ_CHECK(test_icu_casemap("da", 'l',
"åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
"åh æble, øs fløde i åen efter blåbærgrøden"));
"åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
"Åh Æble, Øs Fløde I Åen Efter Blåbærgrøden"));
- // Locale 'de'
+ /* Locale 'de' */
- // sucess expected
+ /* success expected */
YAZ_CHECK(test_icu_casemap("de", 'l',
"zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
"zwölf ärgerliche würste rollen über die straße"));
}
-// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
+/* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */
int test_icu_sortmap(const char * locale, int src_list_len,
const char ** src_list, const char ** chk_list)
if(U_FAILURE(status))
return 0;
- // assigning display terms and sort keys using buf 8 and buf16
+ /* assigning display terms and sort keys using buf 8 and buf16 */
for( i = 0; i < src_list_len; i++)
{
list[i] = (struct icu_termmap *) malloc(sizeof(struct icu_termmap));
- // copy display term
+ /* copy display term */
strcpy(list[i]->disp_term, src_list[i]);
- // transforming to UTF16
+ /* transforming to UTF16 */
icu_utf16_from_utf8_cstr(buf16, list[i]->disp_term, &status);
icu_check_status(status);
- // computing sortkeys
+ /* computing sortkeys */
icu_sortkey8_from_utf16(coll, buf8, buf16, &status);
icu_check_status(status);
- // assigning sortkeys
+ /* assigning sortkeys */
memcpy(list[i]->sort_key, buf8->utf8, buf8->utf8_len);
- //strncpy(list[i]->sort_key, buf8->utf8, buf8->utf8_len);
- //strcpy((char *) list[i]->sort_key, (const char *) buf8->utf8);
}
- // do the sorting
+ /* do the sorting */
qsort(list, src_list_len,
sizeof(struct icu_termmap *), icu_termmap_cmp);
- // checking correct sorting
+ /* checking correct sorting */
for (i = 0; i < src_list_len; i++){
if (0 != strcmp(list[i]->disp_term, chk_list[i])){
success = 0;
printf("ICU sort: '%s' : ", locale);
for (i = 0; i < src_list_len; i++) {
printf(" '%s'", list[i]->disp_term);
- //printf("(%d|%d)", list[i]->sort_key[0],list[i]->sort_key[1]);
}
printf("\n");
printf("Expected: '%s' : ", locale);
}
-// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
+/* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */
void test_icu_I18N_sortmap(int argc, char **argv)
{
- // sucessful tests
+ /* successful tests */
size_t en_1_len = 6;
const char * en_1_src[6] = {"z", "K", "a", "A", "Z", "k"};
const char * en_1_cck[6] = {"a", "A", "k", "K", "z", "Z"};
YAZ_CHECK(test_icu_sortmap("en_GB", en_1_len, en_1_src, en_1_cck));
YAZ_CHECK(test_icu_sortmap("en_US", en_1_len, en_1_src, en_1_cck));
- // sucessful tests
- size_t da_1_len = 6;
- const char * da_1_src[6] = {"z", "å", "o", "æ", "a", "ø"};
- const char * da_1_cck[6] = {"a", "o", "z", "æ", "ø", "å"};
- YAZ_CHECK(test_icu_sortmap("da", da_1_len, da_1_src, da_1_cck));
- YAZ_CHECK(test_icu_sortmap("da_DK", da_1_len, da_1_src, da_1_cck));
-
- // sucessful tests
- size_t de_1_len = 9;
- const char * de_1_src[9] = {"u", "ä", "o", "t", "s", "ß", "ü", "ö", "a"};
- const char * de_1_cck[9] = {"a","ä", "o", "ö", "s", "ß", "t", "u", "ü"};
- YAZ_CHECK(test_icu_sortmap("de", de_1_len, de_1_src, de_1_cck));
- YAZ_CHECK(test_icu_sortmap("de_AT", de_1_len, de_1_src, de_1_cck));
- YAZ_CHECK(test_icu_sortmap("de_DE", de_1_len, de_1_src, de_1_cck));
+ /* successful tests */
+ {
+ size_t da_1_len = 6;
+ const char * da_1_src[6] = {"z", "å", "o", "æ", "a", "ø"};
+ const char * da_1_cck[6] = {"a", "o", "z", "æ", "ø", "å"};
+ YAZ_CHECK(test_icu_sortmap("da", da_1_len, da_1_src, da_1_cck));
+ YAZ_CHECK(test_icu_sortmap("da_DK", da_1_len, da_1_src, da_1_cck));
+ }
+ /* successful tests */
+ {
+ size_t de_1_len = 9;
+ const char * de_1_src[9] = {"u", "ä", "o", "t", "s", "ß", "ü", "ö", "a"};
+ const char * de_1_cck[9] = {"a","ä", "o", "ö", "s", "ß", "t", "u", "ü"};
+ YAZ_CHECK(test_icu_sortmap("de", de_1_len, de_1_src, de_1_cck));
+ YAZ_CHECK(test_icu_sortmap("de_AT", de_1_len, de_1_src, de_1_cck));
+ YAZ_CHECK(test_icu_sortmap("de_DE", de_1_len, de_1_src, de_1_cck));
+ }
}
-// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
+/* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */
};
-// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
+/* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */
void test_icu_I18N_normalizer(int argc, char **argv)
{
}
-// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
+/* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */
int test_icu_tokenizer(const char * locale, char action,
const char * src8cstr, int count)
struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0);
struct icu_buf_utf16 * tkn16 = icu_buf_utf16_create(0);
struct icu_buf_utf8 * tkn8 = icu_buf_utf8_create(0);
+ struct icu_tokenizer * tokenizer = 0;
- //printf("Input: '%s'\n", src8cstr);
-
- // transforming to UTF16
+ /* transforming to UTF16 */
icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
icu_check_status(status);
- // set up tokenizer
- struct icu_tokenizer * tokenizer
- = icu_tokenizer_create(locale, action, &status);
+ /* set up tokenizer */
+ tokenizer = icu_tokenizer_create(locale, action, &status);
icu_check_status(status);
YAZ_CHECK(tokenizer);
- // attach text buffer to tokenizer
+ /* attach text buffer to tokenizer */
icu_tokenizer_attach(tokenizer, src16, &status);
icu_check_status(status);
YAZ_CHECK(tokenizer->bi);
- // perform work on tokens
- //printf("Tokens: ");
+ /* perform work on tokens */
while(icu_tokenizer_next_token(tokenizer, tkn16, &status)){
icu_check_status(status);
- // converting to UTF8
+ /* converting to UTF8 */
icu_utf16_to_utf8(tkn8, tkn16, &status);
-
- //printf("token %d %d %d %d '%s'\n",
- //
- // icu_tokenizer_token_start(tokenizer),
- // icu_tokenizer_token_end(tokenizer),
- // icu_tokenizer_token_length(tokenizer),
- // tkn8->utf8);
}
if (count != icu_tokenizer_token_count(tokenizer)){
}
-// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
+/* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */
void test_icu_I18N_tokenizer(int argc, char **argv)
{
- const char * da_str
- = "Blåbærtærte. Denne kage stammer fra Finland. "
- "Den er med blåbær, men alle sommerens forskellige bær kan bruges.";
-
- YAZ_CHECK(test_icu_tokenizer("da", 's', da_str, 3));
- YAZ_CHECK(test_icu_tokenizer("dar", 'l', da_str, 17));
- YAZ_CHECK(test_icu_tokenizer("da", 'w', da_str, 37));
- YAZ_CHECK(test_icu_tokenizer("da", 'c', da_str, 110));
+ {
+ const char * da_str
+ = "Blåbærtærte. Denne kage stammer fra Finland. "
+ "Den er med blåbær, men alle sommerens forskellige bær kan bruges.";
+
+ YAZ_CHECK(test_icu_tokenizer("da", 's', da_str, 3));
+ YAZ_CHECK(test_icu_tokenizer("dar", 'l', da_str, 17));
+ YAZ_CHECK(test_icu_tokenizer("da", 'w', da_str, 37));
+ YAZ_CHECK(test_icu_tokenizer("da", 'c', da_str, 110));
+ }
}
const char * en_str
= "O Romeo, Romeo! wherefore art thou\t Romeo?";
- printf("ICU chain:\ninput: '%s'\n", en_str);
-
UErrorCode status = U_ZERO_ERROR;
- //struct icu_chain_step * step = 0;
struct icu_chain * chain = 0;
xmlNode *xml_node = xmlDocGetRootElement(doc);
YAZ_CHECK(xml_node);
+ printf("ICU chain:\ninput: '%s'\n", en_str);
+
chain = icu_chain_xml_config(xml_node, &status);
void test_bug_1140(void)
{
- const char * en_str
- = "O Romeo, Romeo! wherefore art thou\t Romeo?";
-
- printf("ICU chain:\ninput: '%s'\n", en_str);
-
UErrorCode status = U_ZERO_ERROR;
- //struct icu_chain_step * step = 0;
struct icu_chain * chain = 0;
const char * xml_str = "<icu_chain id=\"en:word\" locale=\"en\">"
#endif // HAVE_ICU
-// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
+/* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */
int main(int argc, char **argv)
{
YAZ_CHECK_INIT(argc, argv);
YAZ_CHECK_LOG();
-#ifdef HAVE_ICU
+#if HAVE_ICU
- //test_icu_I18N_casemap_failures(argc, argv);
test_icu_I18N_casemap(argc, argv);
test_icu_I18N_sortmap(argc, argv);
test_icu_I18N_normalizer(argc, argv);
test_icu_I18N_chain(argc, argv);
test_bug_1140();
-#else // HAVE_ICU
+#else /* HAVE_ICU */
printf("ICU unit tests omitted.\n"
"Please install libicu36-dev and icu-doc or similar\n");
YAZ_CHECK(0 == 0);
-#endif // HAVE_ICU
+#endif /* HAVE_ICU */
YAZ_CHECK_TERM;
}
-// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
+/* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */
-/* $Id: yaz-icu.c,v 1.1 2007-10-22 12:21:40 adam Exp $
- Copyright (c) 2006-2007, Index Data.
-
-This file is part of Pazpar2.
-
-Pazpar2 is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free
-Software Foundation; either version 2, or (at your option) any later
-version.
-
-Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
-WARRANTY; without even the implied warranty of MERCHANTABILITY or
-FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received a copy of the GNU General Public License
-along with Pazpar2; see the file LICENSE. If not, write to the
-Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
-02111-1307, USA.
+/*
+ * Copyright (C) 1995-2007, Index Data ApS
+ * See the file LICENSE for details.
+ *
+ * $Id: yaz-icu.c,v 1.2 2007-10-22 17:32:08 adam Exp $
*/
#if HAVE_CONFIG_H
-#include "cconfig.h"
+#include "config.h"
#endif
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
-//#include <yaz/xmalloc.h>
#include <yaz/options.h>
-#ifdef HAVE_ICU
+#if HAVE_ICU
#include <unicode/ucnv.h>
#include <unicode/ustring.h>
|| !config.outfile)
print_option_error(p_config);
-};
+}
/* UConverter *conv; */
static void print_icu_transliterators(const struct config_t *p_config)
{
- int32_t count;
- int32_t i;
-
- count = utrans_countAvailableIDs();
-
int32_t buf_cap = 128;
char buf[buf_cap];
+ int32_t i;
+ int32_t count = utrans_countAvailableIDs();
if (p_config->xmloutput)
fprintf(config.outfile, "<transliterators count=\"%d\">\n", count);
fprintf(config.outfile, "</icu>\n");
exit(0);
-};
+}
"<icu>\n"
"<tokens>\n");
- // read input lines for processing
+ /* read input lines for processing */
while ((line=fgets(linebuf, sizeof(linebuf)-1, config.infile)))
{
success = icu_chain_assign_cstr(config.chain, line, &status);
xmlFreeDoc(doc);
if (line)
free(line);
-};
+}
-#endif // HAVE_ICU
+#endif /* HAVE_ICU */
int main(int argc, char **argv)
{
-#ifdef HAVE_ICU
+#if HAVE_ICU
read_params(argc, argv, &config);
if (config.print && strlen(config.print))
print_info(&config);
-#else // HAVE_ICU
+#else /* HAVE_ICU */
printf("ICU not available on your system.\n"
"Please install libicu36-dev and icu-doc or similar, "
"re-configure and re-compile\n");
-#endif // HAVE_ICU
+#endif /* HAVE_ICU */
return(0);
-};
+}
/*