+/* ICU sort keys seem to be of the form
+ basechars \x01 accents \x01 length
+ For now we'll just right truncate from basechars . This
+ may give false hits due to accents not being used.
+*/
+static size_t icu_basechars(const char *buf, size_t i)
+{
+ while (i > 0 && buf[--i] != '\x01') /* skip length */
+ ;
+ while (i > 0 && buf[--i] != '\x01') /* skip accents */
+ ;
+ return i; /* only basechars left */
+}
+
+static int term_102_icu(zebra_map_t zm,
+ const char **src, WRBUF term_dict, int space_split,
+ WRBUF display_term)
+{
+ int no_terms = 0;
+ const char *s0 = *src, *s1;
+ while (*s0 == ' ')
+ s0++;
+ s1 = s0;
+ for (;;)
+ {
+ if (*s1 == ' ' && space_split)
+ break;
+ else if (*s1 && !strchr(REGEX_CHARS "-", *s1))
+ s1++;
+ else
+ {
+ /* EOF or regex reserved char */
+ if (s0 != s1)
+ {
+ const char *res_buf = 0;
+ size_t res_len = 0;
+ const char *display_buf;
+ size_t display_len;
+
+ zebra_map_tokenize_start(zm, s0, s1 - s0);
+
+ if (zebra_map_tokenize_next(zm, &res_buf, &res_len,
+ &display_buf, &display_len))
+ {
+ size_t i;
+ res_len = icu_basechars(res_buf, res_len);
+ for (i = 0; i < res_len; i++)
+ {
+ if (strchr(REGEX_CHARS "\\", res_buf[i]))
+ wrbuf_putc(term_dict, '\\');
+ if (res_buf[i] < 32)
+ wrbuf_putc(term_dict, '\x01');
+
+ wrbuf_putc(term_dict, res_buf[i]);
+ }
+ wrbuf_write(display_term, display_buf, display_len);
+
+ no_terms++;
+ }
+ }
+ if (*s1 == '\0')
+ break;
+
+ wrbuf_putc(term_dict, *s1);
+ wrbuf_putc(display_term, *s1);
+
+ s1++;
+ s0 = s1;
+ }
+ }
+ if (no_terms)
+ wrbuf_puts(term_dict, "\x01\x01.*");
+ *src = s1;
+ return no_terms;
+}
+