+#define REGEX_CHARS " ^[]()|.*+?!\"$\\" /* set of bytes that add_non_space backslash-escapes and that term_102_icu (together with '-') treats as segment delimiters */
+
+/*
+ * Append the byte range [start, end) to the two output buffers.
+ *
+ * display_term always receives the bytes unchanged.
+ * term_dict receives one of:
+ *   - q_map_match == 0: the same bytes, with a backslash written in front
+ *     of every byte that occurs in REGEX_CHARS;
+ *   - q_map_match != 0: the string map[0] instead of the range, written
+ *     as-is (no escaping is applied to it).
+ * map is only dereferenced when q_map_match is non-zero.
+ */
+static void add_non_space(const char *start, const char *end,
+                          WRBUF term_dict,
+                          WRBUF display_term,
+                          const char **map, int q_map_match)
+{
+    size_t sz = end - start;
+    const char *p;
+
+    wrbuf_write(display_term, start, sz);
+    if (q_map_match)
+    {
+        wrbuf_puts(term_dict, map[0]);
+        return;
+    }
+    for (p = start; p < end; p++)
+    {
+        if (strchr(REGEX_CHARS, *p))
+            wrbuf_putc(term_dict, '\\');
+        wrbuf_putc(term_dict, *p);
+    }
+}
+
+
+/*
+ * ICU sort keys seem to have the layout
+ *     basechars \x01 accents \x01 length
+ * For now only the basechars part is kept (right truncation).  This may
+ * give false hits because the accents part is not used.
+ *
+ * buf is the sort key and i its length in bytes.  The return value is the
+ * length of the leading basechars part; it is 0 when fewer than two \x01
+ * separators are present.
+ */
+static size_t icu_basechars(const char *buf, size_t i)
+{
+    int fields_to_drop = 2; /* first the length field, then the accents */
+
+    while (fields_to_drop > 0)
+    {
+        /* step back to the nearest \x01, or to the start of the key */
+        while (i > 0)
+        {
+            i--;
+            if (buf[i] == '\x01')
+                break;
+        }
+        fields_to_drop--;
+    }
+    return i;
+}
+
+/*
+ * Build a dictionary pattern from *src, letting reserved characters pass
+ * through and running everything between them through the tokenizer of zm.
+ *
+ * Leading spaces are skipped.  The input is then scanned; bytes that are
+ * neither in REGEX_CHARS nor '-' extend the current segment.  When a
+ * reserved byte or the end of the string is reached, a non-empty segment
+ * is handed to zebra_map_tokenize_start() and the first token returned by
+ * zebra_map_tokenize_next() is used: its sort key is cut down with
+ * icu_basechars() and appended to term_dict (escaped, see below), and its
+ * display form is appended to display_term.  The reserved byte itself is
+ * copied unchanged to both buffers.
+ *
+ * If space_split is non-zero the scan stops at the first space after the
+ * leading ones.
+ * NOTE(review): in that case a segment collected since the last reserved
+ * byte is not tokenized before returning - confirm this is intended.
+ *
+ * When at least one token was produced, "\x01\x01.*" is appended to
+ * term_dict.  *src is set to the position where the scan stopped.
+ * Returns the number of tokens written.
+ */
+static int term_102_icu(zebra_map_t zm,
+                        const char **src, WRBUF term_dict, int space_split,
+                        WRBUF display_term)
+{
+    const char *seg_start = *src;
+    const char *cur;
+    int no_terms = 0;
+
+    while (*seg_start == ' ')
+        seg_start++;
+
+    for (cur = seg_start; !(space_split && *cur == ' '); )
+    {
+        if (*cur && !strchr(REGEX_CHARS "-", *cur))
+        {
+            cur++; /* ordinary byte: keep growing the segment */
+            continue;
+        }
+
+        /* end of input or a reserved byte: flush the pending segment */
+        if (cur != seg_start)
+        {
+            const char *res_buf = 0;
+            size_t res_len = 0;
+            const char *display_buf;
+            size_t display_len;
+
+            zebra_map_tokenize_start(zm, seg_start, cur - seg_start);
+
+            if (zebra_map_tokenize_next(zm, &res_buf, &res_len,
+                                        &display_buf, &display_len))
+            {
+                size_t keep = icu_basechars(res_buf, res_len);
+                size_t k;
+
+                for (k = 0; k < keep; k++)
+                {
+                    char ch = res_buf[k];
+
+                    if (strchr(REGEX_CHARS "\\", ch))
+                        wrbuf_putc(term_dict, '\\');
+                    /* NOTE(review): plain char comparison; where char is
+                       signed, bytes >= 0x80 also get the \x01 prefix */
+                    if (ch < 32)
+                        wrbuf_putc(term_dict, '\x01');
+
+                    wrbuf_putc(term_dict, ch);
+                }
+                wrbuf_write(display_term, display_buf, display_len);
+
+                no_terms++;
+            }
+        }
+        if (*cur == '\0')
+            break;
+
+        /* the reserved byte goes through untouched */
+        wrbuf_putc(term_dict, *cur);
+        wrbuf_putc(display_term, *cur);
+
+        cur++;
+        seg_start = cur;
+    }
+
+    if (no_terms)
+        wrbuf_puts(term_dict, "\x01\x01.*");
+    *src = cur;
+    return no_terms;
+}
+
+/*
+ * Tokenize the whole string *src with zm and write the token with index
+ * token_number (counting from 0) to the output buffers.
+ *
+ * Returns 0, without writing anything, when the tokenizer yields fewer
+ * than token_number + 1 tokens.  Otherwise returns 1 after appending the
+ * token's display form to display_term and its sort key to term_dict.
+ * *src itself is not modified.
+ *
+ * mode selects how the sort key is written:
+ *   - 0:       the complete key, no wildcard added;
+ *   - nonzero: the key is first cut down with icu_basechars();
+ *   - bit 2:   ".*" is written before the key;
+ *   - bit 1:   ".*" is written after the key; if mode is nonzero but
+ *              bit 1 is clear, "\x01\x01.*" is written after it instead.
+ * In the key, bytes found in REGEX_CHARS get a backslash in front and
+ * bytes comparing below 32 get a \x01 in front.
+ */
+static int term_100_icu(zebra_map_t zm,
+                        const char **src, WRBUF term_dict,
+                        WRBUF display_term,
+                        int mode,
+                        size_t token_number)
+{
+    const char *res_buf = 0;
+    size_t res_len = 0;
+    const char *display_buf;
+    size_t display_len;
+    size_t idx = 0;
+    size_t k;
+
+    zebra_map_tokenize_start(zm, *src, strlen(*src));
+
+    /* pull tokens until the requested one is in res_buf/display_buf */
+    while (idx <= token_number)
+    {
+        if (!zebra_map_tokenize_next(zm, &res_buf, &res_len,
+                                     &display_buf, &display_len))
+            return 0;
+        idx++;
+    }
+
+    wrbuf_write(display_term, display_buf, display_len);
+
+    if (mode)
+        res_len = icu_basechars(res_buf, res_len);
+    if (mode & 2)
+        wrbuf_puts(term_dict, ".*");
+
+    for (k = 0; k < res_len; k++)
+    {
+        char ch = res_buf[k];
+
+        if (strchr(REGEX_CHARS "\\", ch))
+            wrbuf_putc(term_dict, '\\');
+        /* NOTE(review): plain char comparison; where char is signed,
+           bytes >= 0x80 also get the \x01 prefix */
+        if (ch < 32)
+            wrbuf_putc(term_dict, '\x01');
+
+        wrbuf_putc(term_dict, ch);
+    }
+
+    if (!mode)
+        return 1;
+
+    wrbuf_puts(term_dict, (mode & 1) ? ".*" : "\x01\x01.*");
+    return 1;
+}