1 /* This file is part of Metaproxy.
2 Copyright (C) Index Data
4 Metaproxy is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
9 Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 #include "html_parser.hpp"
27 #include <yaz/matchstr.h>
29 #define SPACECHR " \t\r\n\f"
31 // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html
33 namespace metaproxy_1 {
// Private implementation class for HTMLParser. HTMLParser is a friend and
// creates one instance via m_p(new Rep). Only the member-function
// declarations below are visible in this listing; data members and the end
// of the class are not shown.
34 class HTMLParser::Rep {
35 friend class HTMLParser;
// Scans the buffer cp (read until a NUL byte) and reports what it finds
// through the callbacks on event.
37 void parse_str(HTMLParserEvent &event, const char *cp);
// Passes the character range [text_start, text_end) to event.text() when
// the range is non-empty.
38 void tagText(HTMLParserEvent &event,
39 const char *text_start, const char *text_end);
// Handles what follows a tag's name/attributes, starting at cp, and calls
// event.anyTagEnd() for the tag (tag, tag_len). Callers add the return
// value to their cursor, so it is presumably a character count.
40 int tagEnd(HTMLParserEvent &event,
41 const char *tag, int tag_len, const char *cp);
// Reports the attributes of the tag (name, len) via event.attribute().
// NOTE(review): the remainder of this declaration is not visible here.
42 int tagAttrs(HTMLParserEvent &event,
43 const char *name, int len,
// Scans a single attribute at cp; the out-parameters receive the attribute
// name length, the value pointer/length and, via tr, presumably the quote
// character -- confirm against the definition.
45 int skipAttribute(HTMLParserEvent &event,
46 const char *cp, int *attr_len,
47 const char **value, int *val_len, int *tr);
// Short alias for the project namespace; the definitions below are all
// qualified with mp::.
55 namespace mp = metaproxy_1;
// Constructor of the private implementation object.
// NOTE(review): initialisers/body are not visible in this listing.
57 mp::HTMLParser::Rep::Rep()
// Destructor of the private implementation object.
// NOTE(review): body is not visible in this listing.
63 mp::HTMLParser::Rep::~Rep()
// Constructs the parser and allocates its private Rep, stored in m_p.
67 mp::HTMLParser::HTMLParser() : m_p(new Rep)
// Destructor of the public parser object.
// NOTE(review): body is not visible here; how m_p is released depends on
// its declared type in html_parser.hpp -- confirm.
71 mp::HTMLParser::~HTMLParser()
// Sets the verbosity level to v.
// NOTE(review): body is not visible here; presumably v controls the
// "------ ..." printf tracing emitted by the Rep methods -- confirm.
75 void mp::HTMLParser::set_verbose(int v)
// Public entry point: parses the HTML text in str and delivers the results
// to the callbacks on event.
81 void mp::HTMLParser::parse(mp::HTMLParserEvent & event, const char *str) const
// All work is delegated to the private implementation object.
83 m_p->parse_str(event, str);
// Returns 1 when c is an ASCII letter ('a'..'z' or 'A'..'Z'), otherwise 0.
// Uses plain range comparisons, so the result does not depend on the
// current locale and negative values simply yield 0.
static int isAlpha(int c)
{
    const bool lower = (c >= 'a' && c <= 'z');
    const bool upper = (c >= 'A' && c <= 'Z');
    return lower || upper;
}
91 static int skipSpace(const char *cp)
94 while (cp[i] && strchr(SPACECHR, cp[i]))
99 static int skipName(const char *cp)
102 for (i = 0; cp[i] && !strchr(SPACECHR "/><=", cp[i]); i++)
// Scans one attribute (name, optional value) starting at cp. Results are
// handed back through the out-parameters attr_len, value, val_len and tr;
// the caller (tagAttrs) treats the return value as a character count.
// NOTE(review): several statements of this function are not visible in
// this listing; comments below cover only the lines shown.
107 int mp::HTMLParser::Rep::skipAttribute(HTMLParserEvent &event,
108 const char *cp, int *attr_len,
109 const char **value, int *val_len,
// i = length of the attribute name (stops at whitespace or one of "/><=").
113 int i = skipName(cp);
// Early exit returning only the leading-whitespace count.
// NOTE(review): the condition guarding this return is not visible here.
117 return skipSpace(cp);
// Skip whitespace after the name ...
118 i += skipSpace(cp + i);
// ... and again before the value (presumably after an '=' handled on a
// line not shown).
122 i += skipSpace(cp + i);
// Quoted value: starts with a double or single quote.
123 if (cp[i] == '\"' || cp[i] == '\'')
// Advance until the character held in *tr (presumably the opening quote,
// assigned on a line not shown) or the end of the string.
127 while (cp[i] != *tr && cp[i])
// Unquoted value: runs up to whitespace, '>' or the end of the string.
137 while (cp[i] && !strchr(SPACECHR ">", cp[i]))
// Skip whitespace that follows the attribute.
143 i += skipSpace(cp + i);
// Walks the attribute list that follows a tag name and reports each
// attribute through event.attribute(). name/len identify the tag; cp (its
// parameter line is not visible in this listing) is where scanning starts.
// parse_str adds the return value to its cursor, so it is presumably the
// number of characters consumed.
148 int mp::HTMLParser::Rep::tagAttrs(HTMLParserEvent &event,
149 const char *name, int len,
// Skip whitespace between the tag name and the first attribute.
152 int i = skipSpace(cp);
// Keep going until '/', '>', '<' or the end of the string.
153 while (cp[i] && !strchr("/><", cp[i]))
// Start of the current attribute's name.
155 const char *attr_name = cp + i;
// Scan one attribute; attr_len, value, val_len and tr are filled in by
// skipAttribute (their declarations are on lines not shown).
161 int nor = skipAttribute(event, cp+i, &attr_len, &value, &val_len, &tr);
// Trace output: attribute name, then "=value".
// NOTE(review): any condition guarding these printf calls is not visible.
170 printf("------ attr %.*s", attr_len, attr_name);
172 printf("=%.*s", val_len, value);
// Report the attribute; x is set on a line not shown in this listing.
175 event.attribute(name, len, attr_name, attr_len, value, val_len, x);
// Handles the end of a tag. tag/tag_len identify the tag and cp is the
// scan position; finishes by calling event.anyTagEnd(). Callers add the
// return value to their cursor, so it is presumably a character count.
// NOTE(review): the initialisation of i, the computation of close_it and
// the return statement are not visible in this listing.
180 int mp::HTMLParser::Rep::tagEnd(HTMLParserEvent &event,
181 const char *tag, int tag_len, const char *cp)
// Advance i to the first '/', '>', '<' or the end of the string.
185 for (; cp[i] && !strchr("/><", cp[i]); i++)
// Trace output for the i characters scanned above.
190 printf("------ text %.*s\n", i, cp);
// Trace output: "close" when close_it is set, otherwise "end".
201 printf("------ any tag %s %.*s\n",
202 close_it ? "close" : "end", tag_len, tag);
203 event.anyTagEnd(tag, tag_len, close_it);
// Flushes pending text: when [text_start, text_end) is non-empty, passes
// it to event.text() with its length. An empty range produces no callback.
209 void mp::HTMLParser::Rep::tagText(HTMLParserEvent &event,
210 const char *text_start, const char *text_end)
212 if (text_end - text_start) //got text to flush
// Trace output of the text being flushed.
// NOTE(review): any condition guarding this printf is not visible here.
215 printf("------ text %.*s\n",
216 (int) (text_end - text_start), text_start);
217 event.text(text_start, text_end-text_start);
// Main scanner. Walks the buffer cp and, for each piece of markup, flushes
// the preceding text with tagText() and then reports the markup through
// event (openTagStart / closeTag / attribute / anyTagEnd callbacks).
// Four kinds of markup are distinguished below: "!" constructs (comments
// and DTD-style declarations), "?" processing instructions, closing tags
// and opening tags.
// NOTE(review): the outer loop, the cursor advances and most updates of
// the nest flag are on lines not visible in this listing.
221 void mp::HTMLParser::Rep::parse_str(HTMLParserEvent &event, const char *cp)
// Start of the text run that has not yet been reported.
223 const char *text_start = cp;
// "!" construct. Like the "?" and open-tag branches, only taken while the
// nest flag is set.
229 if (nest && *cp == '!')
// Flush text up to cp - 1 (presumably the position of the '<').
232 tagText(event, text_start, cp - 1);
// "!--" starts a comment: look for the terminating "-->".
233 if (cp[1] == '-' && cp[2] == '-')
235 for (i = 3; cp[i]; i++)
236 if (cp[i] == '-' && cp[i+1] == '-' && cp[i+2] == '>')
239 event.openTagStart(cp, i);
// Any other "!" construct: scan to the next '>' or the end of the string.
245 for (i = 1; cp[i] && cp[i] != '>'; i++)
247 event.openTagStart(cp, i);
250 printf("------ dtd %.*s\n", i, cp);
251 i += tagEnd(event, cp, i, cp + i);
// "?" construct (traced as "pi"): scan to the next '>' or end of string.
255 else if (nest && *cp == '?')
258 tagText(event, text_start, cp - 1);
259 for (i = 1; cp[i] && cp[i] != '>'; i++)
261 event.openTagStart(cp, i);
263 printf("------ pi %.*s\n", i, cp);
264 i += tagEnd(event, cp, i, cp + i);
// Closing tag: '/' followed by a letter. This branch does not test nest.
268 else if (*cp == '/' && isAlpha(cp[1]))
// A closing tag named "script" (case-insensitive) that is followed by
// optional whitespace and '>' sets nest again.
276 if (i == 6 && !yaz_strncasecmp(cp, "script", i))
278 int ws = skipSpace(cp + 6);
279 if (cp[ws + 6] == '>')
280 nest = true; /* really terminated */
// Flush text up to cp - 2 (presumably the '<' before the '/').
285 tagText(event, text_start, cp - 2);
286 event.closeTag(cp, i);
288 printf("------ tag close %.*s\n", i, cp);
289 i += tagEnd(event, cp, i, cp + i);
// Opening tag: a letter directly at cp.
293 else if (nest && isAlpha(*cp))
296 tagText(event, text_start, cp - 1);
298 event.openTagStart(cp, i);
300 printf("------ tag open %.*s\n", i, cp);
// j accumulates the characters consumed by the attributes and the tag end.
301 j = tagAttrs(event, cp, i, cp + i);
302 j += tagEnd(event, cp, i, cp + i + j);
// Opening "script" tag.
// NOTE(review): the action taken is not visible here; presumably it clears
// nest so that script content is treated as text -- confirm.
304 if (i == 6 && !yaz_strncasecmp(cp, "script", i))
// Flush whatever text remains at the end of the input.
311 tagText(event, text_start, cp);
// Out-of-line destructor for the HTMLParserEvent callback interface.
// NOTE(review): body is not visible in this listing.
314 mp::HTMLParserEvent::~HTMLParserEvent()
321 * c-file-style: "Stroustrup"
322 * indent-tabs-mode: nil
324 * vim: shiftwidth=4 tabstop=8 expandtab