1 /* This file is part of Metaproxy.
2 Copyright (C) 2005-2013 Index Data
4 Metaproxy is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
9 Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 #include "html_parser.hpp"
// Characters the scanner treats as whitespace: space, tab, carriage return,
// line feed and form feed. Used as the search set for strchr() in the helpers
// of this file; callers test for the terminating NUL separately because
// strchr() would also "find" '\0' at the end of this literal.
28 #define SPACECHR " \t\r\n\f"
31 namespace metaproxy_1 {
// Private implementation class of HTMLParser. HTMLParser creates one with
// `new Rep` and reaches it through its m_p member.
32 class HTMLParser::Rep {
// HTMLParser is granted access to the members declared in this class.
33 friend class HTMLParser;
// Scans the string cp and reports tags and text through event.
// Called by HTMLParser::parse().
35 void parse_str(HTMLParserEvent &event, const char *cp);
// Reports the text range [text_start, text_end) through event.text() when
// the range is non-empty.
36 void tagText(HTMLParserEvent &event,
37 const char *text_start, const char *text_end);
// Scans cp up to '>' (or end of string) and signals event.anyTagEnd() for the
// tag given by (tag, tag_len).
// NOTE(review): callers add the int result to their offset, so it is
// presumably the number of characters consumed - confirm.
38 int tagEnd(HTMLParserEvent &event,
39 const char *tag, int tag_len, const char *cp);
// Handles the start of a tag at cp and reports it through event (closeTag /
// openTagStart).
// NOTE(review): tag_len is presumably an output holding the tag name length
// and `which` presumably selects the tag kind - confirm against the body.
40 int tagStart(HTMLParserEvent &event,
41 int *tag_len, const char *cp, const char which);
// Walks the attributes of the tag (name, len) and reports each one through
// event.attribute().
42 int tagAttrs(HTMLParserEvent &event,
43 const char *name, int len,
// Short alias for the metaproxy_1 namespace, used to qualify the member
// definitions in this file.
51 namespace mp = metaproxy_1;
// Default constructor of the parser's implementation object.
53 mp::HTMLParser::Rep::Rep()
// Destructor of the parser's implementation object.
58 mp::HTMLParser::Rep::~Rep()
// Constructs the parser and allocates its private Rep implementation, which
// is stored in m_p.
62 mp::HTMLParser::HTMLParser() : m_p(new Rep)
// Destroys the parser.
// NOTE(review): how the Rep held in m_p is released depends on m_p's type,
// which is declared in the header - confirm it is an owning pointer.
66 mp::HTMLParser::~HTMLParser()
// Public entry point: parses the string str and delivers the results through
// the callbacks of event.
70 void mp::HTMLParser::parse(mp::HTMLParserEvent & event, const char *str) const
// Forward both arguments unchanged to the implementation object.
72 m_p->parse_str(event, str);
// Skips whitespace at the start of cp.
// NOTE(review): callers do `i += skipSpace(cp + i)`, so the result is
// presumably the number of whitespace characters skipped - confirm.
75 static int skipSpace(const char *cp)
// Continue while the current character is one of SPACECHR. The cp[i] test
// comes first because strchr() also matches the terminating NUL of its
// search string, which would otherwise run the loop past the end of cp.
78 while (cp[i] && strchr(SPACECHR, cp[i]))
// Scans a name token at the start of cp.
// NOTE(review): presumably returns the length of the name (the final value of
// i) - confirm.
83 static int skipName(const char *cp)
// Advance until end of string or the first delimiter. The adjacent string
// literals are concatenated, so the delimiter set is SPACECHR plus '/', '>'
// and '='.
86 for (i = 0; cp[i] && !strchr(SPACECHR "/>=", cp[i]); i++)
// Scans a single attribute starting at cp.
// attr_len, value and val_len are output parameters; tagAttrs() passes them on
// to event.attribute() as the attribute name length, value pointer and value
// length.
// NOTE(review): tagAttrs() stores the result in `nor`; it is presumably the
// number of characters consumed - confirm.
91 static int skipAttribute(const char *cp, int *attr_len,
92 const char **value, int *val_len)
// Skip whitespace at the current offset.
99 i += skipSpace(cp + i);
// Skip whitespace again before inspecting the value.
104 i += skipSpace(cp + i);
// A value may be delimited by either a double or a single quote.
105 if (cp[i] == '\"' || cp[i] == '\'')
// Scan to the terminating character tr or the end of the string.
// NOTE(review): tr presumably holds the opening quote character - confirm.
109 while (cp[i] != tr && cp[i])
// Scan until whitespace, '>' or end of string.
// NOTE(review): presumably the unquoted-value case - confirm.
118 while (cp[i] && !strchr(SPACECHR ">", cp[i]))
// Skip whitespace following the attribute.
125 i += skipSpace(cp + i);
// Walks the attributes of a tag and reports each one through
// event.attribute(). (name, len) identify the tag the attributes belong to and
// are passed through to the handler unchanged.
129 int mp::HTMLParser::Rep::tagAttrs(HTMLParserEvent &event,
130 const char *name, int len,
// Start after any leading whitespace.
133 int i = skipSpace(cp);
// Keep going until end of string, '>' or '/'.
134 while (cp[i] && cp[i] != '>' && cp[i] != '/')
// The attribute name begins at the current offset.
136 const char *attr_name = cp + i;
// skipAttribute() fills in the name length and the value pointer/length.
140 int nor = skipAttribute(cp+i, &attr_len, &value, &val_len);
// Diagnostic trace in name=value form; "%.*s" prints a length-limited string.
145 printf ("------ attr %.*s=%.*s\n", attr_len, attr_name,
// Hand the attribute to the event handler.
147 event.attribute(name, len, attr_name, attr_len, value, val_len);
// Handles the beginning of a tag at cp. The visible cases are a closing tag, a
// "dtd" declaration, a "pi" (processing instruction) and an ordinary opening
// tag.
// NOTE(review): the case is presumably chosen from `which` - confirm against
// the dispatch code.
157 int mp::HTMLParser::Rep::tagStart(HTMLParserEvent &event,
159 const char *cp, const char which)
// Closing tag: trace it, then report the first i characters of cp.
168 printf("------ tag close %.*s\n", i, cp);
169 event.closeTag(cp, i);
// "dtd" case: take everything up to '>' (or end of string) as the tag text.
172 for (i = 0; cp[i] && cp[i] != '>'; i++)
175 event.openTagStart(cp, i);
177 printf("------ dtd %.*s\n", i, cp);
// "pi" case: same scan, again reported through openTagStart().
180 for (i = 0; cp[i] && cp[i] != '>'; i++)
183 event.openTagStart(cp, i);
185 printf("------ pi %.*s\n", i, cp);
// Ordinary opening tag: report the first i characters of cp as the tag.
191 printf("------ tag open %.*s\n", i, cp);
192 event.openTagStart(cp, i);
// Process the attributes that follow; tagAttrs()'s result is added to i.
194 i += tagAttrs(event, cp, i, cp + i);
// Finishes a tag: scans cp for the closing '>' and notifies the handler.
// (tag, tag_len) identify the tag being ended.
201 int mp::HTMLParser::Rep::tagEnd(HTMLParserEvent &event,
202 const char *tag, int tag_len, const char *cp)
// Advance until '>' or end of string.
206 while (cp[i] && cp[i] != '>')
// Tell the handler the tag has ended.
// NOTE(review): close_it presumably flags a self-closing tag - confirm where
// it is computed.
214 event.anyTagEnd(tag, tag_len, close_it);
// Reports the text range [text_start, text_end) to the handler. An empty
// range produces no event.
220 void mp::HTMLParser::Rep::tagText(HTMLParserEvent &event,
221 const char *text_start, const char *text_end)
223 if (text_end - text_start) //got text to flush
// Diagnostic trace of the text; "%.*s" limits output to the range length.
226 printf("------ text %.*s\n",
227 (int) (text_end - text_start), text_start);
// Deliver the text as pointer plus length.
228 event.text(text_start, text_end-text_start);
// Main scanning routine: walks cp and reports tags and text through event.
232 void mp::HTMLParser::Rep::parse_str(HTMLParserEvent &event, const char *cp)
// text_start/text_end delimit the text that is later handed to tagText();
// both start at the beginning of the input (an empty range).
234 const char *text_start = cp;
235 const char *text_end = cp;
// A '<' followed by any non-NUL character is a candidate tag.
238 if (cp[0] == '<' && cp[1]) //tag?
// A '<' followed by whitespace is not treated as a tag start.
243 if (!strchr(SPACECHR, cp[1])) //valid tag starts
248 tagText(event, text_start, text_end); //flush any text
// Handle the tag start and then the tag end; each result is added to i.
// tagEnd() gets cp and tag_len as the tag and cp + i as the scan position.
250 i += tagStart(event, &tag_len, cp, which);
251 i += tagEnd(event, cp, tag_len, cp + i);
262 tagText(event, text_start, text_end); //flush any text
268 * c-file-style: "Stroustrup"
269 * indent-tabs-mode: nil
271 * vim: shiftwidth=4 tabstop=8 expandtab