/* This file is part of Metaproxy.
   Copyright (C) 2005-2013 Index Data

Metaproxy is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
*/
20 #include "html_parser.hpp"
28 #define TAG_MAX_LEN 64
30 #define SPACECHR " \t\r\n\f"
35 #include <sys/types.h>
38 namespace mp = metaproxy_1;
40 mp::HTMLParser::HTMLParser()
44 mp::HTMLParser::~HTMLParser()
48 static void parse_str(mp::HTMLParserEvent & event, const char * str);
50 void mp::HTMLParser::parse(mp::HTMLParserEvent & event, const char *str) const
52 parse_str(event, str);
// Static C-style helper functions follow; it would probably make sense to
// wrap them in a PIMPL.
57 static int skipSpace (const char *cp)
60 while (cp[i] && strchr (SPACECHR, cp[i]))
65 static int skipName (const char *cp, char *dst)
69 for (i=0; cp[i] && !strchr (SPACECHR "/>=", cp[i]); i++)
70 if (j < TAG_MAX_LEN-1)
72 dst[j] = tolower(cp[j]);
79 static int skipAttribute (const char *cp, char *name, char **value)
81 int i = skipName (cp, name);
84 return skipSpace (cp);
85 i += skipSpace (cp + i);
90 i += skipSpace (cp + i);
91 if (cp[i] == '\"' || cp[i] == '\'')
95 while (cp[i] != tr && cp[i])
104 while (cp[i] && !strchr (SPACECHR ">", cp[i]))
108 *value = (char *) malloc (v1 - v0 + 1);
109 memcpy (*value, cp + v0, v1-v0);
110 (*value)[v1-v0] = '\0';
112 i += skipSpace (cp + i);
116 static int tagAttrs (mp::HTMLParserEvent & event,
121 char attr_name[TAG_MAX_LEN];
124 while (cp[i] && cp[i] != '>')
126 int nor = skipAttribute (cp+i, attr_name, &attr_value);
130 DEBUG(printf ("------ attr %s=%s\n", attr_name, attr_value));
131 event.attribute(tagName, attr_name, attr_value);
142 static int tagStart (mp::HTMLParserEvent & event,
143 char *tagName, const char *cp, const char which)
146 i = skipName (cp, tagName);
150 DEBUG(printf ("------ tag close %s\n", tagName));
151 event.closeTag(tagName);
154 DEBUG(printf ("------ dtd %s\n", tagName));
157 DEBUG(printf ("------ pi %s\n", tagName));
160 DEBUG(printf ("------ tag open %s\n", tagName));
161 event.openTagStart(tagName);
167 static int tagEnd (mp::HTMLParserEvent & event, const char *tagName, const char *cp)
170 while (cp[i] && cp[i] != '>')
174 event.anyTagEnd(tagName);
// Returns a malloc'ed, NUL-terminated copy of the characters in
// [start, end). The caller releases it with free().
//
// A null pointer is returned when the range is invalid (a null bound, or
// end before start) or when the allocation fails, so callers must check
// the result before using it.
static char* allocFromRange (const char *start, const char *end)
{
    if (!start || !end || end < start)
        return 0;
    const size_t len = static_cast<size_t>(end - start);
    char *value = static_cast<char *>(malloc (len + 1));
    if (!value)
        return 0;
    memcpy (value, start, len);
    value[len] = '\0';
    return value;
}
189 static void tagText (mp::HTMLParserEvent & event, const char *text_start, const char *text_end)
191 if (text_end - text_start) //got text to flush
193 char *temp = allocFromRange(text_start, text_end);
194 DEBUG(printf ("------ text %s\n", temp));
195 event.text(text_start, text_end-text_start);
200 static void parse_str (mp::HTMLParserEvent & event, const char *cp)
202 const char *text_start = cp;
203 const char *text_end = cp;
206 if (cp[0] == '<' && cp[1]) //tag?
209 if (which == '/') cp++;
210 if (!strchr (SPACECHR, cp[1])) //valid tag starts
212 tagText (event, text_start, text_end); //flush any text
213 char tagName[TAG_MAX_LEN];
217 cp += tagStart (event, tagName, cp, which);
219 else if (which == '!' || which == '?') //pi or dtd
222 cp += tagStart (event, tagName, cp, which);
226 cp += tagStart (event, tagName, cp, which);
227 cp += tagAttrs (event, tagName, cp);
229 cp += tagEnd (event, tagName, cp);
239 tagText (event, text_start, text_end); //flush any text
/*
 * Local variables:
 * c-basic-offset: 4
 * c-file-style: "Stroustrup"
 * indent-tabs-mode: nil
 * End:
 * vim: shiftwidth=4 tabstop=8 expandtab
 */