std::map<std::string, std::string> & vars) const;
};
class HttpRewrite::Event : public HTMLParserEvent {
- void openTagStart(const char *name);
- void anyTagEnd(const char *name, int close_it);
- void attribute(const char *tagName,
- const char *name,
- const char *value,
- int val_len);
- void closeTag(const char *name);
+ void openTagStart(const char *tag, int tag_len);
+ void anyTagEnd(const char *tag, int tag_len, int close_it);
+ void attribute(const char *tag, int tag_len,
+ const char *attr, int attr_len,
+ const char *value, int val_len);
+ void closeTag(const char *tag, int tag_len);
void text(const char *value, int len);
const Phase *m_phase;
WRBUF m_w;
return wrbuf_cstr(m_w);
}
-void yf::HttpRewrite::Event::openTagStart(const char *name)
+void yf::HttpRewrite::Event::openTagStart(const char *tag, int tag_len)
{
// check if there is <within tag="x" .. />
if (enabled_within == m_phase->within_list.end())
{
+ std::string t(tag, tag_len);
std::list<Within>::const_iterator it =
m_phase->within_list.begin();
for (; it != m_phase->within_list.end(); it++)
{
- if (it->tag.length() > 0 && it->tag.compare(name) == 0)
+ if (it->tag.length() > 0 && yaz_strcasecmp(it->tag.c_str(),
+ t.c_str()) == 0)
{
enabled_within = it;
}
}
}
wrbuf_putc(m_w, '<');
- wrbuf_puts(m_w, name);
+ wrbuf_write(m_w, tag, tag_len);
}
-void yf::HttpRewrite::Event::anyTagEnd(const char *name, int close_it)
+void yf::HttpRewrite::Event::anyTagEnd(const char *tag, int tag_len,
+ int close_it)
{
if (close_it)
{
std::list<Within>::const_iterator it = enabled_within;
if (it != m_phase->within_list.end())
{
- if (it->tag.compare(name) == 0)
+ std::string t(tag, tag_len);
+ if (yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
{
enabled_within = m_phase->within_list.end();
}
wrbuf_putc(m_w, '>');
}
-void yf::HttpRewrite::Event::attribute(const char *tagName,
- const char *name,
- const char *value,
- int val_len)
+void yf::HttpRewrite::Event::attribute(const char *tag, int tag_len,
+ const char *attr, int attr_len,
+ const char *value, int val_len)
{
std::list<Within>::const_iterator it = m_phase->within_list.begin();
bool subst = false;
for (; it != m_phase->within_list.end(); it++)
{
- if (it->tag.length() == 0 || it->tag.compare(tagName) == 0)
+ std::string t(tag, tag_len);
+ if (it->tag.length() == 0 ||
+ yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
{
+ std::string a(attr, attr_len);
std::vector<std::string> attr;
boost::split(attr, it->attr, boost::is_any_of(","));
size_t i;
for (i = 0; i < attr.size(); i++)
{
- if (attr[i].compare("#text") && attr[i].compare(name) == 0)
+ if (attr[i].compare("#text") &&
+ yaz_strcasecmp(attr[i].c_str(), a.c_str()) == 0)
subst = true;
}
}
}
wrbuf_putc(m_w, ' ');
- wrbuf_puts(m_w, name);
+ wrbuf_write(m_w, attr, attr_len);
wrbuf_puts(m_w, "=\"");
std::string output;
wrbuf_puts(m_w, "\"");
}
-void yf::HttpRewrite::Event::closeTag(const char *name)
+void yf::HttpRewrite::Event::closeTag(const char *tag, int tag_len)
{
std::list<Within>::const_iterator it = enabled_within;
if (it != m_phase->within_list.end())
{
- if (it->tag.compare(name) == 0)
+ std::string t(tag, tag_len);
+ if (yaz_strcasecmp(it->tag.c_str(), t.c_str()) == 0)
{
enabled_within = m_phase->within_list.end();
}
}
wrbuf_puts(m_w, "</");
- wrbuf_puts(m_w, name);
+ wrbuf_write(m_w, tag, tag_len);
}
void yf::HttpRewrite::Event::text(const char *value, int len)
#include <ctype.h>
#include <stdio.h>
-#define TAG_MAX_LEN 64
-
#define SPACECHR " \t\r\n\f"
-#define DEBUG(x) x
-#if HAVE_SYS_TYPES_H
-#include <sys/types.h>
-#endif
+namespace metaproxy_1 {
+ class HTMLParser::Rep {
+ friend class HTMLParser;
+ public:
+ void parse_str(HTMLParserEvent &event, const char *cp);
+ void tagText(HTMLParserEvent &event,
+ const char *text_start, const char *text_end);
+ int tagEnd(HTMLParserEvent &event,
+ const char *tag, int tag_len, const char *cp);
+ int tagStart(HTMLParserEvent &event,
+ int *tag_len, const char *cp, const char which);
+ int tagAttrs(HTMLParserEvent &event,
+ const char *name, int len,
+ const char *cp);
+ Rep();
+ ~Rep();
+ int m_verbose;
+ };
+}
namespace mp = metaproxy_1;
-mp::HTMLParser::HTMLParser()
+mp::HTMLParser::Rep::Rep()
{
+ m_verbose = 0;
}
-mp::HTMLParser::~HTMLParser()
+mp::HTMLParser::Rep::~Rep()
+{
+}
+
+mp::HTMLParser::HTMLParser() : m_p(new Rep)
{
}
-static void parse_str(mp::HTMLParserEvent & event, const char * str);
+mp::HTMLParser::~HTMLParser()
+{
+}
void mp::HTMLParser::parse(mp::HTMLParserEvent & event, const char *str) const
{
- parse_str(event, str);
+ m_p->parse_str(event, str);
}
static int skipSpace(const char *cp)
return i;
}
-static int skipName(const char *cp, char *dst)
+static int skipName(const char *cp)
{
int i;
- int j = 0;
for (i = 0; cp[i] && !strchr(SPACECHR "/>=", cp[i]); i++)
- if (j < TAG_MAX_LEN-1)
- {
- dst[j] = tolower(cp[j]);
- j++;
- }
- dst[j] = '\0';
+ ;
return i;
}
-static int skipAttribute(const char *cp, char *name, const char **value, int *val_len)
+static int skipAttribute(const char *cp, int *attr_len,
+ const char **value, int *val_len)
{
- int i = skipName(cp, name);
+ int i = skipName(cp);
+ *attr_len = i;
*value = NULL;
if (!i)
return skipSpace(cp);
return i;
}
-static int tagAttrs(mp::HTMLParserEvent & event,
- const char *tagName,
- const char *cp)
+int mp::HTMLParser::Rep::tagAttrs(HTMLParserEvent &event,
+ const char *name, int len,
+ const char *cp)
{
- char attr_name[TAG_MAX_LEN];
- const char *attr_value;
- int val_len;
int i = skipSpace(cp);
while (cp[i] && cp[i] != '>' && cp[i] != '/')
{
- int nor = skipAttribute(cp+i, attr_name, &attr_value, &val_len);
+ const char *attr_name = cp + i;
+ int attr_len;
+ const char *value;
+ int val_len;
+ int nor = skipAttribute(cp+i, &attr_len, &value, &val_len);
i += nor;
if (nor)
{
- DEBUG(printf ("------ attr %s=%.*s\n", attr_name, val_len, attr_value));
- event.attribute(tagName, attr_name, attr_value, val_len);
+ if (m_verbose)
+ printf ("------ attr %.*s=%.*s\n", attr_len, attr_name,
+ val_len, value);
+ event.attribute(name, len, attr_name, attr_len, value, val_len);
}
else
{
- if (!nor)
- i++;
+ i++;
}
}
return i;
}
-static int tagStart(mp::HTMLParserEvent & event,
- char *tagName, const char *cp, const char which)
+int mp::HTMLParser::Rep::tagStart(HTMLParserEvent &event,
+ int *tag_len,
+ const char *cp, const char which)
{
- int i = skipName(cp, tagName);
+ int i;
switch (which)
{
- case '/' :
- DEBUG(printf("------ tag close %s\n", tagName));
- event.closeTag(tagName);
+ case '/':
+ i = skipName(cp);
+ *tag_len = i;
+ if (m_verbose)
+ printf("------ tag close %.*s\n", i, cp);
+ event.closeTag(cp, i);
break;
- case '!' :
- DEBUG(printf("------ dtd %s\n", tagName));
+ case '!':
+ for (i = 0; cp[i] && cp[i] != '>'; i++)
+ ;
+ *tag_len = i;
+ event.openTagStart(cp, i);
+ if (m_verbose)
+ printf("------ dtd %.*s\n", i, cp);
break;
- case '?' :
- DEBUG(printf("------ pi %s\n", tagName));
+ case '?':
+ for (i = 0; cp[i] && cp[i] != '>'; i++)
+ ;
+ *tag_len = i;
+ event.openTagStart(cp, i);
+ if (m_verbose)
+ printf("------ pi %.*s\n", i, cp);
break;
- default :
- DEBUG(printf("------ tag open %s\n", tagName));
- event.openTagStart(tagName);
+ default:
+ i = skipName(cp);
+ *tag_len = i;
+ if (m_verbose)
+ printf("------ tag open %.*s\n", i, cp);
+ event.openTagStart(cp, i);
+
+ i += tagAttrs(event, cp, i, cp + i);
+
break;
}
return i;
}
-static int tagEnd(mp::HTMLParserEvent & event, const char *tagName, const char *cp)
+int mp::HTMLParser::Rep::tagEnd(HTMLParserEvent &event,
+ const char *tag, int tag_len, const char *cp)
{
int i = 0;
int close_it = 0;
}
if (cp[i] == '>')
{
- event.anyTagEnd(tagName, close_it);
+ event.anyTagEnd(tag, tag_len, close_it);
i++;
}
return i;
}
-static void tagText(mp::HTMLParserEvent & event, const char *text_start, const char *text_end)
+void mp::HTMLParser::Rep::tagText(HTMLParserEvent &event,
+ const char *text_start, const char *text_end)
{
if (text_end - text_start) //got text to flush
{
- DEBUG(printf("------ text %.*s\n",
- (int) (text_end - text_start), text_start));
+ if (m_verbose)
+ printf("------ text %.*s\n",
+ (int) (text_end - text_start), text_start);
event.text(text_start, text_end-text_start);
}
}
-static void parse_str(mp::HTMLParserEvent & event, const char *cp)
+void mp::HTMLParser::Rep::parse_str(HTMLParserEvent &event, const char *cp)
{
const char *text_start = cp;
const char *text_end = cp;
cp++;
if (!strchr(SPACECHR, cp[1])) //valid tag starts
{
+ int i = 0;
+ int tag_len;
+
tagText(event, text_start, text_end); //flush any text
- char tagName[TAG_MAX_LEN];
cp++;
- if (which == '/')
- {
- cp += tagStart(event, tagName, cp, which);
- }
- else if (which == '!' || which == '?') //pi or dtd
- {
- cp++;
- cp += tagStart(event, tagName, cp, which);
- }
- else
- {
- cp += tagStart(event, tagName, cp, which);
- cp += tagAttrs(event, tagName, cp);
- }
- cp += tagEnd(event, tagName, cp);
+ i += tagStart(event, &tag_len, cp, which);
+ i += tagEnd(event, cp, tag_len, cp + i);
+ cp += i;
text_start = cp;
text_end = cp;
continue;
namespace metaproxy_1 {
class HTMLParserEvent {
public:
- virtual void openTagStart(const char *name) = 0;
- virtual void anyTagEnd(const char *name, int close_it) = 0;
- virtual void attribute(const char *tagName, const char *name,
- const char *value,
- int val_len) = 0;
- virtual void closeTag(const char *name) = 0;
+ virtual void openTagStart(const char *tag, int tag_len) = 0;
+ virtual void anyTagEnd(const char *tag, int tag_len,
+ int close_it) = 0;
+ virtual void attribute(const char *tag, int tag_len,
+ const char *attr, int attr_len,
+ const char *value, int val_len) = 0;
+ virtual void closeTag(const char *tag, int tag_len) = 0;
virtual void text(const char *value, int len) = 0;
};
class HTMLParser {
+ class Rep;
public:
HTMLParser();
~HTMLParser();
void parse(HTMLParserEvent &event, const char *str) const;
+ private:
+ boost::scoped_ptr<Rep> m_p;
};
}
using namespace boost::unit_test;
namespace mp = metaproxy_1;
-class MyEvent : public mp::HTMLParserEvent {
- public:
- std::string out;
- void openTagStart(const char *name)
- {
- out += "<";
- out += name;
- }
-
- void attribute(const char *tagName,
- const char *name, const char *value, int val_len)
- {
- out += " ";
- out += name;
- out += "=\"";
- out.append(value, val_len);
- out += "\"";
- }
-
- void anyTagEnd(const char *name, int close_it)
- {
- if (close_it)
- out += "/";
- out += ">";
- }
-
- void closeTag(const char *name)
- {
- out += "</";
- out += name;
- }
-
- void text(const char *value, int len)
- {
- out.append(value, len);
- }
+class MyEvent : public mp::HTMLParserEvent
+{
+public:
+ std::string out;
+ void openTagStart(const char *tag, int tag_len) {
+ out += "<";
+ out.append(tag, tag_len);
+ }
+
+ void attribute(const char *tag, int tag_len,
+ const char *attr, int attr_len,
+ const char *value, int val_len) {
+ out += " ";
+ out.append(attr, attr_len);
+ out += "=\"";
+ out.append(value, val_len);
+ out += "\"";
+ }
+ void anyTagEnd(const char *tag, int tag_len, int close_it) {
+ if (close_it)
+ out += "/";
+ out += ">";
+ }
+ void closeTag(const char *tag, int tag_len) {
+ out += "</";
+ out.append(tag, tag_len);
+ }
+ void text(const char *value, int len) {
+ out.append(value, len);
+ }
};
MyEvent e;
hp.parse(e, html);
+ std::cout << "Expected" << std::endl;
+ std::cout << expected << std::endl;
+ std::cout << "Got" << std::endl;
+ std::cout << e.out << std::endl;
+ BOOST_CHECK_EQUAL(std::string(expected), e.out);
+ }
+ catch (std::exception & e)
+ {
+ std::cout << e.what();
+ std::cout << std::endl;
+ BOOST_CHECK (false);
+ }
+}
+
+BOOST_AUTO_TEST_CASE( test_html_parser_2 )
+{
+ try
+ {
+ mp::HTMLParser hp;
+ const char* html =
+ "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">\n"
+ "<HTML>\n"
+ " <HEAD>\n"
+ " <TITLE>YAZ 4.2.60</TITLE>\n"
+ " </HEAD>\n"
+ " <BODY>\n"
+ " <P><A HREF=\"http://www.indexdata.com/yaz/\">YAZ</A> 4.2.60</P>\n"
+ " <P>Error: 404</P>\n"
+ " <P>Description: Not Found</P>\n"
+ " </BODY>\n"
+ "</HTML>";
+
+ const char* expected = html;
+ MyEvent e;
+ hp.parse(e, html);
+
+ std::cout << "Expected" << std::endl;
std::cout << expected << std::endl;
+ std::cout << "Got" << std::endl;
std::cout << e.out << std::endl;
+
BOOST_CHECK_EQUAL(std::string(expected), e.out);
}
catch (std::exception & e)