From: Heikki Levanto Date: Mon, 4 Nov 2013 14:31:42 +0000 (+0100) Subject: X-Metaproxy-SkipLink MP-479 X-Git-Tag: v1.4.2~5^2 X-Git-Url: http://sru.miketaylor.org.uk/cgi-bin?a=commitdiff_plain;h=1dca4a52e26e617c1f64a213ee7de780346a74f0;p=metaproxy-moved-to-github.git X-Metaproxy-SkipLink MP-479 Filter_http_rewritwe Understands and respects the header (which will normally come from the cproxy filter) --- diff --git a/src/filter_http_rewrite.cpp b/src/filter_http_rewrite.cpp index 5cab5de..d2cee8a 100644 --- a/src/filter_http_rewrite.cpp +++ b/src/filter_http_rewrite.cpp @@ -54,7 +54,8 @@ namespace metaproxy_1 { std::list replace_list; bool test_patterns( std::map &vars, - std::string &txt, bool anchor); + std::string &txt, bool anchor, + std::list &skip_list); }; class HttpRewrite::Within { public: @@ -65,7 +66,8 @@ namespace metaproxy_1 { bool reqline; RulePtr rule; bool exec(std::map &vars, - std::string &txt, bool anchor) const; + std::string &txt, bool anchor, + std::list &skip_list) const; }; class HttpRewrite::Content { @@ -76,15 +78,19 @@ namespace metaproxy_1 { void configure(const xmlNode *ptr, std::map &rules); void quoted_literal(std::string &content, - std::map &vars) const; + std::map &vars, + std::list & skip_list) const; void parse(int verbose, std::string &content, - std::map & vars) const; + std::map & vars, + std::list & skip_list ) const; }; class HttpRewrite::Phase { public: Phase(); int m_verbose; std::list content_list; + std::list skip_list; + void read_skip_headers(Z_HTTP_Request *hreq); void rewrite_reqline(mp::odr & o, Z_HTTP_Request *hreq, std::map & vars) const; void rewrite_headers(mp::odr & o, Z_HTTP_Header *headers, @@ -92,7 +98,8 @@ namespace metaproxy_1 { void rewrite_body(mp::odr & o, const char *content_type, char **content_buf, int *content_len, - std::map & vars) const; + std::map & vars, + std::list & skip_list ) const; }; class HttpRewrite::Event : public HTMLParserEvent { void openTagStart(const char *tag, int tag_len); @@ -107,8 +114,11 @@ namespace metaproxy_1 { WRBUF m_w; std::stack::const_iterator> s_within; std::map &m_vars; + std::list & m_skips; public: - Event(const Content *p, std::map &vars); + Event(const Content *p, + std::map &vars, + std::list & skip_list ); ~Event(); const char *result(); }; @@ -136,13 +146,16 @@ void yf::HttpRewrite::process(mp::Package & package) const Z_HTTP_Request *hreq = gdu->u.HTTP_Request; mp::odr o; req_phase->rewrite_reqline(o, hreq, vars); + res_phase->read_skip_headers(hreq); yaz_log(YLOG_LOG, ">> Request headers"); req_phase->rewrite_headers(o, hreq->headers, vars); req_phase->rewrite_body(o, z_HTTP_header_lookup(hreq->headers, "Content-Type"), &hreq->content_buf, &hreq->content_len, - vars); + vars, res_phase->skip_list); + // TODO skip_list does not really belong in the phase. More like + // HttpRewrite itself! package.request() = gdu; } package.move(); @@ -158,11 +171,49 @@ void yf::HttpRewrite::process(mp::Package & package) const z_HTTP_header_lookup(hres->headers, "Content-Type"), &hres->content_buf, &hres->content_len, - vars); + vars, res_phase->skip_list); package.response() = gdu; } } +// Read (and remove) the X-Metaproxy-SkipLink headers +void yf::HttpRewrite::Phase::read_skip_headers(Z_HTTP_Request *hreq) +{ + skip_list.clear(); + std::string url(hreq->path); + if ( url.substr(0,7) != "http://" ) + { // path was relative, as it usually is + const char *host = z_HTTP_header_lookup(hreq->headers, "Host"); + if (host) + url = "http://" + std::string(host) + hreq->path ; + } + + while ( const char *hv = z_HTTP_header_remove( &(hreq->headers), + "X-Metaproxy-SkipLink") ) + { + yaz_log(YLOG_LOG,"Found SkipLink '%s'", hv ); + const char *p = strchr(hv,' '); + if (!p) + continue; // should not happen + std::string page(hv,p); + std::string link(p+1); + boost::regex pagere(page); + if ( boost::regex_search(url, pagere) ) + { + yaz_log(YLOG_LOG,"SkipLink '%s' matches URL %s", + page.c_str(), url.c_str() ); + boost::regex linkre(link); + skip_list.push_back(linkre); + } + else + { + yaz_log(YLOG_LOG,"SkipLink ignored, '%s' does not match '%s'", + url.c_str(), page.c_str() ); + } + } +} + + void yf::HttpRewrite::Phase::rewrite_reqline (mp::odr & o, Z_HTTP_Request *hreq, std::map & vars) const @@ -200,7 +251,8 @@ void yf::HttpRewrite::Phase::rewrite_reqline (mp::odr & o, if (it->reqline) { yaz_log(YLOG_LOG, "Proxy request URL is %s", path.c_str()); - if (it->exec(vars, path, true)) + std::list dummy_skip_list; // no skips here! + if (it->exec(vars, path, true, dummy_skip_list)) { yaz_log(YLOG_LOG, "Rewritten request URL is %s", path.c_str()); hreq->path = odr_strdup(o, path.c_str()); @@ -210,7 +262,7 @@ void yf::HttpRewrite::Phase::rewrite_reqline (mp::odr & o, void yf::HttpRewrite::Phase::rewrite_headers(mp::odr & o, Z_HTTP_Header *headers, - std::map & vars) const + std::map & vars ) const { std::list::const_iterator cit = content_list.begin(); for (; cit != content_list.end(); cit++) @@ -230,7 +282,8 @@ void yf::HttpRewrite::Phase::rewrite_headers(mp::odr & o, { // Match and replace only the header value std::string hval(header->value); - if (it->exec(vars, hval, true)) + std::list dummy_skip_list; // no skips here! + if (it->exec(vars, hval, true, dummy_skip_list)) { header->value = odr_strdup(o, hval.c_str()); } @@ -244,7 +297,8 @@ void yf::HttpRewrite::Phase::rewrite_body( const char *content_type, char **content_buf, int *content_len, - std::map & vars) const + std::map & vars, + std::list & skip_list ) const { if (*content_len == 0) return; @@ -275,14 +329,15 @@ void yf::HttpRewrite::Phase::rewrite_body( } std::string content(*content_buf, *content_len); - cit->parse(m_verbose, content, vars); + cit->parse(m_verbose, content, vars, skip_list); *content_buf = odr_strdup(o, content.c_str()); *content_len = strlen(*content_buf); } yf::HttpRewrite::Event::Event(const Content *p, - std::map & vars - ) : m_content(p), m_vars(vars) + std::map & vars, + std::list & skip_list + ) : m_content(p), m_vars(vars), m_skips(skip_list) { m_w = wrbuf_alloc(); } @@ -367,7 +422,7 @@ void yf::HttpRewrite::Event::attribute(const char *tag, int tag_len, if (subst) { std::string s(value, val_len); - it->exec(m_vars, s, true); + it->exec(m_vars, s, true, m_skips); wrbuf_puts(m_w, s.c_str()); } else @@ -397,7 +452,7 @@ void yf::HttpRewrite::Event::text(const char *value, int len) if (it != m_content->within_list.end()) { std::string s(value, len); - it->exec(m_vars, s, false); + it->exec(m_vars, s, false, m_skips); wrbuf_puts(m_w, s.c_str()); } else @@ -408,7 +463,8 @@ static bool embed_quoted_literal( std::string &content, std::map &vars, mp::filter::HttpRewrite::RulePtr ruleptr, - bool html_context) + bool html_context, + std::list &skip_list) { bool replace = false; std::string res; @@ -432,7 +488,7 @@ static bool embed_quoted_literal( if (!*cp) break; std::string s(cp0, cp - cp0); - if (ruleptr->test_patterns(vars, s, true)) + if (ruleptr->test_patterns(vars, s, true, skip_list)) replace = true; cp0 = cp; res.append(s); @@ -454,7 +510,7 @@ static bool embed_quoted_literal( if (!*cp) break; std::string s(cp0, cp - cp0); - if (ruleptr->test_patterns(vars, s, true)) + if (ruleptr->test_patterns(vars, s, true, skip_list)) replace = true; cp0 = cp; res.append(s); @@ -473,21 +529,23 @@ static bool embed_quoted_literal( bool yf::HttpRewrite::Within::exec( std::map & vars, - std::string & txt, bool anchor) const + std::string & txt, bool anchor, + std::list & skip_list) const { if (type == "quoted-literal") { - return embed_quoted_literal(txt, vars, rule, true); + return embed_quoted_literal(txt, vars, rule, true, skip_list); } else { - return rule->test_patterns(vars, txt, anchor); + return rule->test_patterns(vars, txt, anchor, skip_list); } } bool yf::HttpRewrite::Rule::test_patterns( std::map & vars, - std::string & txt, bool anchor) + std::string & txt, bool anchor, + std::list & skip_list ) { bool replaces = false; bool first = anchor; @@ -534,12 +592,31 @@ bool yf::HttpRewrite::Rule::test_patterns( } } + // Compare against skip_list + bool skipthis = false; + std::list::iterator si = skip_list.begin(); + for ( ; si != skip_list.end(); si++) { + if ( boost::regex_search(bwhat.str(0), *si) ) + { + skipthis = true; + break; + } + } //prepare replacement string std::string rvalue = bit->sub_vars(vars); - yaz_log(YLOG_LOG, "! Rewritten '%s' to '%s'", - bwhat.str(0).c_str(), rvalue.c_str()); out.append(start, bwhat[0].first); - out.append(rvalue); + if ( skipthis ) + { + yaz_log(YLOG_LOG,"! Not rewriting '%s', skiplist match", + bwhat.str(0).c_str() ); + out.append(bwhat.str(0).c_str()); + } + else + { + yaz_log(YLOG_LOG, "! Rewritten '%s' to '%s'", + bwhat.str(0).c_str(), rvalue.c_str()); + out.append(rvalue); + } start = bwhat[0].second; //move search forward } out.append(start, end); @@ -668,12 +745,13 @@ yf::HttpRewrite::Phase::Phase() : m_verbose(0) void yf::HttpRewrite::Content::parse( int verbose, std::string &content, - std::map &vars) const + std::map &vars, + std::list & skip_list ) const { if (type == "html") { HTMLParser parser; - Event ev(this, vars); + Event ev(this, vars, skip_list); parser.set_verbose(verbose); @@ -682,17 +760,18 @@ void yf::HttpRewrite::Content::parse( } if (type == "quoted-literal") { - quoted_literal(content, vars); + quoted_literal(content, vars, skip_list); } } void yf::HttpRewrite::Content::quoted_literal( std::string &content, - std::map &vars) const + std::map &vars, + std::list & skip_list ) const { std::list::const_iterator it = within_list.begin(); if (it != within_list.end()) - embed_quoted_literal(content, vars, it->rule, false); + embed_quoted_literal(content, vars, it->rule, false, skip_list); } void yf::HttpRewrite::Content::configure( diff --git a/src/test_filter_rewrite.cpp b/src/test_filter_rewrite.cpp index 1f86a98..54b52f2 100644 --- a/src/test_filter_rewrite.cpp +++ b/src/test_filter_rewrite.cpp @@ -109,8 +109,11 @@ BOOST_AUTO_TEST_CASE( test_filter_rewrite_1 ) mp::odr odr; Z_GDU *gdu_req = z_get_HTTP_Request_uri(odr, - "http://proxyhost/proxypath/targetsite/page1.html", 0, 1); - + "http://proxyhost/proxypath/targetsite/page1.html", 0, 1); + + Z_HTTP_Request *hreq = gdu_req->u.HTTP_Request; + z_HTTP_header_set(odr, &hreq->headers, + "X-Metaproxy-SkipLink", ".* skiplink.com" ); pack.request() = gdu_req; //create the http response @@ -141,12 +144,13 @@ BOOST_AUTO_TEST_CASE( test_filter_rewrite_1 ) " Another abs link" "" "" - "" + "\n" + "skip\n" ""; const char *resp_expected = "HTTP/1.1 200 OK\r\n" - "Content-Length: 631\r\n" + "Content-Length: 686\r\n" "Content-Type: text/html\r\n" "Link: ; rel=absolute\r\n" "Link: ; rel=relative\r\n" @@ -171,7 +175,8 @@ BOOST_AUTO_TEST_CASE( test_filter_rewrite_1 ) " Another abs link" "" "" - "" + "\n" + "skip\n" ""; Z_GDU *gdu_res;