From: Adam Dickmeiss Date: Fri, 26 Oct 2001 13:26:11 +0000 (+0000) Subject: Implemented Allow/deny rules. Better Tcl autoconfig. X-Git-Tag: ZMBOT.0.1~17 X-Git-Url: http://sru.miketaylor.org.uk/cgi-bin?a=commitdiff_plain;h=ca0419e13f1efe7c3d56d59766f48a826cd6c080;p=tclrobot.git Implemented Allow/deny rules. Better Tcl autoconfig. --- diff --git a/README b/README index e7b1184..53f68dd 100644 --- a/README +++ b/README @@ -41,6 +41,9 @@ Introduction quote the domain when given on the command line so that your shell doesn't expand this. This option may be repeated thus allowing you to specify many "allowed" domains. + + -r rules Specifies a file with rules. See the rules file for an + example. Example 1: Harvest three links away from www.somwhere.com world-wide: ./robot.tcl -c 3 http://www.somwhere.com/ diff --git a/configure b/configure index 6d5b3e8..f5a533f 100755 --- a/configure +++ b/configure @@ -12,7 +12,7 @@ ac_help= ac_default_prefix=/usr/local # Any additions from configure.in: ac_help="$ac_help - --with-tclconfig Path for tclConfig.sh/tkConfig.sh" + --with-tclconfig Path for tclConfig.sh" # Initialize some variables set by options. # The variables have the same names as the options, with @@ -936,12 +936,20 @@ fi tryprefix=${prefix} prefix=${saveprefix} fi -if test -r ${tclconfig}/tclConfig.sh; then - echo $ac_n "checking for Tcl""... $ac_c" 1>&6 -echo "configure:942: checking for Tcl" >&5 - . ${tclconfig}/tclConfig.sh +echo $ac_n "checking for Tcl""... $ac_c" 1>&6 +echo "configure:941: checking for Tcl" >&5 +if test -d ${tclconfig}; then + tclconfig=${tclconfig}/tclConfig.sh +fi +if test -r ${tclconfig}; then + . ${tclconfig} TCLLIB="${TCL_LIB_SPEC} ${TCL_LIBS}" - TCLINC=-I${TCL_PREFIX}/include + + if test -d ${TCL_PREFIX}/include/tcl${TCL_VERSION}; then + TCLINC=-I${TCL_PREFIX}/include/tcl${TCL_VERSION} + else + TCLINC=-I${TCL_PREFIX}/include + fi RANLIB=$TCL_RANLIB SHLIB_CFLAGS=$TCL_SHLIB_CFLAGS SHLIB_LD=$TCL_SHLIB_LD @@ -950,284 +958,7 @@ echo "configure:942: checking for Tcl" >&5 echo "$ac_t""$TCL_VERSION" 1>&6 CC=$TCL_CC else - # Extract the first word of "gcc", so it can be a program name with args. -set dummy gcc; ac_word=$2 -echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 -echo "configure:957: checking for $ac_word" >&5 -if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then - echo $ac_n "(cached) $ac_c" 1>&6 -else - if test -n "$CC"; then - ac_cv_prog_CC="$CC" # Let the user override the test. -else - IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" - ac_dummy="$PATH" - for ac_dir in $ac_dummy; do - test -z "$ac_dir" && ac_dir=. - if test -f $ac_dir/$ac_word; then - ac_cv_prog_CC="gcc" - break - fi - done - IFS="$ac_save_ifs" -fi -fi -CC="$ac_cv_prog_CC" -if test -n "$CC"; then - echo "$ac_t""$CC" 1>&6 -else - echo "$ac_t""no" 1>&6 -fi - -if test -z "$CC"; then - # Extract the first word of "cc", so it can be a program name with args. -set dummy cc; ac_word=$2 -echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 -echo "configure:987: checking for $ac_word" >&5 -if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then - echo $ac_n "(cached) $ac_c" 1>&6 -else - if test -n "$CC"; then - ac_cv_prog_CC="$CC" # Let the user override the test. -else - IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" - ac_prog_rejected=no - ac_dummy="$PATH" - for ac_dir in $ac_dummy; do - test -z "$ac_dir" && ac_dir=. - if test -f $ac_dir/$ac_word; then - if test "$ac_dir/$ac_word" = "/usr/ucb/cc"; then - ac_prog_rejected=yes - continue - fi - ac_cv_prog_CC="cc" - break - fi - done - IFS="$ac_save_ifs" -if test $ac_prog_rejected = yes; then - # We found a bogon in the path, so make sure we never use it. - set dummy $ac_cv_prog_CC - shift - if test $# -gt 0; then - # We chose a different compiler from the bogus one. - # However, it has the same basename, so the bogon will be chosen - # first if we set CC to just the basename; use the full file name. - shift - set dummy "$ac_dir/$ac_word" "$@" - shift - ac_cv_prog_CC="$@" - fi -fi -fi -fi -CC="$ac_cv_prog_CC" -if test -n "$CC"; then - echo "$ac_t""$CC" 1>&6 -else - echo "$ac_t""no" 1>&6 -fi - - if test -z "$CC"; then - case "`uname -s`" in - *win32* | *WIN32*) - # Extract the first word of "cl", so it can be a program name with args. -set dummy cl; ac_word=$2 -echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 -echo "configure:1038: checking for $ac_word" >&5 -if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then - echo $ac_n "(cached) $ac_c" 1>&6 -else - if test -n "$CC"; then - ac_cv_prog_CC="$CC" # Let the user override the test. -else - IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" - ac_dummy="$PATH" - for ac_dir in $ac_dummy; do - test -z "$ac_dir" && ac_dir=. - if test -f $ac_dir/$ac_word; then - ac_cv_prog_CC="cl" - break - fi - done - IFS="$ac_save_ifs" -fi -fi -CC="$ac_cv_prog_CC" -if test -n "$CC"; then - echo "$ac_t""$CC" 1>&6 -else - echo "$ac_t""no" 1>&6 -fi - ;; - esac - fi - test -z "$CC" && { echo "configure: error: no acceptable cc found in \$PATH" 1>&2; exit 1; } -fi - -echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works""... $ac_c" 1>&6 -echo "configure:1070: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works" >&5 - -ac_ext=c -# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options. -ac_cpp='$CPP $CPPFLAGS' -ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5' -ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5' -cross_compiling=$ac_cv_prog_cc_cross - -cat > conftest.$ac_ext << EOF - -#line 1081 "configure" -#include "confdefs.h" - -main(){return(0);} -EOF -if { (eval echo configure:1086: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then - ac_cv_prog_cc_works=yes - # If we can't run a trivial program, we are probably using a cross compiler. - if (./conftest; exit) 2>/dev/null; then - ac_cv_prog_cc_cross=no - else - ac_cv_prog_cc_cross=yes - fi -else - echo "configure: failed program was:" >&5 - cat conftest.$ac_ext >&5 - ac_cv_prog_cc_works=no -fi -rm -fr conftest* -ac_ext=c -# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options. -ac_cpp='$CPP $CPPFLAGS' -ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5' -ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5' -cross_compiling=$ac_cv_prog_cc_cross - -echo "$ac_t""$ac_cv_prog_cc_works" 1>&6 -if test $ac_cv_prog_cc_works = no; then - { echo "configure: error: installation or configuration problem: C compiler cannot create executables." 1>&2; exit 1; } -fi -echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler""... $ac_c" 1>&6 -echo "configure:1112: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler" >&5 -echo "$ac_t""$ac_cv_prog_cc_cross" 1>&6 -cross_compiling=$ac_cv_prog_cc_cross - -echo $ac_n "checking whether we are using GNU C""... $ac_c" 1>&6 -echo "configure:1117: checking whether we are using GNU C" >&5 -if eval "test \"`echo '$''{'ac_cv_prog_gcc'+set}'`\" = set"; then - echo $ac_n "(cached) $ac_c" 1>&6 -else - cat > conftest.c <&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then - ac_cv_prog_gcc=yes -else - ac_cv_prog_gcc=no -fi -fi - -echo "$ac_t""$ac_cv_prog_gcc" 1>&6 - -if test $ac_cv_prog_gcc = yes; then - GCC=yes -else - GCC= -fi - -ac_test_CFLAGS="${CFLAGS+set}" -ac_save_CFLAGS="$CFLAGS" -CFLAGS= -echo $ac_n "checking whether ${CC-cc} accepts -g""... $ac_c" 1>&6 -echo "configure:1145: checking whether ${CC-cc} accepts -g" >&5 -if eval "test \"`echo '$''{'ac_cv_prog_cc_g'+set}'`\" = set"; then - echo $ac_n "(cached) $ac_c" 1>&6 -else - echo 'void f(){}' > conftest.c -if test -z "`${CC-cc} -g -c conftest.c 2>&1`"; then - ac_cv_prog_cc_g=yes -else - ac_cv_prog_cc_g=no -fi -rm -f conftest* - -fi - -echo "$ac_t""$ac_cv_prog_cc_g" 1>&6 -if test "$ac_test_CFLAGS" = set; then - CFLAGS="$ac_save_CFLAGS" -elif test $ac_cv_prog_cc_g = yes; then - if test "$GCC" = yes; then - CFLAGS="-g -O2" - else - CFLAGS="-g" - fi -else - if test "$GCC" = yes; then - CFLAGS="-O2" - else - CFLAGS= - fi -fi - - SHLIB_CFLAGS="" - SHLIB_LD="shared-linker" - SHLIB_SUFFIX="" - SHLIB_VERSION="" - # Extract the first word of "ranlib", so it can be a program name with args. -set dummy ranlib; ac_word=$2 -echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 -echo "configure:1183: checking for $ac_word" >&5 -if eval "test \"`echo '$''{'ac_cv_prog_RANLIB'+set}'`\" = set"; then - echo $ac_n "(cached) $ac_c" 1>&6 -else - if test -n "$RANLIB"; then - ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test. -else - IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" - ac_dummy="$PATH" - for ac_dir in $ac_dummy; do - test -z "$ac_dir" && ac_dir=. - if test -f $ac_dir/$ac_word; then - ac_cv_prog_RANLIB="ranlib" - break - fi - done - IFS="$ac_save_ifs" - test -z "$ac_cv_prog_RANLIB" && ac_cv_prog_RANLIB=":" -fi -fi -RANLIB="$ac_cv_prog_RANLIB" -if test -n "$RANLIB"; then - echo "$ac_t""$RANLIB" 1>&6 -else - echo "$ac_t""no" 1>&6 -fi - - echo $ac_n "checking for Tcl""... $ac_c" 1>&6 -echo "configure:1211: checking for Tcl" >&5 - TCL_VERSION=nope - for i in "" 7.4 7.5 7.6 8.0; do - if test -r ${tryprefix}/lib/libtcl${i}.a; then - TCL_VERSION=$i - fi - done - if test "$TCL_VERSION" = nope; then - echo "configure: warning: Didn't find Tcl" 1>&2 - TCL_LIB_SPEC=-ltcl - TCLINC=" # -I.. Tcl/Tk include path" - else - if test "$TCL_VERSION" = ""; then - echo "$ac_t""7.3" 1>&6 - else - echo "$ac_t""$TCL_VERSION" 1>&6 - fi - TCL_LIB_SPEC=${tryprefix}/lib/libtcl${TCL_VERSION}.a - TCLINC=-I${tryprefix}/include - fi - TCLLIB="${TCL_LIB_SPEC} ${LIBS} -lm" + echo "$ac_t""Not found" 1>&6 fi trap '' 1 2 15 cat > confcache <<\EOF @@ -1391,7 +1122,6 @@ s%@TCLSH8.1@%$TCLSH8.1%g s%@TCLSH8.0@%$TCLSH8.0%g s%@TCLSH7.6@%$TCLSH7.6%g s%@TCLSH7.5@%$TCLSH7.5%g -s%@RANLIB@%$RANLIB%g CEOF EOF diff --git a/configure.in b/configure.in index ccba64e..017b51b 100644 --- a/configure.in +++ b/configure.in @@ -1,6 +1,6 @@ -dnl (c) Index Data 1996-2000 +dnl (c) Index Data 1996-2001 dnl See the file LICENSE for details. -dnl $Id: configure.in,v 1.6 2001/01/23 12:05:07 adam Exp $ +dnl $Id: configure.in,v 1.7 2001/10/26 13:26:11 adam Exp $ AC_INIT(hswitch.c) dnl ------ Substitutions AC_SUBST(CC) @@ -15,7 +15,7 @@ dnl dnl ------ look for Tcl tclconfig=NONE tryprefix=NONE -AC_ARG_WITH(tclconfig, [ --with-tclconfig Path for tclConfig.sh/tkConfig.sh], [tclconfig=$withval]) +AC_ARG_WITH(tclconfig, [ --with-tclconfig Path for tclConfig.sh], [tclconfig=$withval]) if test "x$tclconfig" = xNONE; then saveprefix=${prefix} AC_PREFIX_PROGRAM(tclsh) @@ -44,11 +44,19 @@ if test "x$tclconfig" = xNONE; then tryprefix=${prefix} prefix=${saveprefix} fi -if test -r ${tclconfig}/tclConfig.sh; then - AC_MSG_CHECKING(for Tcl) - . ${tclconfig}/tclConfig.sh +AC_MSG_CHECKING(for Tcl) +if test -d ${tclconfig}; then + tclconfig=${tclconfig}/tclConfig.sh +fi +if test -r ${tclconfig}; then + . ${tclconfig} TCLLIB="${TCL_LIB_SPEC} ${TCL_LIBS}" - TCLINC=-I${TCL_PREFIX}/include + + if test -d ${TCL_PREFIX}/include/tcl${TCL_VERSION}; then + TCLINC=-I${TCL_PREFIX}/include/tcl${TCL_VERSION} + else + TCLINC=-I${TCL_PREFIX}/include + fi RANLIB=$TCL_RANLIB SHLIB_CFLAGS=$TCL_SHLIB_CFLAGS SHLIB_LD=$TCL_SHLIB_LD @@ -57,32 +65,6 @@ if test -r ${tclconfig}/tclConfig.sh; then AC_MSG_RESULT($TCL_VERSION) CC=$TCL_CC else - AC_PROG_CC - SHLIB_CFLAGS="" - SHLIB_LD="shared-linker" - SHLIB_SUFFIX="" - SHLIB_VERSION="" - AC_PROG_RANLIB - AC_MSG_CHECKING(for Tcl) - TCL_VERSION=nope - for i in "" 7.4 7.5 7.6 8.0; do - if test -r ${tryprefix}/lib/libtcl${i}.a; then - TCL_VERSION=$i - fi - done - if test "$TCL_VERSION" = nope; then - AC_MSG_WARN(Didn't find Tcl) - TCL_LIB_SPEC=-ltcl - TCLINC=" # -I.. Tcl/Tk include path" - else - if test "$TCL_VERSION" = ""; then - AC_MSG_RESULT(7.3) - else - AC_MSG_RESULT($TCL_VERSION) - fi - TCL_LIB_SPEC=${tryprefix}/lib/libtcl${TCL_VERSION}.a - TCLINC=-I${tryprefix}/include - fi - TCLLIB="${TCL_LIB_SPEC} ${LIBS} -lm" + AC_MSG_RESULT(Not found) fi AC_OUTPUT(Makefile) diff --git a/robot.tcl b/robot.tcl index c9388bc..7c3dbe6 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,5 +1,5 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.20 2001/06/29 22:25:55 adam Exp $ +# $Id: robot.tcl,v 1.21 2001/10/26 13:26:11 adam Exp $ # proc RobotFileNext1 {area lead} { # puts "RobotFileNext1 area=$area lead=$lead" @@ -50,7 +50,7 @@ proc RobotReadRecord {inf fromurlx distancex} { } proc RobotFileNext {area} { - global robotSeq global idleTime ns + global robotSeq global idletime ns # puts "RobotFileNext robotSeq=$robotSeq" if {$robotSeq < 0} { @@ -67,7 +67,7 @@ proc RobotFileNext {area} { if {![string length $n]} { set robotSeq -1 flush stdout - puts "------------ N E X T R O U N D --------" + puts "Round robin" return wait } incr robotSeq @@ -201,7 +201,7 @@ proc RobotRestart {url sock} { proc RobotStart {} { global URL - global robotsRunning robotsMax idleTime + global robotsRunning robotsMax idletime # puts "RobotStart" while {1} { @@ -211,7 +211,7 @@ proc RobotStart {} { } incr robotsRunning if {[string compare $url wait] == 0} { - after $idleTime RobotRR + after $idletime RobotRR return } set r [RobotGetUrl $url {}] @@ -254,12 +254,14 @@ proc headSave {url out} { } proc RobotHref {url hrefx hostx pathx} { - global URL domains + global URL domains debuglevel upvar $hrefx href upvar $hostx host upvar $pathx path - puts "Ref url = $url href=$href" + if {$debuglevel > 1} { + puts "Ref input url = $url href=$href" + } if {[string first { } $href] >= 0} { return 0 @@ -344,14 +346,17 @@ proc RobotHref {url hrefx hostx pathx} { } regsub -all {~} $path {%7E} path set href "$method://$host$path" - puts "Ref href = $href" - return 1 + + if {$debuglevel > 1} { + puts "Ref result = $href" + } + return [checkrule url $href] } proc RobotError {url code} { global URL - puts "Bad URL $url, $code" + puts "Bad URL $url (code $code)" set fromurl {} set distance -1 if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { @@ -417,10 +422,10 @@ proc RobotRedirect {url tourl code} { } proc RobotTextHtml {url out} { - global URL maxDistance + global URL maxdistance set distance 0 - if {$maxDistance < 1000 && [info exists URL($url,dist)]} { + if {$maxdistance < 1000 && [info exists URL($url,dist)]} { set distance [expr $URL($url,dist) + 1] } htmlSwitch $URL($url,buf) \ @@ -446,7 +451,7 @@ proc RobotTextHtml {url out} { puts "no href" continue } - if {[expr $distance <= $maxDistance]} { + if {[expr $distance <= $maxdistance]} { set href [string trim $parm(href)] if {![RobotHref $url href host path]} continue @@ -496,7 +501,7 @@ proc RobotTextHtml {url out} { puts "no href" continue } - if {[expr $distance <= $maxDistance]} { + if {[expr $distance <= $maxdistance]} { set href [string trim $parm(href)] if {![RobotHref $url href host path]} continue @@ -599,6 +604,11 @@ proc Robot200 {url} { puts -nonewline $out $URL($url,buf) RobotFileClose $out + if {![checkrule mime $URL($url,head,content-type)]} { + RobotError $url mimedeny + return + } + set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)] puts $out "" @@ -615,19 +625,19 @@ proc Robot200 {url} { headSave $url $out puts "Parsing $url distance=$distance" switch $URL($url,head,content-type) { - text/html { - if {[string length $distance]} { - RobotTextHtml $url $out - } - } - text/plain { - RobotTextPlain $url $out - } - application/pdf { - set pdff [open test.pdf w] - puts -nonewline $pdff $URL($url,buf) - close $pdff - } + text/html { + if {[string length $distance]} { + RobotTextHtml $url $out + } + } + text/plain { + RobotTextPlain $url $out + } + application/pdf { + set pdff [open test.pdf w] + puts -nonewline $pdff $URL($url,buf) + close $pdff + } } puts $out "" RobotFileClose $out @@ -654,9 +664,11 @@ proc RobotReadContent {url sock binary} { } proc RobotReadHeader {url sock} { - global URL + global URL debuglevel - puts "RobotReadHeader $url" + if {$debuglevel > 1} { + puts "HTTP head $url" + } if {[catch {set buffer [read $sock 2148]}]} { RobotError $url 404 RobotRestart $url $sock @@ -685,7 +697,7 @@ proc RobotReadHeader {url sock} { set URL($url,head,[string tolower $name]) [string trim $value] } } - puts "code = $code" + puts "HTTP CODE $code" set URL($url,state) skip switch $code { 301 { @@ -747,7 +759,7 @@ proc RobotNop {} { proc RobotGetUrl {url phost} { global URL robotsRunning flush stdout - puts "RobotGetUrl --------- robotsRunning=$robotsRunning url=$url" + puts "Retrieve $robotsRunning url=$url" if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} { return -1 } @@ -803,7 +815,7 @@ if {![llength [info commands htmlSwitch]]} { } } -set agent "zmbot/0.0" +set agent "zmbot/0.1" if {![catch {set os [exec uname -s -r]}]} { set agent "$agent ($os)" } @@ -819,29 +831,91 @@ proc bgerror {m} { set robotsRunning 0 set robotSeq 0 set workdir [pwd] -set idleTime 60000 +set idletime 60000 set acceptLanguage {} set i 0 set l [llength $argv] -# For testing only -if {0} { - set url "http://www.sportsfiskeren.dk/sportsfiskeren/corner/index.htm" - set href "../../data/../../data2/newsovs.asp?Mode=5" - - set URL($url,path) /sportsfiskeren/corner/index.htm - set URL($url,hostport) www.sportsfiskeren.dk - RobotHref $url href host path - exit 0 -} - if {$l < 2} { - puts {tclrobot: usage [-j jobs] [-i idle] [-c count] [-d domain] [url ..]} + puts {tclrobot: usage:} + puts {tclrobot [-j jobs] [-i idle] [-c count] [-d domain] [-r rules] [url ..]} puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/" exit 1 } +# Rules: allow, deny, url +set debuglevel 0 + +proc checkrule {type this} { + global alrules + global debuglevel + + if {$debuglevel > 3} { + puts "CHECKRULE $type $this" + } + if {[info exist alrules]} { + foreach l $alrules { + if {$debuglevel > 3} { + puts "consider $l" + } + # consider type + if {[lindex $l 1] != $type} continue + # consider mask + if {![string match [lindex $l 2] $this]} continue + # OK, we have a match + if {[lindex $l 0] == "allow"} { + if {$debuglevel > 3} { + puts "CHECKRULE MATH OK" + } + return 1 + } else { + if {$debuglevel > 3} { + puts "CHECKFULE MATCH FAIL" + } + return 0 + } + } + } + if {$debuglevel > 3} { + puts "CHECKRULE MATH OK" + } + return 1 +} + + +proc url {href} { + global debuglevel + + if {[RobotHref http://www.indexdata.dk/ href host path]} { + if {![RobotFileExist visited $host $path]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf href 0 + RobotFileClose $outf + } + } +} + +proc deny {type stuff} { + global alrules + + lappend alrules [list deny $type $stuff] +} + +proc allow {type stuff} { + global alrules + + lappend alrules [list allow $type $stuff] +} + +proc debug {level} { + global debuglevel + + set debuglevel $level +} + +# Parse options + while {$i < $l} { set arg [lindex $argv $i] switch -glob -- $arg { @@ -852,9 +926,9 @@ while {$i < $l} { } } -c* { - set maxDistance [string range $arg 2 end] - if {![string length $maxDistance]} { - set maxDistance [lindex $argv [incr i]] + set maxdistance [string range $arg 2 end] + if {![string length $maxdistance]} { + set maxdistance [lindex $argv [incr i]] } } -d* { @@ -865,9 +939,9 @@ while {$i < $l} { lappend domains $dom } -i* { - set idleTime [string range $arg 2 end] - if {![string length $idleTime]} { - set idleTime [lindex $argv [incr i]] + set idletime [string range $arg 2 end] + if {![string length $idletime]} { + set idletime [lindex $argv [incr i]] } } -l* { @@ -876,6 +950,13 @@ while {$i < $l} { set acceptLanguage [lindex $argv [incr i]] } } + -r* { + set rfile [string range $arg 2 end] + if {![string length $rfile]} { + set rfile [lindex $argv [incr i]] + } + source $rfile + } default { set href $arg if {[RobotHref http://www.indexdata.dk/ href host path]} { @@ -893,15 +974,15 @@ while {$i < $l} { if {![info exist domains]} { set domains {*} } -if {![info exist maxDistance]} { - set maxDistance 50 +if {![info exist maxdistance]} { + set maxdistance 50 } if {![info exist robotsMax]} { set robotsMax 5 } puts "domains=$domains" -puts "max distance=$maxDistance" +puts "max distance=$maxdistance" puts "max jobs=$robotsMax" RobotStart diff --git a/rules b/rules new file mode 100644 index 0000000..a2d0ce7 --- /dev/null +++ b/rules @@ -0,0 +1,15 @@ +# sample rules $Id: rules,v 1.1 2001/10/26 13:26:11 adam Exp $ + +url http://www.indexdata.dk + +allow url http://www.indexdata.dk/software* +allow url http://www.indexdata.dk/yaz* +allow url http://www.indexdata.dk/ +deny url * + +allow mime text/html +allow mime application/pdf +deny mime text/plain +deny mime * + +set maxdistance 4