Added README. Ignore case in keywords in robots.txt.

author Adam Dickmeiss <adam@indexdata.dk>

Wed, 6 Jun 2001 07:10:31 +0000 (07:10 +0000)

committer Adam Dickmeiss <adam@indexdata.dk>

Wed, 6 Jun 2001 07:10:31 +0000 (07:10 +0000)
author Adam Dickmeiss <adam@indexdata.dk>
Wed, 6 Jun 2001 07:10:31 +0000 (07:10 +0000)
committer Adam Dickmeiss <adam@indexdata.dk>
Wed, 6 Jun 2001 07:10:31 +0000 (07:10 +0000)
diff --git a/README b/README

new file mode 100644 (file)

index 0000000..e7b1184
--- /dev/null
+++ b/README
@@ -0,0 +1,71 @@
+zmbot: a Simple Web harvesting robot for Z'mbol.
+
+Introduction
+
+  zmbot is a simple web harvester written in Tcl. The following
+  summarizes its features:
+
+  o Simple administration. One script does the job and no external
+    database is required to operate.
+
+  o Interruptible. Harvesting may safely be stopped/interrupted at any
+    point.
+
+  o Gentle harvesting. By default a site is visited once per minute -
+    robots.txt honored.
+
+  o Concurrent harvesting (jobs) in one process and one thread.
+
+  o Inspects content-type header to determine structure of page.
+
+  o Written in Tcl and is quite portable. (Some may not consider this
+    a feature; a Perl version is welcome!)
+
+  o Creates simple XML output. One file per URL.
+
+  The robot is started from the command line and takes one or more URLs
+  as parameter(s). Options, prefixed with minus, alter the behaviour of
+  the harvesting. The following options are supported:
+
+   -j jobs    The maximum number of concurrent HTTP sessions; default 5 jobs.
+
+   -i idle    Idle time in milliseconds between visits to the same site;
+              default 60000 = 60 seconds.
+
+   -c count   Maximum distance from original URL as given from the command
+              line; default 50. 
+
+
+   -d domain  Only sites matching domain are visited. The domain given is
+              a Tcl glob expression (e.g. *.somewhere.com). Remember to
+              quote the domain when given on the command line so that your
+              shell doesn't expand it. This option may be repeated, thus
+              allowing you to specify many "allowed" domains.
+ 
+  Example 1: Harvest three links away from www.somewhere.com world-wide:
+   ./robot.tcl -c 3 http://www.somewhere.com/
+
+  Example 2: Harvest the site www.somwhere.com only:
+   ./robot.tcl -d www.somewhere.com http://www.somewhere.com/
+
+  Example 3: Harvest up to two clicks from www.a.dk and www.b.dk in the dk domain:
+   ./robot.tcl -d '*.dk' -c 2 http://www.a.dk/ http://www.b.dk/
+
+  The zmbot robot creates three directories, visited, unvisited, bad
+  for visited pages, unvisited pages, and bad pages respectively. The
+  visited area holds keywords and metadata for all successfully retrieved
+  pages. The unvisited area serves as a "todo" list of pages to be visited
+  in the future. The bad area holds pages that for some reason cannot be
+  retrieved: non-existent, permission denied, robots.txt disallow, etc.
+
+Installation:
+
+  $ ./configure
+  $ make
+
+  The configure script looks for the Tcl shell, tclsh, to determine the
+  location of Tcl and its configuration file tclConfig.sh. To manually specify
+  Tcl's location, add --with-tclconfig and specify the directory where
+  tclConfig.sh is installed. For example:
+    ./configure --with-tclconfig=/usr/local/lib
+
diff --git a/robot.tcl b/robot.tcl

index 5bd9f82..5c2b518 100755 (executable)
--- a/robot.tcl
+++ b/robot.tcl
@@ -1,5 +1,5 @@
  #!/usr/bin/tclsh 
-# $Id: robot.tcl,v 1.15 2001/06/05 08:44:50 adam Exp $
+# $Id: robot.tcl,v 1.16 2001/06/06 07:10:31 adam Exp $
  #
  proc RobotFileNext1 {area lead} {
      puts "RobotFileNext1 area=$area lead=$lead"
@@ -484,6 +484,56 @@ proc RobotTextHtml {url out} {
                     }
                 }
             }
+        } -nonest area {
+            if {![info exists parm(href)]} {
+               puts "no href"
+               continue
+            }
+           if {[expr $distance <= $maxDistance]} {
+               set href [string trim $parm(href)]
+               if {![RobotHref $url href host path]} continue
+               
+               puts $out "<cr>"
+               puts $out "<identifier>$href</identifier>"
+               puts $out "<description></description>"
+               puts $out "</cr>"
+
+               if {![RobotFileExist visited $host $path]} {
+                   set olddistance 1000
+                   if {![RobotFileExist bad $host $path]} {
+                       if {[RobotFileExist unvisited $host $path]} {
+                           set inf [RobotFileOpen unvisited $host $path r]
+                           RobotReadRecord $inf oldurl olddistance
+                           RobotFileClose $inf
+                       }
+                   } else {
+                       set olddistance 0
+                   }
+                   if {[string length $olddistance] == 0} {
+                       set olddistance 1000
+                   }
+                   if {[expr $distance < $olddistance]} {
+                       set outf [RobotFileOpen unvisited $host $path]
+                       RobotWriteRecord $outf $url $distance
+                       RobotFileClose $outf
+                   }
+               } elseif {[string compare $href $url]} {
+                   set inf [RobotFileOpen visited $host $path r]
+                   RobotReadRecord $inf xurl olddistance
+                   close $inf
+                   if {[string length $olddistance] == 0} {
+                       set olddistance 1000
+                   }
+                   if {[expr $distance < $olddistance]} {
+                       puts "OK remarking url=$url href=$href"
+                       puts "olddistance = $olddistance"
+                       puts "newdistance = $distance"
+                       set outf [RobotFileOpen unvisited $host $path]
+                       RobotWriteRecord $outf $url $distance
+                       RobotFileClose $outf
+                   }
+               }
+           }
         }
  }
  
@@ -497,21 +547,21 @@ proc RobotsTxt0 {v buf} {
      global URL agent
      set section 0
      foreach l [split $buf \n] {
-       if {[regexp {([-A-Za-z]+):[ \t]*([^\#]+)} $l match cmd arg]} {
+       if {[regexp {([-A-Za-z]+):[ \t]*([^\#\t ]+)} $l match cmd arg]} {
             puts "cmd=$cmd arg=$arg"
-           switch $cmd {
-               User-Agent {
+           switch -- [string tolower $cmd] {
+               user-agent {
                     if {$section} break
                     set pat [string tolower $arg]*
                     set section [string match $pat $agent]
                 }
-               Disallow {
+               disallow {
                     if {$section} {
                         puts "rule [list 0 $arg]"
                         lappend $v [list 0 $arg]
                     }
                 }
-               Allow {
+               allow {
                     if {$section} {
                         puts "rule [list 1 $arg]"
                         lappend $v [list 1 $arg]
@@ -707,7 +757,7 @@ proc RobotGetUrl {url phost} {
                 set buf [read $inf 32768]
                 close $inf
             } else {
-               set buf "User-Agent: *\nAllow: /\n"
+               set buf "User-agent: *\nAllow: /\n"
             }
             RobotsTxt0 URL($hostport,robots) $buf
         }
@@ -720,6 +770,7 @@ proc RobotGetUrl {url phost} {
             }
         }
         if {!$ok} {
+           puts "skipped due to robots.txt"
             return -1
         }
      }
author	Adam Dickmeiss <adam@indexdata.dk>
	Wed, 6 Jun 2001 07:10:31 +0000 (07:10 +0000)
committer	Adam Dickmeiss <adam@indexdata.dk>
	Wed, 6 Jun 2001 07:10:31 +0000 (07:10 +0000)
README	[new file with mode: 0644]	patch \| blob
robot.tcl		patch \| blob \| history