Perlfect Solutions
 

[Perlfect-search] Robots.txt, for 3.30

belg4mit@MIT.EDU
Tue, 9 Apr 2002 21:07:24 -0400

* With a "real attachement" (what's wrong with cut-and-paste? ;-) *

This adds robots.txt adherence to 3.30
(same patch as before, but that one was against 3.20).

This requires LWP. You do not have to be doing HTTP indexing,
but you must have LWP installed to take advantage of this feature.
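Not sure whether LWP is already there? A quick check (just a
sketch) is:

    perl -MLWP -e 'print "LWP $LWP::VERSION\n"'

If that dies with "Can't locate LWP.pm", install libwww-perl first.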

Apply the patch and simply add a $ROBOT_AGENT to your conf.pl.
The value of this variable is the agent name the indexer will check
against in robots.txt. (To disable the feature, set it to a false value.)
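For example, in conf.pl (the agent name below is only an
illustration, use whatever name you want robots.txt to match):

    $ROBOT_AGENT = 'perlfect-search';  # obey robots.txt rules for this agent
    # $ROBOT_AGENT = '';               # false value: feature disabled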

Finally, this adds one minor feature: if your IGNORE* variables
are not true (0, '', or commented out), the indexer does not try
to strip them, and should thus run faster still.
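That is, with something like this in conf.pl the stripping pass is
skipped entirely:

    $IGNORE_TEXT_START = '';
    $IGNORE_TEXT_END   = '';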

NOTE: there are 3 long lines in the patch (their opening characters
are shown below); make sure your mailer hasn't wrapped them.

-  ${$buffer}
+  warn("Not
   # For PDF


--- /tmp/search-3.30/indexer.pl Tue Mar 19 14:17:35 2002
+++ indexer.pl  Tue Apr  9 03:22:54 2002
@@ -129,6 +129,25 @@

 print "Building string of special characters...\n";
 build_char_string();
+if( $ROBOT_AGENT ){
+  print "Loading robots.txt...\n";
+  eval "use WWW::RobotRules";
+  warn("Not obeying robots.txt, do you have libwww installed? $@") && ($ROBOT_AGENT = '') if $@;
+  $ROBOT = WWW::RobotRules->new($ROBOT_AGENT) if $ROBOT_AGENT;
+  my($robots_txt, $url);
+  if( $HTTP_START_URL ) {
+    my $http_user_agent = LWP::UserAgent->new;
+    (undef, $robots_txt) = get_url($url = $HTTP_START_URL.'/robots.txt');
+  }
+  else {
+    open(ROBOT, $url = "$DOCUMENT_ROOT/robots.txt") ||
+      warn("Could not fetch robots.txt: $!");
+    read(ROBOT, $robots_txt, -s $url);
+    $url = 'http://localhost/robots.txt';
+  }
+  # Give some - output here like no index?
+  $ROBOT->parse($url, $robots_txt) if $ROBOT && $robots_txt;
+}
 print "Loading \'no index\' regular expressions:\n";
 load_excludes();
 print "Loading stopwords...";
@@ -309,7 +328,8 @@
 sub normalize {
   my $buffer = $_[0];

-  ${$buffer} =~ s/$IGNORE_TEXT_START.*?$IGNORE_TEXT_END//gis;  # strip user defined parts
+  if( $IGNORE_TEXT_START ){ # strip user defined parts
+    ${$buffer} =~ s/$IGNORE_TEXT_START.*?$IGNORE_TEXT_END//gis; }
   ${$buffer} =~ s/<!--.*?-->//gis;  # strip html comments
   ${$buffer} =~ s/-(\s*\n\s*)?//g;  # join parts of hyphenated words

@@ -514,6 +534,13 @@
       return "listed in no_index.txt";
     }
   }
+  if( $ROBOT_AGENT ){
+    my $url = $HTTP_START_URL ? $file : "http://localhost$file_relative";
+    unless( $ROBOT->allowed($url) ){
+      print STDERR "'$url': robots.txt forbids indexing\n" if( $HTTP_DEBUG );
+      return "disallowed by robots.txt";
+    }
+  }
   # For PDF files check filename for security reasons (it later gets handed to a shell!):
   if( isPDF($file) && $PDFTOTEXT ) {
     if( $file !~ m/^[\/\\a-zA-Z0-9_.:+-]*$/ || $file =~ m/\.\./ ) {
@@ -640,4 +667,5 @@
   $zz = $TMP_DIR;
   $zz = $CONTENT_DB_FILE;
   $zz = $INDEX_NUMBERS;
+  $zz = $HTTP_DEBUG;
 }
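
For the curious, here is a minimal standalone sketch of the
WWW::RobotRules calls the patch relies on (the agent name and URL
are made up):

    use WWW::RobotRules;
    use LWP::Simple qw(get);

    my $rules = WWW::RobotRules->new('perlfect-search');
    my $url   = 'http://localhost/robots.txt';
    my $txt   = get($url);              # undef if the fetch fails
    $rules->parse($url, $txt) if defined $txt;

    # allowed() is false for URLs robots.txt disallows for our agent
    print "indexable\n" if $rules->allowed('http://localhost/some/page.html');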
