Perlfect Solutions
 

[Perlfect-search] Robots.txt, for 3.30

Jerrad Pierce belg4mit@MIT.EDU
Mon, 08 Apr 2002 23:42:21 -0400
This adds support for obeying robots.txt to 3.30
(same patch as before, but that one was against 3.20).

This requires LWP. You do not have to be doing http indexing,
but you must have LWP installed to take advantage of this feature.
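
If you are not sure whether you have it, a quick check from the
command line (this just tries to load the module the patch uses):

  perl -MWWW::RobotRules -e 'print "ok\n"'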

Apply the patch and simply add a $ROBOT_AGENT to your conf.pl.
The value of this variable is the agent name the indexer will
check against the rules in robots.txt. (To disable, set it to
false.)
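
For example, in conf.pl (the agent name below is only illustrative;
use whatever name your robots.txt rules target):

  $ROBOT_AGENT = 'PerlfectSearch'; # matched against User-agent lines
  #$ROBOT_AGENT = '';              # or leave false to disable the check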

Finally, this adds one minor feature: if your IGNORE* variables
are not true (0, '', or commented out), the indexer does not try
to strip them, and so should run faster still.
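
For instance, leaving the markers false in conf.pl (the marker
values below are made up) means normalize() skips that substitution
entirely:

  #$IGNORE_TEXT_START = '<!--\s*noindex\s*-->';
  #$IGNORE_TEXT_END   = '<!--\s*\/noindex\s*-->';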

NOTE: there are 3 long lines in the patch; make sure your mail
client hasn't wrapped them. They begin as follows:

-  ${$buffer}
+  warn("Not
   # For PDF

--- /tmp/search-3.30/indexer.pl Tue Mar 19 14:17:35 2002
+++ indexer.pl  Tue Apr  9 03:22:54 2002
@@ -129,6 +129,25 @@

 print "Building string of special characters...\n";
 build_char_string();
+if( $ROBOT_AGENT ){
+  print "Loading robots.txt...\n";
+  eval "use WWW::RobotRules";
+  warn("Not obeying robots.txt, do you have libwww installed? $!") && ($ROBOT_AGENT = '') if $@;
+  $ROBOT = WWW::RobotRules->new($ROBOT_AGENT);
+  my($robots_txt, $url);
+  if( $HTTP_START_URL ) {
+    my $http_user_agent = LWP::UserAgent->new;
+    (undef, $robots_txt) = get_url($url=$HTTP_START_URL.'/robots.txt');
+  }
+  else{
+    open(ROBOT, $url="$DOCUMENT_ROOT/robots.txt") ||
+      warn("Could not fetch robots.txt: $!");
+    read(ROBOT, $robots_txt, -s $url);
+    $url = 'http://localhost/robots.txt';
+  }
+  # TODO: print some diagnostic output here, like the 'no index' message?
+  $ROBOT->parse($url, $robots_txt) if $ROBOT && $robots_txt;
+}
 print "Loading \'no index\' regular expressions:\n";
 load_excludes();
 print "Loading stopwords...";
@@ -309,7 +328,8 @@
 sub normalize {
   my $buffer = $_[0];

-  ${$buffer} =~ s/$IGNORE_TEXT_START.*?$IGNORE_TEXT_END//gis;  # strip user defined parts
+  if( $IGNORE_TEXT_START ){ # strip user defined parts
+    ${$buffer} =~ s/$IGNORE_TEXT_START.*?$IGNORE_TEXT_END//gis; }
   ${$buffer} =~ s/<!--.*?-->//gis;  # strip html comments
   ${$buffer} =~ s/-(\s*\n\s*)?//g;  # join parts of hyphenated words

@@ -514,6 +534,13 @@
       return "listed in no_index.txt";
     }
   }
+  if( $ROBOT_AGENT ){
+    my $url = $HTTP_START_URL ? $file : "http://localhost$file_relative";
+    unless( $ROBOT->allowed($url) ){
+      print STDERR "'$file': robots.txt forbids indexing\n" if( $HTTP_DEBUG );
+      return "disallowed by robots.txt";
+    }
+  }
   # For PDF files check filename for security reasons (it later gets handed to a shell!):
   if( isPDF($file) && $PDFTOTEXT ) {
     if( $file !~ m/^[\/\\a-zA-Z0-9_.:+-]*$/ || $file =~ m/\.\./ ) {
@@ -640,4 +667,5 @@
   $zz = $TMP_DIR;
   $zz = $CONTENT_DB_FILE;
   $zz = $INDEX_NUMBERS;
+  $zz = $HTTP_DEBUG;
 }
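
For reference, the WWW::RobotRules flow the patch relies on looks
like this in isolation (a minimal sketch; the agent name, host, and
path are placeholders):

  #!/usr/bin/perl -w
  use strict;
  use WWW::RobotRules;
  use LWP::Simple qw(get);

  # The agent name must match a User-agent line in robots.txt,
  # otherwise the 'User-agent: *' rules apply.
  my $rules = WWW::RobotRules->new('PerlfectSearch');

  my $url = 'http://localhost/robots.txt';
  my $robots_txt = get($url);
  $rules->parse($url, $robots_txt) if defined $robots_txt;

  # allowed() expects a full URL, which is why the patch fabricates
  # http://localhost/... URLs when indexing the filesystem.
  print $rules->allowed('http://localhost/private/index.html')
        ? "indexable\n"
        : "disallowed by robots.txt\n";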
-- 
MOTD on Pungenday, the 25th of Discord, in the YOLD 3168:
I am a cauliflower under an enormous chessboard.