Perlfect Solutions
 

[Perlfect-search] [PATCH] Indexer patches

Vlad Romanenko perlfect-search@perlfect.com
Mon, 30 Sep 2002 14:01:19 +0300
This is a multi-part message in MIME format.

------=_NextPart_000_002F_01C26889.D9E4E460
Content-Type: text/plain;
        charset="US-ASCII"
Content-Transfer-Encoding: 7bit

> From: Daniel Naber [mailto:daniel.naber@t-online.de] 
> I've now set up a page with descriptions on how to get the 
> source: http://www.danielnaber.de/perlfectsearch/cvs.php

Daniel, thank you very much!

I'd like to contribute some more patches:
1) search_form.html: chmod a-x search_form.html
2) tools.pl: get_url(): Check for a request error first and for the
content-type second.
Attached tools.pl.patch
3) indexer_filesystem.pl: crawl_filesystem(): Use to_be_ignored() to
check whether a directory is in the @no_index list, so that
cut_document_root() is executed on the directory name. Otherwise
directories aren't properly ignored.
Attached indexer_filesystem.pl.patch

Regards,
Vlad.

------=_NextPart_000_002F_01C26889.D9E4E460
Content-Type: application/octet-stream;
        name="tools.pl.patch"
Content-Transfer-Encoding: quoted-printable
Content-Disposition: attachment;
        filename="tools.pl.patch"

--- tools.pl    Mon Sep 30 12:22:18 2002=0A=
+++ /home/vlad/my/work/itcs-com-au/cgi-bin/search/tools.pl      Mon Sep 30 =
12:20:39 2002=0A=
@@ -36,17 +36,21 @@=0A=
 =0A=
   my $request =3D HTTP::Request->new(GET =3D> $url);=0A=
   my $response =3D $http_user_agent->request($request);=0A=
-  my $buffer =3D $response->content;=0A=
-  my ($content_type) =3D ($response->headers_as_string =3D~ =
m/^Content-Type:\s*(.+)$/im);=0A=
-  $content_type =3D~ s/^(.*?);.*$/$1/;         # ignore possible charset value=0A=
-  if( ! grep(/^$content_type$/i, @HTTP_CONTENT_TYPES) ) {=0A=
-    print STDERR "Ignoring '$url': content-type '$content_type'\n" if( =
$HTTP_DEBUG );=0A=
-    return;=0A=
-  }=0A=
   if( $response->is_error ) {=0A=
     print STDERR "Error: Couldn't get '$url': response code " =
.$response->code. "\n";=0A=
     return;=0A=
   }=0A=
+=0A=
+  if( $response->headers_as_string =3D~ m/^Content-Type:\s*(.+)$/im ) {=0A=
+    my $content_type =3D $1;=0A=
+    $content_type =3D~ s/^(.*?);.*$/$1/;               # ignore possible charset =
value=0A=
+    if( ! grep(/^$content_type$/i, @HTTP_CONTENT_TYPES) ) {=0A=
+      print STDERR "Ignoring '$url': content-type '$content_type'\n" =
if( $HTTP_DEBUG );=0A=
+      return;=0A=
+    }=0A=
+  }=0A=
+=0A=
+  my $buffer =3D $response->content;=0A=
   my $size =3D length($buffer);=0A=
   print STDERR "Fetched  '$url', $size bytes\n" if( $HTTP_DEBUG );=0A=
   # Maybe we are we redirected, so use the new URL.=0A=

------=_NextPart_000_002F_01C26889.D9E4E460
Content-Type: application/octet-stream;
        name="indexer_filesystem.pl.patch"
Content-Transfer-Encoding: quoted-printable
Content-Disposition: attachment;
        filename="indexer_filesystem.pl.patch"

--- indexer_filesystem.pl       Mon Sep 30 12:22:18 2002=0A=
+++ /home/vlad/my/work/itcs-com-au/cgi-bin/search/indexer_filesystem.pl =
Mon Sep 30 12:47:45 2002=0A=
@@ -28,7 +28,7 @@=0A=
   my $dir =3D $_[0];=0A=
   my $doc_id;=0A=
   my $file;=0A=
-  =0A=
+=0A=
   print $dir,"\n";=0A=
 =0A=
   chdir $dir or (warn "Cannot chdir $dir: $!" and return);=0A=
@@ -39,7 +39,7 @@=0A=
   # to ignore symbolic links, add "and not -l" to both greps:=0A=
   my @dirs  =3D grep {-d and not /^\.{1,2}$/} @contents; =0A=
   my @files =3D grep {-f and /^.+\.(.+)$/ and grep {/^\Q$1\E$/} @EXT} =
@contents;=0A=
-  =0A=
+=0A=
   FILE: foreach my $f (@files) {=0A=
     $file =3D $dir."/".$f;=0A=
     $file =3D~ s/\/\//\//og;=0A=
@@ -79,10 +79,9 @@=0A=
   DIR: foreach my $d (@dirs) {=0A=
     $file =3D $dir."/".$d;=0A=
     $file =3D~ s/\/\//\//og;=0A=
-    =0A=
-    foreach my $regexp (@no_index) {=0A=
-      next DIR if $file =3D~ /^$regexp$/;=0A=
-    }=0A=
+=0A=
+    next DIR if( to_be_ignored($file) );=0A=
+=0A=
     crawl_filesystem($file);=0A=
   }=0A=
 }=0A=

------=_NextPart_000_002F_01C26889.D9E4E460--