|
|
[Perlfect-search] [PATCH] Indexer patches
Vlad Romanenko perlfect-search@perlfect.com
Mon, 30 Sep 2002 14:01:19 +0300
This is a multi-part message in MIME format.
------=_NextPart_000_002F_01C26889.D9E4E460
Content-Type: text/plain;
charset="US-ASCII"
Content-Transfer-Encoding: 7bit
> From: Daniel Naber [mailto:daniel.naber@t-online.de]
> I've now set up a page with descriptions on how to get the
> source: http://www.danielnaber.de/perlfectsearch/cvs.php
Daniel, thank you very much!
I'd like to contribute some more patches:
1) search_form.html: chmod a-x search_form.html
2) tools.pl: get_url(): Check for a request error first and for the
content-type second.
Attached tools.pl.patch
3) indexer_filesystem.pl: crawl_filesystem(): Use to_be_ignored() to
check whether a directory is in the @no_index list, so that
cut_document_root() is executed on the directory name. Otherwise
directories aren't properly ignored.
Attached indexer_filesystem.pl.patch
Regards,
Vlad.
------=_NextPart_000_002F_01C26889.D9E4E460
Content-Type: application/octet-stream;
name="tools.pl.patch"
Content-Transfer-Encoding: quoted-printable
Content-Disposition: attachment;
filename="tools.pl.patch"
--- tools.pl Mon Sep 30 12:22:18 2002=0A=
+++ /home/vlad/my/work/itcs-com-au/cgi-bin/search/tools.pl Mon Sep 30 =
12:20:39 2002=0A=
@@ -36,17 +36,21 @@=0A=
=0A=
my $request =3D HTTP::Request->new(GET =3D> $url);=0A=
my $response =3D $http_user_agent->request($request);=0A=
- my $buffer =3D $response->content;=0A=
- my ($content_type) =3D ($response->headers_as_string =3D~ =
m/^Content-Type:\s*(.+)$/im);=0A=
- $content_type =3D~ s/^(.*?);.*$/$1/; # ignore possible charset value=0A=
- if( ! grep(/^$content_type$/i, @HTTP_CONTENT_TYPES) ) {=0A=
- print STDERR "Ignoring '$url': content-type '$content_type'\n" if( =
$HTTP_DEBUG );=0A=
- return;=0A=
- }=0A=
if( $response->is_error ) {=0A=
print STDERR "Error: Couldn't get '$url': response code " =
.$response->code. "\n";=0A=
return;=0A=
}=0A=
+=0A=
+ if( $response->headers_as_string =3D~ m/^Content-Type:\s*(.+)$/im ) {=0A=
+ my $content_type =3D $1;=0A=
+ $content_type =3D~ s/^(.*?);.*$/$1/; # ignore possible charset =
value=0A=
+ if( ! grep(/^$content_type$/i, @HTTP_CONTENT_TYPES) ) {=0A=
+ print STDERR "Ignoring '$url': content-type '$content_type'\n" =
if( $HTTP_DEBUG );=0A=
+ return;=0A=
+ }=0A=
+ }=0A=
+=0A=
+ my $buffer =3D $response->content;=0A=
my $size =3D length($buffer);=0A=
print STDERR "Fetched '$url', $size bytes\n" if( $HTTP_DEBUG );=0A=
# Maybe we are we redirected, so use the new URL.=0A=
------=_NextPart_000_002F_01C26889.D9E4E460
Content-Type: application/octet-stream;
name="indexer_filesystem.pl.patch"
Content-Transfer-Encoding: quoted-printable
Content-Disposition: attachment;
filename="indexer_filesystem.pl.patch"
--- indexer_filesystem.pl Mon Sep 30 12:22:18 2002=0A=
+++ /home/vlad/my/work/itcs-com-au/cgi-bin/search/indexer_filesystem.pl =
Mon Sep 30 12:47:45 2002=0A=
@@ -28,7 +28,7 @@=0A=
my $dir =3D $_[0];=0A=
my $doc_id;=0A=
my $file;=0A=
- =0A=
+=0A=
print $dir,"\n";=0A=
=0A=
chdir $dir or (warn "Cannot chdir $dir: $!" and return);=0A=
@@ -39,7 +39,7 @@=0A=
# to ignore symbolic links, add "and not -l" to both greps:=0A=
my @dirs =3D grep {-d and not /^\.{1,2}$/} @contents; =0A=
my @files =3D grep {-f and /^.+\.(.+)$/ and grep {/^\Q$1\E$/} @EXT} =
@contents;=0A=
- =0A=
+=0A=
FILE: foreach my $f (@files) {=0A=
$file =3D $dir."/".$f;=0A=
$file =3D~ s/\/\//\//og;=0A=
@@ -79,10 +79,9 @@=0A=
DIR: foreach my $d (@dirs) {=0A=
$file =3D $dir."/".$d;=0A=
$file =3D~ s/\/\//\//og;=0A=
- =0A=
- foreach my $regexp (@no_index) {=0A=
- next DIR if $file =3D~ /^$regexp$/;=0A=
- }=0A=
+=0A=
+ next DIR if( to_be_ignored($file) );=0A=
+=0A=
crawl_filesystem($file);=0A=
}=0A=
}=0A=
------=_NextPart_000_002F_01C26889.D9E4E460--
|
|