#!/usr/bin/perl
# grab - takes a url, finds all the files of a
# certain type linked to from
# that url, then downloads all of those.
# then it makes an index of them.
# steev hise, steev AT datamassage.com
# been tweaking this since at least early 1999...
# this code is licensed to all via the GNU GPL.
# see http://www.fsf.org/licenses/licenses.html#GPL
# for more information.
#
# $Id: grab,v 1.4 2001/12/12 17:24:14 steev Exp $
#
# $Log: grab,v $
# Revision 1.4 2001/12/12 17:24:14 steev
# made option handling better. added auto-directory creation.
#
# Revision 1.3 2001/10/30 07:12:03 steev
# added non-indexing option.
#
# Revision 1.2 2001/08/27 05:47:11 steev
# modified to use LWP. Also fakes referer. works beautifully.
#
#
###########################################################
use Getopt::Std;
use File::Path;
$| = 1;                  # unbuffer STDOUT so progress prints immediately
require "flush.pl";      # old Perl4 library; provides flush() used below
#$lynx = "/usr/bin/lynx -useragent=Mozilla/4.7 ";
getopts('e:r:i');        # -e ext, -r min-max range, -i skip index
# -e EXT: file extension to hunt for; default is jpg.
$extension = $opt_e ? $opt_e : "jpg";
$origurl  = shift @ARGV;
$localdir = shift @ARGV;
# Destination directory handling: default to the current directory,
# create it if it doesn't exist yet, refuse if it exists but isn't a dir.
if (!$localdir) {
    $localdir = '.';
} elsif (!-e $localdir) {
    mkpath($localdir) || die "error creating $localdir: $!";
} elsif (!-d $localdir) {
    die "$localdir is not a directory";
}
# Build @urls, the list of files to fetch, one of two ways:
# 1) -r min-max: synthesize numbered urls ORIGURL<min>.EXT .. ORIGURL<max>.EXT
# 2) otherwise: fetch ORIGURL itself and scrape it for links/images
#    ending in .EXT.
if ($opt_r) {
    ($min, $max) = split /-/, $opt_r;
    for $i ($min .. $max) {
        push @urls, $origurl . $i . ".$extension";
    }
} else {
    # or we get that page.
    print "getting page ", $origurl, "....\n";
    $page = &webget($origurl);
    # keep a debug copy of the fetched page
    open(OUT, ">/tmp/tmp.html");
    print OUT $page;
    close(OUT);    # was left open; close so the debug copy is complete
    print STDOUT "grabbed html\n";
    # split the url into site (scheme+host) and path portions
    $origurl =~ m#(http://[^/]*)/*([^ ]*)#;
    $site = $1;
    $file = "/" . $2;
    @path = split('/', $file);
    # who cares about the original file name.
    # however, if url is just a directory, keep whole thing.
    if ($file !~ /\/$/) { pop @path; }
    $path = join('/', @path);
    print STDOUT "Scanning page $origurl for files ending in .$extension.\n";
    # remove linebreaks cuz they can mess matching up.
    $page =~ s/\n|\r//g;
    # now scan the page for links/images with the wanted extension.
    # NOTE(review): the original match on this line was corrupted in the
    # source file; reconstructed from the surviving capture usage
    # ($1 = path part, $2 = extension) — confirm against an old revision.
    while ($page =~ m/(?:href|src)\s*=\s*["']?([^"'<> ]+)\.($extension)/ig) {
        $url = "$1.$2";
        # its either full, absolute, or relative
        if ($url =~ /^http/i) {
            push @urls, $url;    # full url: add it unchanged
            next;
        } elsif ($url =~ /^\//) {
            push @urls, "$site$url";    # absolute path on the same site
            next;
        } else {
            push @urls, "$site$path/$url";    # relative to the page's dir
            next;
        }
    }
}
# Download every url in @urls into $localdir, reporting progress.
# A fetch that yields fewer than ~500 bytes is treated as an error page
# and deleted; failures are collected in @errors for the summary below.
$count = int(@urls);
print STDOUT "\nDone scanning. Found $count files\n";
#unlink "/tmp/tmp.html";
die "Didn't find any files.\n" unless @urls;
foreach $image (@urls) {
    $image =~ s/\s//g;    # remove spaces in url
    $i++;
    @path = split('/', $image);
    $local = $path[$#path];    # get just the filename
    # if local file already exists, prefix the time to avoid clobbering it.
    if (-e "$localdir/$local") { $local = time . $local; }
    print STDOUT " $i: $image -- ";
    flush(STDOUT);
    open(IMG, ">$localdir/$local") || die "can't write to $localdir/$local:$!\n";
    binmode(IMG);    # image data is binary; matters on non-Unix platforms
    print IMG &webget($image, $origurl);
    # close BEFORE the -s size check below — otherwise buffered output may
    # not be on disk yet and the size test (and $totalsize) is wrong.
    close(IMG) || warn "close $localdir/$local: $!\n";
    $size = -s "$localdir/$local";
    if ($size > 500) {
        print STDOUT "saving to $localdir/$local",
            " size: $size.\n";
        $totalsize = $totalsize + (-s "$localdir/$local");
    } else {
        print STDOUT "ERROR.\n\n";
        push @errors, $image;
        unlink "$localdir/$local";    # if it's too small, then delete.
    }
    flush(STDOUT);
}
print "total bytes downloaded: $totalsize.\n";
if (@errors) {
    print "the following files couldnt be downloaded:\n";
    print join("\n ", @errors);
    print "\n\n";
}
# now count what we have and make an index
# but not if the -i switch was given.
if ($opt_i) {
    print "\ndone.\n";
    exit;
}
print "Building index.html.\n";
opendir(DIR, $localdir) || die "can't read directory $localdir: $!";
# \Q...\E so a user-supplied extension can't inject regex metacharacters
@images = sort grep(/\Q$extension\E$/i, readdir(DIR));
closedir(DIR);
open(FILE, ">$localdir/index.html") || die "no open $localdir/index.html";
# NOTE(review): the heredoc body here was corrupted in the source file;
# reconstructed as a minimal page listing each downloaded file — confirm
# against an old revision before relying on the exact markup.
print FILE <<"EndOT";
<html>
<head><title>index of $origurl</title></head>
<body>
<h1>index of <a href="$origurl">$origurl</a></h1>
EndOT
foreach $img (@images) {
    print FILE qq{<a href="$img">$img</a><br>\n};
}
print FILE qq{<p><a href="..">up</a>\n};
print FILE "</body></html>\n";
close FILE;
print "done.\n\n";

# subroutines ########
# this just grabs a url, instead of using Lynx.
# give it a url and optionally, a referrer.
# Returns the page/file content on success, 0 on HTTP error.
sub webget {
    use LWP::UserAgent;
    use HTTP::Request;
    use HTTP::Response;
    my ($url, $referer) = @_;
    # fake a referrer if the caller didn't supply one
    unless (defined $referer) { $referer = 'http://disney.com' }
    $| = 1;    # to flush next line
    # printf "%s =>\n\t", $url;
    my $ua = LWP::UserAgent->new();
    $ua->agent("Mozilla/4.7");    # pretend, just in case
    my $req = HTTP::Request->new(GET => $url);
    $req->referer($referer);
    my $response = $ua->request($req);
    if ($response->is_error()) {
        # BUG FIX: warn() is not printf — the original passed a "%s" format
        # string and the status line as separate list elements; use sprintf.
        warn sprintf(" %s\n", $response->status_line);
        return 0;
    } else {
        my $content = $response->content();
        return $content;
    }
}