#!/usr/bin/perl
# grab - takes a URL, finds all the files of a
# certain type that the page links to,
# then downloads all of them
# and builds an index.html listing them.
# usage: grab [-e extension] [-r min-max] [-i] url [directory]
# steev hise, steev AT datamassage.com
# been tweaking this since at least early 1999...
# this code is licensed to all via the GNU GPL.
# see http://www.fsf.org/licenses/licenses.html#GPL
# for more information.
#
# $Id: grab,v 1.4 2001/12/12 17:24:14 steev Exp $
#
# $Log: grab,v $
# Revision 1.4  2001/12/12 17:24:14  steev
# made option handling better. added auto-directory creation.
#
# Revision 1.3  2001/10/30 07:12:03  steev
# added non-indexing option.
#
# Revision 1.2  2001/08/27 05:47:11  steev
# modified to use LWP. Also fakes referer. works beautifully.
#
#
###########################################################

use Getopt::Std;
use File::Path;

# Unbuffer STDOUT so progress output shows up as soon as it is printed.
$| = 1;

# flush.pl is a Perl 4 era library that newer perls (5.16 and later) no
# longer ship.  Use it when it can be loaded; otherwise install a
# stand-in flush() so that flush(STDOUT) calls elsewhere in this script
# keep working.
unless (eval { require "flush.pl"; 1 }) {
    *main::flush = sub {
        my $previous  = select(shift);   # make the target handle current
        my $autoflush = $|;
        $| = 1;                          # switching autoflush on flushes the handle
        $| = $autoflush;                 # leave the handle's setting as we found it
        select($previous);
        return;
    };
}

# Command line:
#   -e EXT      extension to look for (default "jpg")
#   -r MIN-MAX  skip the page scan; fetch <url><n>.<ext> for each n in the range
#   -i          do not build index.html
my $usage = "usage: $0 [-e extension] [-r min-max] [-i] url [directory]\n";
getopts('e:r:i') or die $usage;

$extension = $opt_e || "jpg";

$origurl  = shift @ARGV;
$localdir = shift @ARGV;

# Without a url there is nothing to fetch.
die $usage unless defined $origurl && length $origurl;

# Destination directory: default to the current directory, create it
# when it does not exist yet, and refuse anything that is not a directory.
if (!defined $localdir || $localdir eq '') {
    $localdir = '.';
} elsif (! -e $localdir) {
    # mkpath croaks on failure, so trap that to report our own message.
    eval { mkpath($localdir); 1 }
        or die "error creating $localdir: $@";
    -d $localdir or die "error creating $localdir: $!";
} elsif (! -d $localdir) {
    die "$localdir is not a directory";
}

# Build @urls, the list of files to download, in one of two ways.
if ($opt_r) {
    # -r MIN-MAX: no page is fetched; each value in the range is appended
    # to the original url, followed by ".$extension".
    my ($min, $max) = split /-/, $opt_r;
    die "bad range '$opt_r' (expected min-max)\n"
        unless defined $min && length $min && defined $max && length $max;
    for my $n ($min..$max) {
        push @urls, $origurl . $n . ".$extension";
    }
} else {
    # Otherwise fetch the page and collect the links found in it.
    print "getting page " , $origurl , "....\n";
    my $page = webget($origurl);

    # webget returns 0 (after warning) when the request failed.
    die "could not fetch $origurl\n" unless $page;

    # Keep a copy of the raw page in /tmp/tmp.html.  The copy is not
    # needed for the scan, so a failure here is only worth a warning.
    # NOTE(review): fixed, predictable path in a shared directory --
    # consider File::Temp if this copy is still wanted.
    if (open(my $copy, '>', '/tmp/tmp.html')) {
        print {$copy} $page;
        close($copy) or warn "error writing /tmp/tmp.html: $!\n";
    } else {
        warn "could not save a copy of the page to /tmp/tmp.html: $!\n";
    }
    print STDOUT "grabbed html\n";

    # Split the original url into "scheme://host" and the path on that host.
    my ($site, $file) = ('', '/');
    if ($origurl =~ m#(https?://[^/]*)/*([^ ]*)#i) {
        $site = $1;
        $file = "/" . $2;
    } else {
        warn "can't parse '$origurl' as an http(s) url; "
           . "relative links will not resolve\n";
    }

    # Directory part of the url, used as the base for relative links.
    # The page's own file name is dropped; when the url ends in a slash
    # it names a directory already, so everything is kept.
    my @segments = split('/', $file);
    pop @segments if $file !~ /\/$/;
    my $dir = join('/', @segments);

    print STDOUT "Scanning page $origurl for files ending in .$extension.\n";

    # Remove line breaks so that a tag spread over several lines still matches.
    $page =~ tr/\r\n//d;

    # Cut the page at every "<a" and look for an href ending in the wanted
    # extension.  $extension is interpolated into the pattern as a regex.
    for my $chunk (split(/<a/i, $page)) {
        next unless $chunk =~ /href\s*=\s*["']*\s*(\S*?)\.($extension)\s*["']*.*?>/i;
        my $url = "$1.$2";

        if ($url =~ m#^https?://#i) {
            push @urls, $url;                 # full url: use unchanged
        } elsif ($url =~ m#^/#) {
            push @urls, "$site$url";          # absolute path on the same site
        } else {
            push @urls, "$site$dir/$url";     # relative to the page's directory
        }
    }
}  # one way or the other we have a list of urls to get.

use IO::Handle;   # provides STDOUT->flush, replacing the flush.pl helper

$count = scalar(@urls);
print  STDOUT "\nDone scanning. Found $count files\n";
#unlink "/tmp/tmp.html";

die "Didn't find any files.\n"  unless @urls;

# Download every url into $localdir, keeping a running byte total and a
# list of the urls that failed.
$totalsize = 0;
my $seq = 0;
foreach $image (@urls) {
    $image =~ s/\s//g;   # remove spaces in url
    $seq++;

    # The local name is the last path segment of the url.  If a file of
    # that name already exists, prefix the current time so that nothing
    # is overwritten.
    my $name = (split('/', $image))[-1];
    $name = time . $name if -e "$localdir/$name";
    my $target = "$localdir/$name";

    # Show the url before the (possibly slow) fetch starts.
    print STDOUT  "  $seq: $image -- ";
    STDOUT->flush;

    open(my $out, '>', $target) || die "can't write to $target:$!\n";
    binmode($out);       # downloaded data is binary
    print {$out} webget($image, $origurl);   # the original url is sent as referer
    # Close before measuring: until then part of the data may still sit
    # in the output buffer and -s would report too small a size.
    close($out) || die "error writing $target: $!\n";

    my $size = -s $target;
    if ($size > 500) {
        print STDOUT "saving to $target",
        " size: $size.\n";
        $totalsize += $size;
    } else {
        # Anything of 500 bytes or less is treated as a failed download
        # (webget returns 0 on an http error) and is removed again.
        print STDOUT "ERROR.\n\n";
        push @errors, $image;
        unlink $target;
    }
    STDOUT->flush;
}

print "total bytes downloaded: $totalsize.\n";
if (@errors) {
    print "the following files couldnt be downloaded:\n";
    print join("\n  ", @errors);
    print "\n\n";
}

# Build index.html in $localdir listing every file there that has the
# wanted extension -- unless the -i switch was given.
if ($opt_i) {
    print "\ndone.\n";
    exit;
}

print "Building index.html.\n";

# Everything in the directory ending in ".<extension>" is listed, which
# includes matching files that were already there before this run.
opendir(my $dh, $localdir) || die "can't read directory $localdir: $!\n";
my @images = sort grep { /\.(?:$extension)$/i } readdir($dh);
closedir($dh);

# The url and the file names end up inside html text and attribute
# values, so escape the characters that are special there.
my %entity_for = ('&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quot;');
my $escape = sub {
    my ($text) = @_;
    $text = '' unless defined $text;
    $text =~ s/([&<>"])/$entity_for{$1}/g;
    return $text;
};
my $safe_url = $escape->($origurl);

open(my $out_fh, '>', "$localdir/index.html") || die "no open $localdir/index.html";
print {$out_fh} <<"EndOT";
<html>
<head><title>$safe_url</title></head>
<body>
original url: <a href="$safe_url">$safe_url</a> <br>
<ul>
EndOT

for my $entry (@images) {
    my $safe_name = $escape->($entry);
    print {$out_fh} "<li><a target=window href=\"$safe_name\">$safe_name</a>\n";
}

print {$out_fh} "</ul><p><a target=window href=../>up</a>\n</body>\n</html>\n";
close($out_fh) || die "error writing $localdir/index.html: $!\n";

print "done.\n\n";

# subroutines ########

# webget(URL [, REFERER]) - fetch a url with LWP instead of shelling out
# to lynx.
#   URL      the address to GET.
#   REFERER  optional Referer header value; 'http://disney.com' is sent
#            when none is given.
# Returns the response body, or 0 after printing the http status line on
# STDERR when the response is an error.
sub webget {
   use LWP::UserAgent;
   use HTTP::Request;
   use HTTP::Response;
   my ($url,$referer) = @_;
   $referer = 'http://disney.com' unless defined $referer;
   $| = 1;     # keep the selected output handle unbuffered
   my $ua = LWP::UserAgent->new();
   $ua->agent("Mozilla/4.7");     # pretend, just in case
   my $req = HTTP::Request->new(GET => $url);
   $req->referer($referer);
   my $response = $ua->request($req);
   if ($response->is_error()) {
        # warn takes a plain list, not a printf format, so build the
        # message directly.
        warn " ", $response->status_line, "\n";
        return 0;
   }
   return $response->content();
}