#GPL
#GPL  libwhisker copyright 2000,2001,2002 by rfp.labs
#GPL
#GPL  This program is free software; you can redistribute it and/or
#GPL  modify it under the terms of the GNU General Public License
#GPL  as published by the Free Software Foundation; either version 2
#GPL  of the License, or (at your option) any later version.
#GPL
#GPL  This program is distributed in the hope that it will be useful,
#GPL  but WITHOUT ANY WARRANTY; without even the implied warranty of
#GPL  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#GPL  GNU General Public License for more details.
#GPL

=pod

=head1 ++ Sub package: crawl

Used for crawling a website by requesting a (start) page, reading the
HTML, extracting the links, and then requesting those links--up to a
specified depth.  The module also allows various configuration tweaks to
do such things as monitor requests for offsite URLs (pages on other
hosts), track various cookies, etc.

=cut

#####################################################

=pod

=head1 - Function: LW::crawl
  
Params: $START, $MAX_DEPTH, \%tracking, \%hin
Return: Nothing

The heart of the crawl package.  Will perform an HTTP crawl on the
specified HOST, starting at START URI, proceeding up to MAX_DEPTH.  A
tracking hash reference (required) stores the results of each page (and
ongoing progress).  The http_in_options hash reference specifies a
standard HTTP hash for use in the outgoing HTTP requests.  Certain options
are configurable via LW::crawl_set_config().  The tracking hash will
contain all the pages visited; you can get the crawl engine to skip pages
by placing them in the tracking hash ahead of time.

START (first) parameter should be of the form "http://www.host.com/url".

=cut

sub crawl {
 my ($START, $MAX_DEPTH, $hrtrack, $hrin)=@_;
 my (%hout, %jar);
 my ($T, @ST, @links, @tlinks, @vals, @ERRORS)=('');

 return if(!(defined $hrtrack && ref($hrtrack)));
 return if(!(defined $hrin    && ref($hrin)   )); 
 return if(!defined $START || length($START)==0);

 $MAX_DEPTH||=2;

 # $ST[0]=HOST  $ST[1]=URL  $ST[2]=CWD  $ST[3]=HTTPS  $ST[4]=SERVER
 # $ST[5]=PORT  $ST[6]=DEPTH

 @vals=utils_split_uri($START);
 $ST[1]=$vals[0]; 	# uri
 $ST[0]=$vals[2]; 	# host
 $ST[5]=$vals[3]; 	# port
 $ST[4]=undef;		# server tag

 return if($ST[0] eq '');

 # some various informationz...
 $LW::crawl_config{'host'}=$ST[0];
 $LW::crawl_config{'port'}=$ST[5];
 $LW::crawl_config{'start'}=$ST[1];

 $$hrin{'whisker'}->{'host'}=$ST[0];
 $$hrin{'whisker'}->{'port'}=$ST[5];
 $$hrin{'whisker'}->{'lowercase_incoming_headers'}=1; # makes life easier

 http_fixup_request($hrin);

 # this is so callbacks can access internals via references
 $LW::crawl_config{'ref_links'}=\@links;
 $LW::crawl_config{'ref_jar'}=\%jar;
 $LW::crawl_config{'ref_hin'}=$hrin;
 $LW::crawl_config{'ref_hout'}=\%hout;

 %LW::crawl_referrers=(); # empty out existing referrers
 %LW::crawl_server_tags=();
 %LW::crawl_offsites=();
 %LW::crawl_cookies=();
 %LW::crawl_forms=();

 push @links, \@{[$ST[1],1,($vals[1] eq 'https')?1:0]};

 while(@links){
  my $C=shift @links;
  $ST[1]=$C->[0]; # url
  $ST[6]=$C->[1]; # depth
  $ST[3]=$C->[2]; # https

  next if(defined $$hrtrack{$ST[1]} && $$hrtrack{$ST[1]} ne '?');

  if($ST[6] > $MAX_DEPTH){
	$$hrtrack{$ST[1]}='?' if($LW::crawl_config{'save_skipped'}>0);
	next;
  }

  $ST[2]=utils_get_dir($ST[1]);

  $$hrin{'whisker'}->{'uri'}=$ST[1];
  $$hrin{'whisker'}->{'ssl'}=$ST[3];
  my $result = crawl_do_request($hrin,\%hout);
  if($result==1 || $result==2){
	push @ERRORS, "Error on making request for '$ST[1]': $hout{'whisker'}->{'error'}";
	next;
  }

  if($result==0 || $result==4){
	$$hrtrack{$ST[1]}=$hout{'whisker'}->{'http_resp'}; }
  
  if($result==3 || $result==5){
	$$hrtrack{$ST[1]}='?' if($LW::crawl_config{'save_skipped'}>0); }

  if(defined $hout{'server'}){ 
   if(!defined $ST[4]){ # server tag
	$ST[4]=$hout{'server'}; }
   $LW::crawl_server_tags{$hout{'server'}}++;
  }

  if(defined $hout{'set-cookie'}){
		if($LW::crawl_config{'save_cookies'}>0){
			if(ref($hout{'set-cookie'})){
				foreach (@{$hout{'set-cookie'}}){
					$LW::crawl_cookies{$_}++; }
			} else {
				$LW::crawl_cookies{$hout{'set-cookie'}}++; 
		}	}

		if($LW::crawl_config{'reuse_cookies'}>0){
			cookie_read(\%jar,\%hout); }
  }


  next if($result==4 || $result==5);  
  next if(scalar @links > $LW::crawl_config{'url_limit'});

  if($result==0){ # page should be parsed
	if($LW::crawl_config{'source_callback'} != 0  &&
		ref($LW::crawl_config{'source_callback'})){
		&{$LW::crawl_config{'source_callback'}}($hrin,\%hout); }

	LW::html_find_tags(\$hout{'whisker'}->{'data'},
		\&crawl_extract_links_test);
	$LW::crawl_config{'stats_html'}++; # count how many pages we've parsed
  }

  if($result==3){ # follow the move via location header
	push @LW::crawl_urls, $hout{'location'}; }

  foreach $T (@LW::crawl_urls){
	 $T=~tr/\0\r\n//d; # the NULL character is a bug that's somewhere
	 next if (length($T)==0);
	 next if ($T=~/^javascript:/i); # stupid javascript
	 next if ($T=~/^mailto:/i);
	 next if ($T=~m#^([a-zA-Z]*)://# && lc($1) ne 'http' && lc($1) ne 'https');
	 next if ($T=~/^#/i); # fragment

	 if($LW::crawl_config{'callback'} != 0){
		next if &{$LW::crawl_config{'callback'}}($T,@ST); }

	 push(@{$LW::crawl_referrers{$T}}, $ST[1]) 
		if( $LW::crawl_config{'save_referrers'}>0 );

	 $T=utils_absolute_uri($T,$ST[1],1) if($LW::crawl_config{'normalize_uri'}>0);
	 @vals=utils_split_uri($T);

	 # slashdot bug: workaround for the following fsck'd html code:
	 # <FORM ACTION="//slashdot.org/users.pl" METHOD="GET">
	 if($LW::crawl_config{'slashdot_bug'} > 0 && 
			substr($vals[0],0,2) eq '//'){
		if($ST[3]==1){	$T='https:'.$T;
		} else {	$T='http:' .$T; }
		@vals=utils_split_uri($T);
	 }

	 # make sure URL is on same host, port, and protocol
	 if( (defined $vals[2] && $vals[2] ne $ST[0]) || 
			(defined $vals[3] && $vals[3] != $ST[5]) ||
			(defined $vals[1] && ($vals[1] ne 'http' 
				&& $vals[1] ne 'https'))){
		if($LW::crawl_config{'save_offsites'}>0){
			$LW::crawl_offsites{utils_join_uri(@vals)}++; }
		next; }

	 if(substr($vals[0],0,1) ne '/'){
		$vals[0]=$ST[2].$vals[0]; }

	 my $where=rindex($vals[0],'.');
	 my $EXT='';
	 if($where >= 0){
	   $EXT = substr($vals[0], $where+1, length($vals[0])-$where); }

	 $EXT=~tr/0-9a-zA-Z//cd; # yucky chars will puke regex below

	 if($EXT ne '' && $LW::crawl_config{'skip_ext'}=~/\.$EXT /i){
		if($LW::crawl_config{'save_skipped'}>0){
			$$hrtrack{$vals[0]}='?'; }
	 	next; }

	 if(defined $vals[4] && $LW::crawl_config{'use_params'}>0){
		if($LW::crawl_config{'params_double_record'}>0 &&
				!defined $$hrtrack{$vals[0]}){
			$$hrtrack{$vals[0]}='?'; }
		$vals[0]=$vals[0].'?'.$vals[4];	
	 }

	 next if(defined $$hrtrack{$vals[0]});

	 push @links, \@{[$vals[0],$ST[6]+1, ($vals[1] eq 'https')?1:0]};

  } # foreach

  @LW::crawl_urls=(); # reset for next round
 } # while

 my $key;
 foreach $key (keys %LW::crawl_config){
 	delete $LW::crawl_config{$key} if (substr($key,0,4) eq 'ref_');}

 $LW::crawl_config{'stats_reqs'}=$hout{'whisker'}->{'stats_reqs'};
 $LW::crawl_config{'stats_syns'}=$hout{'whisker'}->{'stats_syns'};

} # end sub crawl

#####################################################

=pod

=head1 - Function: LW::crawl_get_config
  
Params: $config_directive
Return: $config_directive_value

Returns the set value of the submitted config_directive.  See
LW::crawl_set_config() for a list of configuration values.

=cut

sub crawl_get_config {
	my $key=shift;
	return $LW::crawl_config{$key};
}

#####################################################

=pod

=head1 - Function: LW::crawl_set_config
  
Params: $config_directive, $value
Return: Nothing

This function adjusts the configuration of the crawl package. Use values
of 0 and 1 for off and on, respectively.  The defaults are set in 
libs/globals.wpl.

save_cookies
- crawl will save all cookies encountered, for later review

save_offsite_urls
- crawl will save all offsite URLs (URLs not on this host); crawl
  will not actually crawl those hosts (use separate calls to crawl)

follow_moves
- crawl will follow the URL received from an HTTP move response

use_params
- crawl will factor in URI parameters when considering if a URI is unique 
  or not

params_double_record
- if both use_params and params_double_record are set, crawl will make two
  entries for each URI which has paramaters: one with and one without the
  parameters

reuse_cookies
- crawl will resubmit any received/prior cookies

skip_ext
- crawl will ignore requests for URLs ending in extensions given; the 
  value requires a specific string format: (dot)extension(space).  For
  example, to ignore GIFs and JPGs, you would run:
 	LW::crawl_set_config('skip_ext',".gif .jpg ");

save_skipped
- any URLs that are skipped via skip_ext, or are above the specified DEPTH 
  will be recorded in the tracking hash with a value of '?' (instead of an
  HTTP response code).

callback
- crawl will call this function (if this is a reference to a function), 
  passing it the current URI and the @ST array (which has host, port, SSL, 
  etc info).  If the function returns a TRUE value, then crawl will skip
  that URI.  Set to value 0 (zero) if you do not want to use a callback.

slashdot_bug
- slashdot.org uses a screwy piece of invalid (yet it works) HTML in
  the form of <FORM ACTION="//slashdot.org/somefile">.  So basically,
  when a URL starts with '//' and slashdot_bug is set to 1 (which it
  is by default), then the proper 'http:' or 'https:' will be prepended
  to the URL.

source_callback
- crawl will call this function (if this is a reference to a function), 
  passing references to %hin and %hout, right before it parses the page
  for HTML links.  This allows the callback function to review or
  modify the HTML before it's parsed for links.  Return value is ignored.
  
url_limit
- number or URLs that crawl will queue up at one time; defaults to 1000

do_head
- use head requests to determine if a file has a content-type worth
  downloading.  Potentially saves some time, assuming the server properly
  supports HEAD requests.  Set to value 1 to use (0/off by default).


=cut

sub crawl_set_config {
	return if(!defined $_[0]);
	my %opts=@_;
	while( my($k,$v)=each %opts){
		$LW::crawl_config{lc($k)}=$v; }
}

#####################################################

=pod

=head1 - Function: LW::crawl_extract_links_test (INTERNAL)
  
Params: $TAG, \%elements, \$html_data, $offset, $len
Return: nothing

This is the callback function used by the crawl function, and passed to 
html_find_tags.  It will find URL/URI links and place them in 
@LW::crawl_urls.

=cut

sub crawl_extract_links_test {
	my ($TAG, $hr, $dr, $start, $len)=(lc(shift),@_);
	my $t;

	# this should be most of the time...
	return undef if(!defined ($t=$LW::crawl_linktags{$TAG}));
	return undef if(!scalar %$hr); # fastpath quickie

	while( my ($key,$val)= each %$hr){ # normalize element values
		$$hr{lc($key)} = $val;
	}

	if(ref($t)){
		foreach (@$t){
			push(@LW::crawl_urls,$$hr{$_}) if(defined $$hr{$_});
		}
	} else {
		push(@LW::crawl_urls,$$hr{$t}) if(defined $$hr{$t});
	}

	if($TAG eq 'form' && defined $$hr{action}){
		my $u=$LW::crawl_config{'ref_hout'}->{'whisker'}->{'uri'};
		$LW::crawl_forms{utils_absolute_uri($$hr{action},$u,1)}++;
	}

	return undef;
}

################################################################

=pod

=head1 - Function: LW::crawl_make_request (INTERNAL)
  
Params: \%hin, \%hout
Return: $status_code

This is an internal function used by LW::crawl(), and is responsible for
making HTTP requests, including any HEAD pre-requests and following move
responses.  Status codes are:
	0	Success
	1	Error during request
	2	Error on connection setup
	3	Move request; follow Location header
	4	File not of text/htm(l) type
	5	File not available

=cut

sub crawl_do_request {
 my ($hrin,$hrout) = @_;
 my $ret;

 if($LW::crawl_config{'do_head'}){  
	my $save=$$hrin{'whisker'}->{'method'};
	$$hrin{'whisker'}->{'method'}='HEAD';
	$ret=http_do_request($hrin,$hrout);
	$$hrin{'whisker'}->{'method'}=$save;

	return 2 if($ret==2); # if there was connection error, do not continue
	if($ret==0){ # successful request
	    	if($$hrout{'whisker'}->{'http_resp'}==501){ # HEAD not allowed
    			$LW::crawl_config{'do_head'}=0; # no more HEAD requests
	    	}

		if($$hrout{'whisker'}->{'http_resp'} <308 &&
				$$hrout{'whisker'}->{'http_resp'} >300){
			if($LW::crawl_config{'follow_moves'} >0){
				return 3 if(defined $$hrout{'location'}); }
			return 5; # not avail
		}

		if($$hrout{'whisker'}->{'http_resp'}==200){
			# no content-type is treated as text/htm
			if(defined $$hrout{'content-type'} &&
					$$hrout{'content-type'}!~/^text\/htm/i){
				return 4;
			}		
			# fall through to GET request below			
		}
    	}
	# request errors are essentially redone via GET, below
  }

 return http_do_request($hrin,$hrout);
}

#####################################################
