#!/usr/local/bin/perl -w

=head1 NAME

timefetch - Fetch a web page and time how long it takes

=head1 SCRIPT CATEGORIES

Web

=head1 SYNOPSIS

C<timefetch [-dhFjrvXz] [-f host] [-a attempts] [-b broken_images] [-s size] [-T text] [-t timeout] http://url [http://url ...]>

C<timefetch -h>

=head1 README

This script uses LWP and Time::HiRes to fetch a web page, parse embedded
IMG tags and other tags, download the images, and time the whole download
operation. It is useful for measuring download times for an entire page
with images, and for reporting errors in downloads (timeouts, connect
failures, broken images, etc.).

Complete documentation is available in the script itself, in the
embedded POD.

=head1 DESCRIPTION

This script uses LWP and Time::HiRes to fetch a web page, parse embedded
IMG tags and other tags, download the images, and time the whole download
operation. It is useful for measuring download times for an entire page
with images, and for reporting errors in downloads (timeouts, connect
failures, broken images, etc.).

Depending on the command line options you use, it can download just a
single HTML page, or recursively download all embedded images and applets
and tally a grand total download time and total download size.

This script does not implement a smart browser. It basically performs two
kinds of "benchmarks": a best-case scenario and an absolute worst-case
scenario. The best-case scenario is when a browser downloads just the
contents of a single HTML page and no images at all, because they are
already in the cache. The worst-case scenario is when the browser must
download each and every image, one after the other.

Neither of these scenarios exactly matches the real-life behavior you
see in a browser. However, the numbers are an objective measurement of a
repeatable and well-defined process. You can use them for a
straightforward comparison of the download times of two different web
pages, or to chart the download time of a single page under different
kinds of load and traffic patterns.

While downloading, the script checks for certain kinds of errors.
Depending on the exact command options, the program may abort with a
brief message explaining the error and with an exit value of -1. This
makes the script useful as a monitor script for a web page. The script
can detect the following kinds of errors:

=over 4

=item *

Download of the main web page times out.

=item *

The number of broken images exceeds a configurable number.

=item *

The main web page is not complete (e.g., it is missing "</HTML>").

=item *

The total size of the web page including graphics is too small.

=back

It will also report any errors returned by LWP (connection refused,
connection reset by peer, etc.). You can use the "-X" option to prevent
the script from aborting on errors.

One extra feature of this script is the "force-host" feature. You can
force the script to munge each URL before fetching it. This is useful
when the URLs in your HTML pages all contain a load-balanced hostname,
but you want to target a specific member host, or compare two member
hosts.

This script is NOT a spider. It will only fetch the inline images and
applets contained in a single web page. However, it will download the
component frames of a frameset down to arbitrary levels.

=head1 EXAMPLES

C<timefetch http://url>

C<timefetch -r -v http://url>

C<timefetch -r -j -f host -T text http://url>

=head1 CHANGES

Version 1.02

Fixed an incompatibility with version 3.xx of HTML::Parser. Added option
-F to prevent downloading of frames. Added option -z to handle HTTP
errors on the main page differently from errors on other URLs. There were
several rather stupid bugs in versions 0.92 and 1.01; these have been
fixed.
=head1 PREREQUISITES

This script requires C<Time::HiRes> and C<LWP>. As written, it requires
Perl 5.004, but in a pinch you could modify it to run on earlier versions
of Perl 5.

=head1 COPYRIGHT

Copyright (c) 1998 John Nolan. All rights reserved. This program is free
software. You may modify and/or distribute it under the same terms as
Perl itself. This copyright notice must remain attached to the file.

=head1 REVISION

$Id: timefetch,v 1.6 2000/01/08 19:27:23 john Exp john $

=cut

# ---------------------------------------------------------------------------

$|++;        # Turn on autoflush
use 5.004;   # We will use "for my"

# ---------------------------------------------------------------------------
# Load modules
#
use Time::HiRes;
use HTTP::Request;
use LWP::UserAgent;
use HTML::LinkExtor;
use URI::URL;
use Getopt::Std;

use strict;

# ---------------------------------------------------------------------------
# Define global variables and objects up front
#
my ($Ua, $Pa, @Urls, $TotalTime, $TotalSize, $RealUrl, $ThisUrl);
my ($Debug, $Recursively, $Verbose, $Java, $ForceHost, $Frames);
my ($Status, $ErrorMsg, $BrokenImages, $MinBrokenImages, $Timeout,
    $Attempts, $MinimumSize, $HTMLtext, $NoExitOnErrors, $ExitOnMainPageError);

# This variable will remember whether we had to recurse
# in order to download frames. It will contain a list of
# the *types* of files which we downloaded additionally
# (e.g., img, applet).
#
my $DidRecurse;

# These will hold lists of urls, along with time and size data.
#
my %Cache;
my @SummaryCache;

# This will hold a list of URLs which we want to download.
# We will add to this list the original set of URLs on the command line,
# as well as when we parse links to frames.
# We remove from this list when we actually download the frames.
#
my %FramesToDownload = ();

# Initialize a few global variables.
#
$BrokenImages = 0;
$Status       = 0;

# ---------------------------------------------------------------------------
# Subroutines
# ---------------------------------------------------------------------------

# ---------------------------------------------------------------------------
#
sub Usage {

    return "Usage: $0 [dhFjrvXz] [-f host] [-a attempts] [-b broken_images]\n".
           "\t[-s size] [-T text] [-t timeout] http://url [http://url ... ]\n".
           "$0 -h for help message\n";
}

# ---------------------------------------------------------------------------
#
sub PrintHelpMsg {

    print Usage();
    print <<"EOM";

 -a attempts       Number of times to try fetching the main page (default 3).
 -b broken_images  Abort after this many broken images (default 3).
 -d                Debug: print extra diagnostic messages (implies -v).
 -f host           Force host: fetch every URL from this host instead of
                   the host named in the URL.
 -F                Do not download frames.
 -h                Print this help message and exit.
 -j                Also download applets.
 -r                Recursive: download embedded images as well.
 -s size           Abort if the total download is smaller than this
                   many kbytes.
 -t timeout        Timeout in seconds for each request (default 7).
 -T text           Scan main page for this text, abort if it is missing
                   (e.g. "</HTML>"). Not case sensitive.
 -v                Verbose: print out URLs as they are downloaded.
 -X                Don't exit on errors, just try to continue.
 -z                Exit immediately on errors in fetching the main page.

 NOTE: This program always downloads embedded frames and prints a
 cumulative total for frames and framesets, even if you did not specify
 a recursive download.

EOM
    exit(0);
}

# ---------------------------------------------------------------------------
#
sub GetParams {

    getopts('a:b:df:Fhjs:t:T:rvXz') or die Usage();

    use vars qw(
        $opt_a $opt_b $opt_d $opt_f $opt_F $opt_h $opt_j
        $opt_r $opt_s $opt_t $opt_T $opt_v $opt_X $opt_z
    );

    $Attempts            = ($opt_a ? $opt_a : 3     );
    $MinBrokenImages     = ($opt_b ? $opt_b : 3     );
    $Debug               = ($opt_d ? 1      : 0     );
    $ForceHost           = ($opt_f ? $opt_f : undef );
    $Java                = ($opt_j ? 1      : 0     );
    $Frames              = ($opt_F ? 0      : 1     );
    $Recursively         = ($opt_r ? 1      : 0     );
    $MinimumSize         = ($opt_s ? $opt_s : 0     );
    $HTMLtext            = ($opt_T ? $opt_T : ""    );   # e.g., "</HTML>"
    $Timeout             = ($opt_t ? $opt_t : 7     );
    $Verbose             = ($opt_v ? 1      : 0     );
    $NoExitOnErrors      = ($opt_X ? 1      : 0     );
    $ExitOnMainPageError = ($opt_z ? 1      : 0     );
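    # A note on the defaults above (derived from these assignments, not
    # from separate documentation): with no switches we make up to 3
    # attempts on the main page, tolerate up to 3 broken images, time out
    # each request after 7 seconds, follow frames unless -F was given,
    # and fetch images/applets only when -r/-j are given.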
    $Verbose = 1 if $Debug;

    $MinimumSize *= 1024;   # Convert kbytes to bytes

    PrintHelpMsg() if $opt_h;

    # If there are no URL's on the command line, bolt.
    #
    die Usage() unless @ARGV;

    print "Forcing host $ForceHost .\n" if ($Verbose and $ForceHost);

    # Take URLs from the command line
    #
    return @ARGV;
}

# ---------------------------------------------------------------------------
# Print routine for nicely-formatted output
#
sub PrintUrl {

    my $url = (shift or die "Internal routine PrintUrl() expects parameter \$url ");

    print sprintf ("%6.3f %5.1fkb: %s\n",
        $Cache{$url}->{TIME}, $Cache{$url}->{SIZE}/1024, $url);
}

# ---------------------------------------------------------------------------
# Fetch a URL and time how long it takes
#
sub FetchUrl {

    my $url = (shift or die "Internal routine FetchUrl() expects parameter \$url ");

    print "Fetching: $url\n" if $Debug;

    my $req = HTTP::Request->new (GET => $url);

    # Temporarily turn off warnings. The bowels of LWP are returning
    # an obscure error here, which I don't know how to troubleshoot.
    #
    $^W = 0;

    my $start  = Time::HiRes::gettimeofday();
    my $res    = $Ua->request($req);
    my $finish = Time::HiRes::gettimeofday();

    $^W = 1;   # Turn warnings back on again

    # Calculate stats
    my $size = length $res->content;
    my $time = ($finish - $start);

    return ($time,$size,$res);
}

# ---------------------------------------------------------------------------
# Check the HTTP return status of a download, and return 0
# if it is not 200. If requested, also scan the fetched HTML
# for the target string, and return 0 if we do not find it.
# Otherwise return 1.
#
sub CheckResponse {

    # Get the response object as a parameter
    #
    my $res = (shift or die "Internal routine CheckResponse() expects parameter \$res ");
    my $check_html = (shift or "");

    my $successful = 1;

    $successful = 0 unless $res->code() == 200;

    if ($check_html and $HTMLtext) {
        $successful = 0 unless $res->content =~ m#$HTMLtext#i;
    }

    return $successful;
}

# ---------------------------------------------------------------------------
# This routine aborts on any errors, printing out the error message....
# unless the script was invoked with the -X option, in which case
# it just prints the error message and returns.
#
sub ErrorHandler {

    $Status = -1;

    # Get the response object as a parameter, along with
    # any error message.
    #
    my $res      = shift;
    my $url      = shift;
    my $ErrorMsg = (shift or "");

    unless ($ErrorMsg) {

        $ErrorMsg = "Error ";

        if ($res->code() != 200) {

            $ErrorMsg .= $res->status_line . ": " . $res->request->url . " \n";

        } elsif ($res->content !~ m#$HTMLtext#i) {

            $ErrorMsg .= "broken document from " . $res->request->url . " \n";
        }
    }

    print $ErrorMsg;

    exit($Status) unless $NoExitOnErrors;
}

# ---------------------------------------------------------------------------
# HandleParsedLink
#
# This is a callback provided for handling HTML links found during parsing.
# It is called once for each link found, with the $tag and %links
# passed as parameters. $tag is the HTML tag where the link was found.
# %links is a hash that contains the keyword/value pairs from the links
# that contain URLs.
#
# For example, if an HTML anchor was found, the $tag would be "a"
# and %links would be (href=>"url"). We check each URL in %links.
#
# For each URL, this function decides whether we want to download it.
# Images and applets are fetched right here; frame URLs are added to
# %FramesToDownload so that the caller can fetch and parse them.
# This function *only* checks documents for URLs which might
# need downloading.
#
# Note that we are not EVER following hyperlinks like a spider.
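# (Hyperlinks do reach this callback -- e.g. $tag "a" with %links
# (href => "url") -- but none of the flags below is set for them,
# so we return without fetching anything.)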
# We are only interested in images, applets and frames.
#
sub HandleParsedLink {

    my ($tag, %links) = @_;
    my ($time, $size, $res, $urlobj, $url);

    # Flag for deciding whether we want to download a given URL
    #
    my $we_want_this_url = 0;

    print "\$tag = $tag\n" if $Debug;

    $we_want_this_url = 1 if $Recursively and $tag =~ /img/;
    $we_want_this_url = 1 if $Java        and $tag =~ /applet/;
    $we_want_this_url = 1 if $Frames      and $tag =~ /frame/;

    return unless $we_want_this_url;

    # Examine the tag, and fetch its target if necessary.
    #
    for my $key (keys %links) {

        print "$tag: $key=$links{$key}\n" if $Debug;

        # Get the absolute URL
        #
        $urlobj = URI::URL->new( $links{$key}, $ThisUrl );
        $url    = $urlobj->abs;

        # Force a particular host by munging the actual URL
        #
        $url =~ s#http://[^/]+#http://$ForceHost# if $ForceHost;

        # Remove any in-page anchor tags.
        #
        $url =~ s/^([^#]*)#.*$/$1/;

        # Process each URL that we have not seen before
        #
        unless (exists $Cache{$url}) {

            # Keep a list of each *type* of item
            # that we have recursively downloaded
            #
            $DidRecurse .= " $tag" unless $DidRecurse =~ /$tag/;

            # If the tag is a frame, then don't download it here;
            # let the caller take care of downloading it. Just add
            # the munged URL to the FramesToDownload hash and return.
            #
            if ($tag =~ /frame/) {
                $FramesToDownload{$url} = 1;
                return 1;
            }

            print "Examining url $url\n" if $Debug;

            ($time,$size,$res) = FetchUrl($url);

            $BrokenImages++ unless CheckResponse($res);

            ErrorHandler($res,$url) if ($BrokenImages >= $MinBrokenImages);

            $TotalSize += $size;
            $TotalTime += $time;

            $Cache{$url} = { TIME => $time, SIZE => $size };

            PrintUrl($url) if ($Verbose);
        }
    }

    return 1;   # Return a nice happy true value.
}

# ---------------------------------------------------------------------------
# Process each URL in the list.
#
sub ExamineUrls {

    foreach (@_) {

        my $url = $_;
        my ($size,$time,$res);

        $TotalTime  = 0;
        $TotalSize  = 0;
        $RealUrl    = "";
        $ThisUrl    = "";
        $DidRecurse = "";

        # Force a particular host by munging the actual URL
        #
        if ($ForceHost) {
            $url =~ s#http://[^/]+#http://$ForceHost#;
        }

        print "First fetch, for $url\n" if $Debug;

        # We make two fetches, no matter what.
        #
        # First fetch. This fetch will traverse redirects, so it is added
        # to the total stats for the main url. We only care about the total
        # if we are doing a recursive download. As a side effect, after
        # traversing redirects, we learn the true URL of the page ($RealUrl).

        ATTEMPT: while ($Attempts-- > 0) {

            ($time,$size,$res) = FetchUrl($url);   # Fetch the URL!

            if (CheckResponse($res, "check HTML content")) {
                last ATTEMPT;
            } else {
                ErrorHandler($res,$url);
                exit(-1) if $ExitOnMainPageError;
            }
        }

        if ($Recursively) {
            $TotalSize += $size;
            $TotalTime += $time;
        }

        # Now remember the URL we were redirected to.
        # NOTE this call to base() must come before we invoke
        # the parse function. Otherwise base() may croak.
        #
        $RealUrl = $res->base();

        # Analyze the HTML we got back, and extract links.
        # The handler HandleParsedLink will download each
        # linked image, if we are doing a recursive download.
        # It will also download framesets, and parse them recursively.
        #
        # %FramesToDownload will hold a list of URLs which we want to download.
        # We may add to this list when we parse links to frames.
        # We remove from this list when we actually download the frames.
        #
        %FramesToDownload = ();   # Empty the list

        # This call to parse might add items to the %FramesToDownload hash,
        # which would add iterations to the FRAME loop below.
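        # (Each frame document fetched in the FRAME loop is parsed in
        # turn, so a nested frameset keeps adding URLs to
        # %FramesToDownload until every level has been downloaded.)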
        #
        $ThisUrl = $RealUrl;

        $Pa->parse($res->content);

        FRAME: while (scalar keys %FramesToDownload > 0) {

            my ($url) = each %FramesToDownload;
            delete $FramesToDownload{$url};

            unless (exists $Cache{$url}) {

                my ($time,$size,$res) = FetchUrl($url);

                ErrorHandler($res,$url)
                    unless CheckResponse($res, "check HTML content");

                # Tally and store stats
                #
                $TotalSize += $size;
                $TotalTime += $time;

                $Cache{$url} = { TIME => $time, SIZE => $size };

                $ThisUrl = $res->base();
                $Pa->parse($res->content);
            }
        }

        # Second fetch. This fetch does not involve any redirects,
        # because we are hitting the real URL directly. The data
        # is not tallied into the total for the main URL. But we get
        # the data anyway, so we have it for just that URL alone.
        # We will need to report these values in every case.
        #
        # Note that we will *not* parse the contents again.

        # Force a particular host by munging the actual URL
        #
        $RealUrl =~ s#http://[^/]+#http://$ForceHost# if $ForceHost;

        print "Second fetch, for RealUrl $RealUrl\n" if ($Debug);

        unless (exists $Cache{ $RealUrl }) {

            ($time,$size,$res) = FetchUrl( $RealUrl );

            # Cache stats (but do not tally!)
            #
            ## $TotalSize += $size;
            ## $TotalTime += $time;

            $Cache{ $RealUrl } = { TIME => $time, SIZE => $size };
        }

        push @SummaryCache, $RealUrl;

        if ($Recursively or $DidRecurse) {

            # Store the whole recursive fetch with a special URL string.
            #
            my $recursiveUrl = "$url (incl.:$DidRecurse)";

            $Cache{$recursiveUrl} = { TIME => $TotalTime, SIZE => $TotalSize };

            push @SummaryCache, $recursiveUrl;

            if ($TotalSize <= $MinimumSize) {

                ErrorHandler($res,$url,
                    sprintf ("Error: pagesize too small (%d kb) from %s\n",
                        $TotalSize/1024, $url) );
            }

        } # end if ($Recursively or $DidRecurse)

    } # end URL

} # end sub ExamineUrls()

# ---------------------------------------------------------------------------
#
sub PrintFinalResults {

    my $Separator = "----------------------------------------------------\n";

    if ($Verbose or $Debug) {
        print $Separator;
    } else {
        print "OK\n"    if $Status == 0;
        print "Error\n" if $Status == -1;
    }

    for my $url (@SummaryCache) {
        PrintUrl($url);
    }
}

# ---------------------------------------------------------------------------
# MAIN LOGIC
# ---------------------------------------------------------------------------

# Get the list of target URLs
#
@Urls = GetParams();

# Set up a browser
#
$Ua = LWP::UserAgent->new;
$Ua->agent("Mozilla (compatible: LWP $LWP::VERSION)");
$Ua->timeout($Timeout);

# Set up a parser
#
$Pa = HTML::LinkExtor->new (\&HandleParsedLink);

# Fetch the URLs
#
ExamineUrls( @Urls );

PrintFinalResults();

exit($Status);

# ---------------------------------------------------------------------------
# END
# ---------------------------------------------------------------------------
1;