#!/usr/local/bin/perl
# 
#   geturl10.pl-- Retrieve any HTTP URL, and save to a local file.
#
#   (C) 1997 James Marshall (james@jmarshall.com)
#
#   OVERALL OPERATION:
#       First, if a filename is given on the command line, open it for 
#   writing and select() it.  This way, the rest of the program can simply 
#   print with no filehandle, and the output will be correctly routed either 
#   to STDOUT or to the local file.
#       After parsing the hostname, port number, and path from the URL,
#   the program opens a socket to the HTTP server with &newsocketto().
#   It sends a simple HTTP 1.0 GET request, and waits for the socket 
#   response with select().
#       To distinguish between an HTTP 0.9 response (which has no headers)
#   and a normal HTTP 1.x response, the program reads the first five bytes 
#   of the response.  If they are not "HTTP/", then HTTP 0.9 is assumed, 
#   and the entire socket output is copied to STDOUT (or the local file).
#   HTTP 0.9 is very rare.
#       If the response is HTTP 1.x, then the remainder of the status line
#   and headers are read.  If the status code is 301, 302, or 303, the
#   socket is closed and the program essentially restarts, aiming this time
#   for the URL given in the "Location:" response header.  This program
#   redirects a request no more than five times, to avoid a potential 
#   infinite loop.
#       Otherwise, if the status code is anything other than 200 (success), 
#   this program dies.
#       So now we have a 200 response for the URL we requested; the 
#   remaining task is to copy the correct number of bytes from the socket 
#   to the output.  If the "Content-Length:" response header is present, 
#   copy only that many bytes.  Otherwise, copy over all socket output.
#       The program then closes the socket and any output file, and exits.
#
#   NOTE:
#       Some parsing in this program isn't perfect, but will almost always 
#   work.  Specifically, in its quest to be an understandable demo, this
#   program doesn't strictly follow the BNF's for things like header field
#   contents and URL's.  If you plan to write commercial-quality software, 
#   use more complete regular expressions.  See the BNF's in the HTTP spec, 
#   RFC 822, and RFC 2396 (URL/URI syntax).
#
#   For platform-independence, this program uses \015\012 instead of \r\n
#   for the CRLF sequence.
#
#   To use this script with Perl 4:
#       1. Remove "use Socket" and see note in &newsocketto().
#       2. Add "$*= 1"; remove "m" flag from location/content-length matches.
#

use Socket ;    # Perl 5 only

# Uncomment this for Perl 4
# $*= 1 ;

$/= "\012" ;    # default for Unix, but Macs need it set explicitly


# Read the URL and optional filename from the command line
(($URL,$fname)= @ARGV) || &usage ;
&usage if $URL=~ /^-/ ;

# Open and select the local file, if given on the command line
if ($fname) {
    open(SAVEFILE, ">$fname") || die "Couldn't open $fname for writing: $!" ;
    select(SAVEFILE) ;
}



# Basically, the whole program.
# Put this in a block, to restart on 300-level responses.
GETURL: {

    # Only support HTTP URLs; scheme defaults to HTTP
    $URL=~ m#^([\w+.-]+)://(.*)# && ( ($scheme,$URL)= ($1,$2) ) ;
    die "Sorry, $0 only supports HTTP URLs.\n" unless $scheme=~ /^(http)?$/i ;

    # parse the URL, simply (doesn't do much error-checking)
    ($host, $port, $path)= ($URL=~ m#([^/:]*):?([^/]*)(/.*)?$#i) ;
    $port= ($port || 80) ;
    $path= ($path || "/") ;


    # Open socket to host
    ($success, $errmsg)= &newsocketto(*S, $host, $port) ;
    die $errmsg unless $success ;

    # Send request, including User-Agent: header for Net politeness
    print S "GET $path HTTP/1.0\015\012",
            "User-Agent: GetURL10/1.0\015\012\015\012" ;

    # Wait for socket response with select()
    vec($rin= '', fileno(S), 1)= 1 ;
    select($rin, undef, undef, 60) || die "No response from $host:$port: $!" ;


    # Read first five chars, to determine if is HTTP 0.9
    $numread= 0 ;
    while ( ($numread<5) 
            && ($thisread= read(S, $status_line, 5-$numread, $numread)) ) {
        $numread+= $thisread ;
    }
    defined($thisread) || die "Couldn't read response: $!" ;

    # handle the rare HTTP 0.9 response (which has no header data)
    if ($status_line!~ m#^HTTP/#) {
        print $status_line ;
        print while read(S, $_, 16384) ;

    # handle HTTP 1.x response
    } else {
        # finish reading the status line
        # $/= "\012" ;
        $status_line.= <S> ;
        ($status_code)= ($status_line=~ m#^HTTP/\d+\.\d+\s+(\d+)#) ;

        # read $headers
        $headers= '' ;
        while (<S>) {
            last if /^\015?\012/ ;  # end on LF or CRLF
            $headers.= $_ ;
        }
        $headers=~ s/\015?\012[ \t]+/ /g ;     # unfold multi-line headers

        # Redirect 301, 302, 303 responses, but avoid infinite redirection loop
        if ($status_code=~ /^(301|302|303)$/) {
            unless ( ($URL)= ($headers=~ /^location:[ \t]*(\S*)/im) ) {
                die "No Location: header in $status_code response; "
                  . "headers are:\n$status_line$headers\n\n" ;
            }
            ($numredirects++ > 5) 
                && die "Redirected more than five times, quitting" ;
            print STDERR "Redirecting to $URL\n" ;
            close(S) ;
            redo GETURL ;
        }


        # If not 200 response, then die
        ($status_code == 200)
            || die "Got $status_code response; headers are:\n"
                 . "$status_line$headers\n\n" ;


        # If there is Content-Length: header, only copy that many bytes 
        #   to STDOUT.
        if ( ($content_length)= 
                ($headers=~ /^content-length:[ \t]*(\d*)/im) ) {
            $lefttoget= $content_length ;
            while ($lefttoget 
                    && ($thisread= read(S, $buf, &min($lefttoget,16384)) )) {
                print $buf ;
                $lefttoget-= $thisread ;
            }
            defined($thisread) || die "Couldn't read response body: $!" ;

            # Die if we didn't get all the bytes we were expecting
            $lefttoget
                && die "Not all data was read; expected $content_length, got "
                        . ($content_length-$lefttoget) . "." ;

        # No Content-Length: header, so copy entire socket output to STDOUT
        } else {
            print while read(S, $_, 16384) ;
        }

    }
    
    close(S) ;


} # GETURL


# Close the local file if needed
close(SAVEFILE) if $fname ;


exit ;


#--------- newsocketto, usage, min -----------------------------------


# Open a socket to a given host and port.  
# Returns TRUE, with open socket in S, or returns (FALSE, $error_message).
# NOTE IF USING PERL 4: Remove "use Socket" from the beginning of the 
#   script, and add the following line, setting your values as necessary:
#     $AF_INET= 2 ; $SOCK_STREAM= 1 ;  # Usually in /usr/include/sys/socket.h
#   Then, change AF_INET to $AF_INET and SOCK_STREAM to $SOCK_STREAM below.
sub newsocketto {
    local(*S, $host, $port)= @_ ;
    local($hostaddr, $remotehost) ;

    # Create the remote host data structure, from host name or IP address
    ($hostaddr= ($host=~ /^(\d+)\.(\d+)\.(\d+)\.(\d+)$/)
                  ?  pack('C4', $1, $2, $3, $4)     # for IP address
                  :  (gethostbyname($host))[4] )    # for alpha host name
        || return(0, "Couldn't find IP address for $host") ;

    $remotehost= pack('S n a4 x8', AF_INET, $port, $hostaddr) ;

    # Create the socket and connect to the remote host
    socket(S, AF_INET, SOCK_STREAM, (getprotobyname('tcp'))[2])
        || return(0, "Couldn't create socket: $!") ;
    connect(S, $remotehost) 
        || return(0, "Couldn't connect to $host:$port: $!") ;

    select((select(S), $|=1)[0]) ;      # unbuffer the socket
    return (1, "") ;      # success!
}


# Explain usage
sub usage {
    die <<EOF ;
To download an HTTP URL to stdout, or to a local file, use
    $0 URL [filename]
EOF
}


# Return the minimum of a list of values
sub min {
    local($min)= $_[0]+0 ;  # force to numeric
    foreach (@_) {
        $min= $_ if $_<$min ;
    }
    return $min ;
}

