#!/usr/local/bin/perl # # geturl10.pl-- Retrieve any HTTP URL, and save to a local file. # # (C) 1997 James Marshall (james@jmarshall.com) # # OVERALL OPERATION: # First, if a filename is given on the command line, open it for # writing and select() it. This way, the rest of the program can simply # print with no filehandle, and the output will be correctly routed either # to STDOUT or to the local file. # After parsing the hostname, port number, and path from the URL, # the program opens a socket to the HTTP server with &newsocketto(). # It sends a simple HTTP 1.0 GET request, and waits for the socket # response with select(). # To distinguish between an HTTP 0.9 response (which has no headers) # and a normal HTTP 1.x response, the program reads the first five bytes # of the response. If they are not "HTTP/", then HTTP 0.9 is assumed, # and the entire socket output is copied to STDOUT (or the local file). # HTTP 0.9 is very rare. # If the response is HTTP 1.x, then the remainder of the status line # and headers are read. If the status code is 301, 302, or 303, the # socket is closed and the program essentially restarts, aiming this time # for the URL given in the "Location:" response header. This program # redirects a request no more than five times, to avoid a potential # infinite loop. # Otherwise, if the status code is anything other than 200 (success), # this program dies. # So now we have a 200 response for the URL we requested; the # remaining task is to copy the correct number of bytes from the socket # to the output. If the "Content-Length:" response header is present, # copy only that many bytes. Otherwise, copy over all socket output. # The program then closes the socket and any output file, and exits. # # NOTE: # Some parsing in this program isn't perfect, but will almost always # work. Specifically, in its quest to be an understandable demo, this # program doesn't strictly follow the BNF's for things like header field # contents and URL's. If you plan to write commercial-quality software, # use more complete regular expressions. See the BNF's in the HTTP spec, # RFC 822, and RFC 2396 (URL/URI syntax). # # For platform-independence, this program uses \015\012 instead of \r\n # for the CRLF sequence. # # To use this script with Perl 4: # 1. Remove "use Socket" and see note in &newsocketto(). # 2. Add "$*= 1"; remove "m" flag from location/content-length matches. # use Socket ; # Perl 5 only # Uncomment this for Perl 4 # $*= 1 ; $/= "\012" ; # default for Unix, but Macs need it set explicitly # Read the URL and optional filename from the command line (($URL,$fname)= @ARGV) || &usage ; &usage if $URL=~ /^-/ ; # Open and select the local file, if given on the command line if ($fname) { open(SAVEFILE, ">$fname") || die "Couldn't open $fname for writing: $!" ; select(SAVEFILE) ; } # Basically, the whole program. # Put this in a block, to restart on 300-level responses. GETURL: { # Only support HTTP URLs; scheme defaults to HTTP $URL=~ m#^([\w+.-]+)://(.*)# && ( ($scheme,$URL)= ($1,$2) ) ; die "Sorry, $0 only supports HTTP URLs.\n" unless $scheme=~ /^(http)?$/i ; # parse the URL, simply (doesn't do much error-checking) ($host, $port, $path)= ($URL=~ m#([^/:]*):?([^/]*)(/.*)?$#i) ; $port= ($port || 80) ; $path= ($path || "/") ; # Open socket to host ($success, $errmsg)= &newsocketto(*S, $host, $port) ; die $errmsg unless $success ; # Send request, including User-Agent: header for Net politeness print S "GET $path HTTP/1.0\015\012", "User-Agent: GetURL10/1.0\015\012\015\012" ; # Wait for socket response with select() vec($rin= '', fileno(S), 1)= 1 ; select($rin, undef, undef, 60) || die "No response from $host:$port: $!" ; # Read first five chars, to determine if is HTTP 0.9 $numread= 0 ; while ( ($numread<5) && ($thisread= read(S, $status_line, 5-$numread, $numread)) ) { $numread+= $thisread ; } defined($thisread) || die "Couldn't read response: $!" ; # handle the rare HTTP 0.9 response (which has no header data) if ($status_line!~ m#^HTTP/#) { print $status_line ; print while read(S, $_, 16384) ; # handle HTTP 1.x response } else { # finish reading the status line # $/= "\012" ; $status_line.= ; ($status_code)= ($status_line=~ m#^HTTP/\d+\.\d+\s+(\d+)#) ; # read $headers $headers= '' ; while () { last if /^\015?\012/ ; # end on LF or CRLF $headers.= $_ ; } $headers=~ s/\015?\012[ \t]+/ /g ; # unfold multi-line headers # Redirect 301, 302, 303 responses, but avoid infinite redirection loop if ($status_code=~ /^(301|302|303)$/) { unless ( ($URL)= ($headers=~ /^location:[ \t]*(\S*)/im) ) { die "No Location: header in $status_code response; " . "headers are:\n$status_line$headers\n\n" ; } ($numredirects++ > 5) && die "Redirected more than five times, quitting" ; print STDERR "Redirecting to $URL\n" ; close(S) ; redo GETURL ; } # If not 200 response, then die ($status_code == 200) || die "Got $status_code response; headers are:\n" . "$status_line$headers\n\n" ; # If there is Content-Length: header, only copy that many bytes # to STDOUT. if ( ($content_length)= ($headers=~ /^content-length:[ \t]*(\d*)/im) ) { $lefttoget= $content_length ; while ($lefttoget && ($thisread= read(S, $buf, &min($lefttoget,16384)) )) { print $buf ; $lefttoget-= $thisread ; } defined($thisread) || die "Couldn't read response body: $!" ; # Die if we didn't get all the bytes we were expecting $lefttoget && die "Not all data was read; expected $content_length, got " . ($content_length-$lefttoget) . "." ; # No Content-Length: header, so copy entire socket output to STDOUT } else { print while read(S, $_, 16384) ; } } close(S) ; } # GETURL # Close the local file if needed close(SAVEFILE) if $fname ; exit ; #--------- newsocketto, usage, min ----------------------------------- # Open a socket to a given host and port. # Returns TRUE, with open socket in S, or returns (FALSE, $error_message). # NOTE IF USING PERL 4: Remove "use Socket" from the beginning of the # script, and add the following line, setting your values as necessary: # $AF_INET= 2 ; $SOCK_STREAM= 1 ; # Usually in /usr/include/sys/socket.h # Then, change AF_INET to $AF_INET and SOCK_STREAM to $SOCK_STREAM below. sub newsocketto { local(*S, $host, $port)= @_ ; local($hostaddr, $remotehost) ; # Create the remote host data structure, from host name or IP address ($hostaddr= ($host=~ /^(\d+)\.(\d+)\.(\d+)\.(\d+)$/) ? pack('C4', $1, $2, $3, $4) # for IP address : (gethostbyname($host))[4] ) # for alpha host name || return(0, "Couldn't find IP address for $host") ; $remotehost= pack('S n a4 x8', AF_INET, $port, $hostaddr) ; # Create the socket and connect to the remote host socket(S, AF_INET, SOCK_STREAM, (getprotobyname('tcp'))[2]) || return(0, "Couldn't create socket: $!") ; connect(S, $remotehost) || return(0, "Couldn't connect to $host:$port: $!") ; select((select(S), $|=1)[0]) ; # unbuffer the socket return (1, "") ; # success! } # Explain usage sub usage { die <