#!/usr/bin/perl # # CGIProxy 2.2.4 # # CGIProxy (nph-proxy.cgi): a proxy in the form of a CGI script. # Retrieves the resource at any HTTP or FTP URL, updating embedded URLs # in HTML and all other resources to point back through this script. By # default, no identifying user info is sent to the server. Options include # text-only proxying to save bandwidth, cookie filtering, ad filtering, # script removal, user-defined encoding of the target URL, and much more. # Besides running as a CGI script, can also run under mod_perl, as a # FastCGI script, or can use its own embedded HTTP server. # Requires Perl 5. # # Copyright (C) 1996, 1998-2018 by James Marshall, james@jmarshall.com # All rights reserved. Free for non-commercial use; commercial use # requires a license. # # For the latest, see https://jmarshall.com/tools/cgiproxy/ # # # IMPORTANT NOTE ABOUT ANONYMOUS BROWSING: # # CGIProxy was originally made for indirect browsing more than # anonymity, but since people are using it for anonymity, I've tried # to make it as anonymous as possible. Suggestions welcome. For best # anonymity, browse with JavaScript turned off. That said, please notify # me if you find any privacy holes, even when using JavaScript. # Anonymity is good, but may not be bulletproof. For example, if even # a single unchecked JavaScript statement can be run, your anonymity # can be compromised. I've tried to handle JS in every place it can # exist, but please tell me if I missed any. Also, browser plugins # or other executable extensions may be able to reveal you to a server. # Also, be aware that this script doesn't modify PDF files or other # third-party document formats that may contain linking ability, so # you will lose your anonymity if you follow links in such files. # If you find any other way your anonymity can be compromised, please let # me know. 
# # # INSTALLATION: # # For complete installation instructions, see # https://jmarshall.com/tools/cgiproxy/install.html # # To upgrade an existing installation of CGIProxy to the latest version, # see the UPGRADING section below. # # CGIProxy can run in any of four ways: as a CGI script, as a mod_perl # script, as a FastCGI script, or with its own embedded Web server. For # the first three, you'll need a working *secure* Web server first, one # configured to support the way you want to run CGIProxy. For the # embedded server, you'll need an SSL certificate / private key pair. # To get a free SSL certificate for your server, we recommend using the # Let's Encrypt service, at https://letsencrypt.org ; it's a certificate # authority (CA) that grants free SSL certificates, and the service is # run by people we trust (as of 2018). # # Starting in version 2.2.1, CGIProxy has an installation wizard that handles # most of what you need to install it. After copying nph-proxy.cgi to your # server, run "./nph-proxy.cgi install" (or in Windows, "perl nph-proxy.cgi install"). # This will: # - ask you some questions # - let you change any settings with a configuration menu # - create the directory structure under $PROXY_DIR (usually $HOME/cgiproxy/) # - create the configuration file $HOME/cgiproxy/cgiproxy.conf # - install all non-standard Perl (CPAN) modules required by CGIProxy # - create an empty database # - copy nph-proxy.cgi into place and modify it as needed # - add a cron job to purge the database every night, if needed # - add cron jobs to report usage every month, if needed # # When installing the Perl modules, the installation wizard will use the # system's package manager if running as root. Supported package # managers are dnf, yum, and apt-get. If none are found, or if not # running as root, the cpan utility will be used. # To *only* install the Perl modules, and not do any other installation # tasks, run "./nph-proxy.cgi install-modules". 
# If you run the installation wizard as root, the file permissions and # ownership of all directories and files will be set to be as secure # as possible. However, even if you can't install as root, this will # be done as well as possible and the script should still work. # # Also starting in version 2.2.1, you no longer need to edit this file to # configure it. Instead, use the simple configuration menu by running # "./nph-proxy.cgi config". Exception: If you need to change $PROXY_DIR, # then the menu won't do it-- you need to either set it in this file, in # the configuration section below, or run the "install" command (which # asks you for $PROXY_DIR and modifies this script to set it below). # # To see a simple usage message, run "./nph-proxy.cgi -?". # # # UPGRADING: # To upgrade to the latest version from version 2.2.3 or later, just run # "./nph-proxy.cgi upgrade". To run this automatically e.g. every month, # add a cron job to do it. # # To upgrade from version 2.2.2 or earlier, the easiest way is to first # uninstall the old version, and then install the new version. Using # this newer version of nph-proxy.cgi (the script file you're reading # right now), run these two commands: # ./nph-proxy.cgi uninstall # ./nph-proxy.cgi install # # NOTE: If you upgrade from version 2.2.2 or earlier and you want to # preserve your old configuration settings, copy cgiproxy.conf to # somewhere NOT under $PROXY_DIR/ , BEFORE you run the "uninstall" # command. (The "uninstall" command deletes the whole $PROXY_DIR # directory.) Then, when you run the "install" command, add the # parameter "--old-config my_old_cgiproxy.conf", with the correct # filename, like this: # ./nph-proxy.cgi install --old-config my_old_cgiproxy.conf # # # CONFIGURATION: # . After installing, use the configuration menu by running # "./nph-proxy.cgi config". You should never need to edit the # configuration in this script file. # . 
If you're using either a MySQL/MariaDB or Oracle database to store cookies, # you need to set $DB_DRIVER, $DB_USER, $DB_PASS, and possibly $DB_SERVER . # See the notes by those settings for more details. Note that you need to # purge the database periodically by running "./nph-proxy.cgi purge-db", # with a cron job on Unix or Mac, or with the Task Scheduler in Windows. # The default database driver is SQLite, which doesn't need a username or # password or even a running database engine, but still requires periodic # purging. On Unix and Mac, the installation wizard creates this cron job # automatically if needed. # . If you're willing to report simple usage counts to us (as set with # $REPORT_USAGE), you'll need a monthly cron job (or Task in Windows) # to run "./nph-proxy.cgi report-usage". In addition, you need another # cron job run at midnight on the first of every month to run # "./nph-proxy.cgi count-unique-users". On Unix and Mac, the installation # wizard creates these cron jobs automatically if needed. # . If you're using another HTTP or SSL proxy, set $HTTP_PROXY, # $SSL_PROXY, and $NO_PROXY as needed. If those proxies use # authentication, set $PROXY_AUTH and $SSL_PROXY_AUTH accordingly. # . If you're using a SOCKS proxy, set $SOCKS_PROXY and possibly # $SOCKS_USERNAME and $SOCKS_PASSWORD . # . If this is running on an insecure server that doesn't use port 80, set # $RUNNING_ON_SSL_SERVER=0 (otherwise, the default of '' is fine). # . If you plan to run CGIProxy as a FastCGI script, see the configuration # section "FastCGI configuration". # . If you plan to run CGIProxy using its own embedded server, see the # configuration section "Embedded server configuration". You'll need # a certificate and private key (key pair) in PEM format-- we recommend # using Let's Encrypt to get a free key pair, and their Certbot for a # tool to automatically manage it. # . 
See http://www.jmarshall.com/tools/cgiproxy/options.html#env , in the section # "OPTIONS RELATED TO YOUR SERVER/NETWORK ENVIRONMENT", for other options # you may need to set. # # Other options include: # . Set $TEXT_ONLY, $REMOVE_COOKIES, $REMOVE_SCRIPTS, $FILTER_ADS, # $HIDE_REFERER, and $INSERT_ENTRY_FORM as desired. Set # $REMOVE_SCRIPTS if anonymity is important. # . To let the user choose all of those settings (except $TEXT_ONLY), # set $ALLOW_USER_CONFIG=1. # . To change the encoding format of the URL, modify the # proxy_encode() and proxy_decode() routines. The default # routines are suitable for simple PATH_INFO compliance. # . To encode cookies, modify the cookie_encode() and cookie_decode() # routines. # . You can restrict which servers this proxy will access, with # @ALLOWED_SERVERS and @BANNED_SERVERS. # . Similarly, you can specify allowed and denied server lists for # both cookies and scripts. # . For security, you can ban access to private IP ranges, with # @BANNED_NETWORKS. # . If filtering ads, you can customize this with a few settings. # . To insert your own block of HTML into each page, set $INSERT_HTML # or $INSERT_FILE. # . As a last resort, if you really can't run this script as NPH, # you can try to run it as non-NPH by setting $NOT_RUNNING_AS_NPH=1. # BUT, read the notes and warnings above that line. Caveat surfor. # . For crude load-balancing among a set of proxies, set @PROXY_GROUP. # . Other config is possible; see the user configuration section. # . If heavy use of this proxy puts a load on your server, see the # "NOTES ON PERFORMANCE" section below. # # For more info, read the comments above any config options you set. # # For a full list of options, see https://jmarshall.com/tools/cgiproxy/options.html # # This script MUST be installed as a non-parsed header (NPH) script. # In Apache and many other servers, this is done by simply starting the # filename with "nph-". 
However, it can usually run as a non-NPH script # by using the $NOT_RUNNING_AS_NPH option. # # # TO USE: # Start a browsing session by visiting the script's URL with no parameters. # You can bookmark pages you browse to through the proxy, or link to # the URLs that are generated. # # # NOTES ON PERFORMANCE: # Remember that most of the delay experienced by the user is from waiting # on two network connections, which we have little control over. The # advice below only applies if your server CPU is getting overloaded. # Visiting sites with a lot of JavaScript and Flash are most likely to # overload your CPU, because modifying those is what takes most of the # time. # If you can, use mod_perl. Starting with version 1.3.1, this should # work under mod_perl, which requires Perl 5.004 or later. If you use # mod_perl, be careful to install this as an NPH script, i.e. set the # "PerlSendHeader Off" configuration directive (or "PerlOptions -ParseHeaders" # if using mod_perl 2.x). For more info, see the mod_perl documentation, # or https://jmarshall.com/tools/cgiproxy/install.html . # If you can't use mod_perl, try using FastCGI. You'll need to configure # your Web server to use FastCGI. For more info, see the FastCGI # documentation, or https://jmarshall.com/tools/cgiproxy/install.html . # If you can't use mod_perl or FastCGI, try running CGIProxy as its own # embedded server. You'll need a key pair (certificate and private key). # If you use mod_perl, FastCGI, or the embedded server, and modify this # script, see the note near the "reset 'a-z'" line below, regarding # UPPER_CASE and lower_case variable names. # # If performance on the browser is bad for JS-heavy sites like facebook, # then close other browser windows and other CPU-heavy processes, and # see the comments above the setting of %REDIRECTS below. Also, try # using a browser other than MSIE-- it seems to have the most problems. # # # TO DO: # What I want to hear about: # . Any HTML tags not being converted here. 
# . Any method of introducing JavaScript or other script, that's not # being handled here. # . Any script MIME types other than those already in @SCRIPT_MIME_TYPES. # . Any MIME types other than text/html that have links that need to # be converted. # plug any other script holes (e.g. MSIE-proprietary, other MIME types?) # more error checking? # find a simple encryption technique for proxy_encode() # For ad filtering, add option to disable images from servers other than # that of the containing HTML page? Is it worth it? # # # BUGS: # Anonymity may not be perfect. In particular, there may be some remaining # JavaScript or Flash holes. Please tell me if you find any. # # # I first wrote this in 1996 as an experiment to allow indirect browsing. # The original seed was a program I wrote for Rich Morin's article # in the June 1996 issue of Unix Review, online at # http://www.cfcl.com/tin/P/199606.shtml. # # Confession: I didn't originally write this with the spec for HTTP # proxies in mind, and there are probably some violations of the protocol # (at least for proxies). This whole thing is one big violation of the # proxy model anyway, so I hereby rationalize that the spec can be widely # interpreted here. If there is demand, I can make it more conformant. # The HTTP client and server components should be fine; it's just the # special requirements for proxies that may not be followed.
#
#--------------------------------------------------------------------------

use strict ;
use warnings ;
no warnings qw(uninitialized redefine) ;   # we use defaults all the time

use Cwd ;
use Encode ;
use IO::Handle ;
use IO::Select ;
use File::Path ;
use File::Spec ;
use Time::Local ;
use Digest::SHA ;
use Archive::Tar ;
use Data::Dumper ;
use Getopt::Long ;
use Term::ReadLine ;
use Socket qw(:all) ;
use IO::Compress::Gzip ;
use IO::Uncompress::Gunzip ;
use Net::Domain qw(hostfqdn) ;
use Fcntl qw(:DEFAULT :flock) ;
use POSIX qw(:sys_wait_h setsid);
use Time::HiRes qw(gettimeofday tv_interval) ;
use Errno qw(EINTR EAGAIN EWOULDBLOCK ENOBUFS EPIPE) ;


# Optional modules:  load them if present, carry on silently if not.
# Math::Random::Secure provides a cryptographically secure version of rand() .
eval { require Math::Random::Secure ; Math::Random::Secure->import(qw(rand srand irand)) } ;


# The list of configuration variable names has to exist at compile time,
#   because the later "use vars @CONFIG_VARS, ..." statement reads it while
#   the file is still being compiled.  Hence the BEGIN{} block.
use vars qw(@CONFIG_CATEGORIES %CONFIG_CATEGORIES @CONFIG_VARS @OBSOLETE_CONFIG_VARS) ;
BEGIN {
    # Alternating (category name, list of variable names) entries.  A plain
    #   list is used rather than a hash literal so that the order of the
    #   categories is kept.
    my @category_table= (
        'server/network environment',
            [qw($RUN_AS_USER $RUN_AS_GROUP $SECRET_PATH $LOCAL_LIB_DIR
                $RUNNING_ON_SSL_SERVER $NOT_RUNNING_AS_NPH
                $HTTP_PROXY $SSL_PROXY $NO_PROXY $PROXY_AUTH $SSL_PROXY_AUTH
                $SOCKS_PROXY $SOCKS_USERNAME $SOCKS_PASSWORD
                $USER_FACING_PORT $REPORT_USAGE)],

        'FastCGI',
            [qw($FCGI_SOCKET $FCGI_MAX_REQUESTS_PER_PROCESS $FCGI_NUM_PROCESSES)],

        'embedded server',
            [qw($CERTIFICATE_FILE $PRIVATE_KEY_FILE $EMB_USERNAME $EMB_PASSWORD)],

        'database',
            [qw($DB_DRIVER $DB_SERVER $DB_NAME $DB_USER $DB_PASS $USE_DB_FOR_COOKIES)],

        'common options',
            [qw($DEFAULT_LANG $TEXT_ONLY $REMOVE_COOKIES $REMOVE_SCRIPTS $FILTER_ADS
                $HIDE_REFERER $INSERT_ENTRY_FORM $ALLOW_USER_CONFIG
                @ALLOWED_SERVERS @BANNED_SERVERS @BANNED_NETWORKS
                @ALLOWED_COOKIE_SERVERS @BANNED_COOKIE_SERVERS
                @ALLOWED_SCRIPT_SERVERS @BANNED_SCRIPT_SERVERS
                @BANNED_IMAGE_URL_PATTERNS $RETURN_EMPTY_GIF $NO_COOKIE_WITH_IMAGE
                $QUIETLY_EXIT_PROXY_SESSION $PROXIFY_SCRIPTS $PROXIFY_SWF
                $ENCODE_URL_INPUT $USER_IP_ADDRESS_TEST $DESTINATION_SERVER_TEST
                %REDIRECTS)],

        'page header',
            [qw($INSERT_HTML $INSERT_FILE $ANONYMIZE_INSERTION $FORM_AFTER_INSERTION)],

        'seldom-used',
            [qw(@PROXY_GROUP $SESSION_COOKIES_ONLY $MINIMIZE_CACHING $USER_AGENT
                @TRANSMIT_HTML_IN_PARTS_URLS $USE_PASSIVE_FTP_MODE $SHOW_FTP_WELCOME
                $PROXIFY_COMMENTS $USE_POST_ON_START $REMOVE_TITLES
                $NO_BROWSE_THROUGH_SELF $NO_LINK_TO_START $MAX_REQUEST_SIZE
                $ALLOW_UNPROXIFIED_SCRIPTS $COOKIE_PATH_FOLLOWS_SPEC
                $RESPECT_THREE_DOT_RULE $ALERT_ON_CSP_VIOLATION
                %TIMEOUT_MULTIPLIER_BY_HOST $ALLOW_RTMP_PROXY)],
    ) ;

    # Consume the table two entries at a time, recording each category name
    #   in order and mapping it to its list of variable names.
    @CONFIG_CATEGORIES= () ;
    while (my($category, $var_names)= splice(@category_table, 0, 2)) {
        push(@CONFIG_CATEGORIES, $category) ;
        $CONFIG_CATEGORIES{$category}= $var_names ;
    }

    @OBSOLETE_CONFIG_VARS= qw($INSERTION_FRAME_HEIGHT) ;

    # Flatten all categories, in category order, into one list of names.
    @CONFIG_VARS= map { @{$CONFIG_CATEGORIES{$_}} } @CONFIG_CATEGORIES ;
}


# First block below is config variables that aren't in the config file, second
#   block is sort-of config variables, third block is persistent constants,
#   fourth block is would-be persistent constants (not set until needed),
#   fifth block is constants for JavaScript processing (mostly regular expressions),
#   and last block is variables.
# Removed $RE_JS_STRING_LITERAL to help with Perl's long-literal-string bug,
#   but can replace it later if/when that is fixed.  Added
#   $RE_JS_STRING_LITERAL_START, $RE_JS_STRING_REMAINDER_1, and
#   $RE_JS_STRING_REMAINDER_2 as part of the workaround.
# Declare every package global used by the script:  first the configuration
#   variables collected in @CONFIG_VARS, then the names listed here.
use vars @CONFIG_VARS, qw(
    $PROXY_VERSION $PROXY_ID $PROXY_DIR
    $OVERRIDE_SECURITY
    $ENCODE_DECODE_BLOCK_IN_JS
    @SCRIPT_MIME_TYPES @OTHER_TYPES_TO_REGISTER @TYPES_TO_HANDLE
    $NON_TEXT_EXTENSIONS
    @RTL_LANG
    $RUN_METHOD $RUN_AS_USER_ID $RUN_AS_GROUP_ID
    @MONTH @WEEKDAY %UN_MONTH @ALPHANUMERIC
    $MIME_TYPE_BY_EXT
    %RTL_LANG
    $DB_FULLPATH $DB_HOSTPORT $DBH
    $STH_UPD_COOKIE $STH_INS_COOKIE $STH_SEL_COOKIE $STH_SEL_ALL_COOKIES
    $STH_DEL_COOKIE $STH_DEL_ALL_COOKIES
    $STH_UPD_SESSION $STH_INS_SESSION $STH_SEL_IP
    $STH_PURGE_SESSIONS $STH_PURGE_SESSIONS_USAGE $STH_PURGE_COOKIES
    $USER_IP_ADDRESS_TEST_H $DESTINATION_SERVER_TEST_H
    $RUNNING_ON_IIS
    @NO_PROXY
    $NO_CACHE_HEADERS
    @ALL_TYPES %MIME_TYPE_ID $SCRIPT_TYPE_REGEX $TYPES_TO_HANDLE_REGEX
    $THIS_HOST $ENV_SERVER_PORT $ENV_SCRIPT_NAME $THIS_SCRIPT_URL
    $SSL_SUPPORTED
    $RTMP_SERVER_PORT
    %ENV_UNCHANGING
    $HAS_SET_CONSTANTS
    $TERM_RL
    %MSG @MSG_KEYS
    $CUSTOM_INSERTION %IN_CUSTOM_INSERTION
    %OPTION_DESCRIPTIONS
    $RE_JS_WHITE_SPACE $RE_JS_LINE_TERMINATOR $RE_JS_COMMENT
    $RE_JS_IDENTIFIER_START $RE_JS_IDENTIFIER_PART $RE_JS_IDENTIFIER_NAME
    $RE_JS_PUNCTUATOR $RE_JS_DIV_PUNCTUATOR $RE_JS_BINARY_OPERATOR
    $RE_JS_NUMERIC_LITERAL $RE_JS_ESCAPE_SEQUENCE
    $RE_JS_STRING_LITERAL
    $RE_JS_STRING_LITERAL_START $RE_JS_STRING_REMAINDER_1 $RE_JS_STRING_REMAINDER_2
    $RE_JS_REGULAR_EXPRESSION_LITERAL
    $RE_JS_TEMPLATE_START $RE_JS_TEMPLATE_MIDDLE_OR_TAIL
    $RE_JS_TOKEN $RE_JS_INPUT_ELEMENT_DIV $RE_JS_INPUT_ELEMENT_REG_EXP
    $RE_JS_SKIP $RE_JS_SKIP_NO_LT
    %RE_JS_SET_TRAPPED_PROPERTIES %RE_JS_SET_RESERVED_WORDS_NON_EXPRESSION
    %RE_JS_SET_ALL_PUNCTUATORS
    $JSLIB_BODY $JSLIB_BODY_GZ
    $HTTP_VERSION $HTTP_1_X
    $URL
    $STDIN $STDOUT
    $ZERO
    $now
    $scookie_names $session_usage
    $session_id $session_id_persistent $session_cookies
    $packed_flags $encoded_URL $doing_insert_here $env_accept
    $e_remove_cookies $e_remove_scripts $e_filter_ads $e_insert_entry_form
    $e_hide_referer
    $images_are_banned_here $scripts_are_banned_here $cookies_are_banned_here
    $scheme $authority $path $host $port $username $password
    $csp $csp_ro $csp_is_supported
    $cookie_to_server %auth
    $script_url $url_start $url_start_inframe $url_start_noframe
    $lang $dir
    $is_in_frame $expected_type
    $base_url $base_scheme $base_host $base_path $base_file $base_unframes
    $default_style_type $default_script_type
    $status $headers $body $charset $meta_charset $is_html $is_xhtml
    %in_mini_start_form
    $does_write
    $swflib $AVM2_BYTECODES
    $xhr_origin
    $temp_counter
    $debug
) ;


#--------------------------------------------------------------------------
#    user configuration
#--------------------------------------------------------------------------

#
#**************************************************************************
#
# IMPORTANT NOTE:
# As of version 2.2.1, you no longer need to edit this file to configure
# CGIProxy.  Instead, when you run "./nph-proxy.cgi install", a short wizard
# will ask you for any required settings, and then will let you set any
# of the configuration variables below with a simple command-line menu.  The
# complete configuration will then be saved in the external file "cgiproxy.conf",
# under $PROXY_DIR .
# Any time after that, you can run the configuration menu by running
# "./nph-proxy.cgi config".
# That said, if you do edit the variables below, they will be used as the
# initial configuration when "./nph-proxy.cgi install" is first run.  Once the
# external configuration file has been created, this section will be ignored.
# There is one exception:  $PROXY_DIR .  The installation wizard asks you for
# its value, which must be an absolute path.  Then, after the wizard copies
# this script file into place, it modifies the line below that sets $PROXY_DIR
# to reflect what you entered.  This special treatment for $PROXY_DIR is
# because we can't put it in the configuration file, because we need it to
# find the configuration file in the first place.
# The section below is still useful, as the comments above each config
# variable are the most complete descriptions of them.
Look in here # if you need more info than the short descriptions displayed by the # configuration menu. # #************************************************************************** $PROXY_VERSION= '2.2.4' ; # ************************************************************************* # *** This is the only configuration variable that cannot be set by using # *** the configuration menu. Instead, the installation wizard asks # *** you for its value. # ************************************************************************* # For certain purposes, CGIProxy needs to create files. This is where # those will go. # The installation wizard asks for the value of $PROXY_DIR , which must be # an absolute path. Then, after the wizard copies this script file into # place, it modifies the line below to be whatever the proxy owner entered. # If this is set to a relative path before running the installation wizard, # then it will be interpreted relative to the user's home directory. # This directory has to be readable and writeable by the userID that CGIProxy # runs as; that userID is set in the Web server configuration (if this is running # as a CGI script or under mod_perl), or else it's the userID used to start # the FastCGI server or the embedded server. The installation wizard will # try to set permissions appropriately for this. # Note that you need to use "\\" to represent a single backslash, such as in Windows. # Leading drive letters (e.g. for Windows) are allowed. $PROXY_DIR= 'cgiproxy' ; # Most initial configuration is in this routine. (Ignore this if you don't know # what it means.) sub config { # ************************************************************************* # *** Note that these are now set in the configuration wizard, so you don't need to # *** set them here. If you do set them *both* here, these values will be used # *** and the wizard won't ask you questions about them. 
# ************************************************************************* # If you have root access and can run "./nph-proxy.cgi install" as root, then set $RUN_AS_USER # to either the username or numeric user ID that the script will run as, and # set $RUN_AS_GROUP as the group name or numeric group ID that the script will # run as. When run as a CGI script or under mod_perl, these are usually the Web # server's username/group, or possibly the script owner's username/group if using # Apache with the suEXEC feature turned on. # Setting these lets "./nph-proxy.cgi install" create the needed directories ($PROXY_DIR # and subdirectories) and a SQLite database file (if using SQLite) with the right # permissions and ownership. # If you're using the embedded server, and you run this script as the root user # in order to use port 443, it's a good idea to change the user and group IDs # to something with fewer permissions. You can do this too by setting # $RUN_AS_USER and $RUN_AS_GROUP. # In any case, these have to be set to an existing user and group on the server, # i.e. CGIProxy doesn't create the user and group if they don't already exist. # If these are not set, they will default to the owner and group of this script file. # These are not needed when installing on a Windows server-- you don't need root # access to use port 443 on Windows. #$RUN_AS_USER= 'nobody' ; #$RUN_AS_GROUP= 'nobody' ; # If CGIProxy can be accessed without a path in the URL (e.g. "https://example.com"), # then a censor could check if a site hosts a proxy by merely visiting that URL. # Thus, to help evade censorship, it's better to install this in an undisclosed # subdirectory. $SECRET_PATH is that subdirectory. # If this is left empty here (recommended), a random string will be generated # during initial configuration.
# If using CGI or mod_perl, then this subdirectory will be created when you # run "./nph-proxy.cgi install", CGIProxy will be installed in that directory, # and the path in the proxy URL will contain the value of $SECRET_PATH. # If using FastCGI or the embedded server, the URL of your proxy will be # "https://example.com/secret" (replace "secret" with the value of $SECRET_PATH), # but an actual subdirectory will not be created or needed. # Note that this is not a secret from the users, just from anyone watching # network traffic. Also, it won't be kept secret if your server is insecure. #$SECRET_PATH= '' ; # If you don't have root access on your server, this tells CGIProxy where to # install (and later find) Perl modules using the local::lib module. This # is almost always the "perl5" directory under your home directory. # This can be either an absolute path (beginning with "/") or a relative path. # If it's a relative path, it will be relative to the script owner's home # directory (or on Windows, relative to the current user's home directory). # If this is a relative path, it will be converted to an absolute path before # being saved to the configuration file. # Be sure to follow any instructions about the environment variables after you # run "./nph-proxy.cgi install". # On Windows, this will be stored as an absolute path in cgiproxy.conf ; # otherwise, when nph-proxy.cgi runs, it would not know the owning user. $LOCAL_LIB_DIR= 'perl5' ; # NOTE THAT YOU SHOULD BE RUNNING CGIPROXY ON A SECURE SERVER! # Set this to 1 if the script is running on an SSL server, i.e. it is # accessed through a URL starting with "https:"; set this to 0 if it's not # running on an SSL server. This is needed to know how to route URLs back # through the proxy. Regrettably, standard CGI does not yet provide a way # for scripts to determine this without help. # If this variable is set to '' or left undefined, then the program will # guess: SSL is assumed if SERVER_PORT is not 80. 
This fails when using # an insecure server on a port other than 80, or (less commonly) an SSL server # uses port 80, but usually it works. Besides being a good default, it lets # you install the script where both a secure server and a non-secure server # will serve it, and it will work correctly through either server. # This has nothing to do with retrieving pages that are on SSL servers. $RUNNING_ON_SSL_SERVER= '' ; # If your server doesn't support NPH scripts, then set this variable to true # and try running the script as a normal non-NPH script. # NPH is supported on almost all servers, and it's usually very easy to install # a script as NPH (on Apache, for example, you just need to name the script # something starting with "nph-"). # For this to work, your server MUST support the "Status:" CGI response # header. $NOT_RUNNING_AS_NPH= 0 ; # Set HTTP and SSL proxies if needed. Also see $USE_PASSIVE_FTP_MODE below. # The format of the first two variables is "host:port", with the port being # optional. The format of $NO_PROXY is a comma-separated list of hostnames # or domains: any request for a hostname that ends in one of the strings in # $NO_PROXY will not use the HTTP or SSL proxy; e.g. use ".mycompany.com" to # avoid using the proxies to access any host in the mycompany.com domain. # The environment variables in the examples below are appropriate defaults, # if they are available. Note that earlier versions of this script used # the environment variables directly, instead of the $HTTP_PROXY and # $NO_PROXY variables we use now. # Sometimes you can use the same proxy (like Squid) for both SSL and normal # HTTP, in which case $HTTP_PROXY and $SSL_PROXY will be the same. # $NO_PROXY applies to both SSL and normal HTTP proxying, which is usually # appropriate. If there's demand to differentiate those, it wouldn't be # hard to make a separate $SSL_NO_PROXY option. 
#$HTTP_PROXY= $ENV{'http_proxy'} ; #$SSL_PROXY= 'firewall.example.com:3128' ; #$NO_PROXY= $ENV{'no_proxy'} ; # If your HTTP and SSL proxies require authentication, this script supports # that in a limited way: you can have a single username/password pair per # proxy to authenticate with, regardless of realm. In other words, multiple # realms aren't supported for proxy authentication (though they are for # normal server authentication, elsewhere). # Set $PROXY_AUTH and $SSL_PROXY_AUTH either in the form of "username:password", # or to the actual base64 string that gets sent in the Proxy-Authorization: # header. Often the two variables will be the same, when the same proxy is # used for both SSL and normal HTTP. #$PROXY_AUTH= 'Aladdin:open sesame' ; #$SSL_PROXY_AUTH= $PROXY_AUTH ; # Set SOCKS proxy if needed. The format of $SOCKS_PROXY is "host:port", with # the port being optional (defaults to 1080). You can also set it to just # a port number (with or without a leading ":"), and it will assume localhost. # If your SOCKS proxy supports username/password authentication, then set # $SOCKS_USERNAME and $SOCKS_PASSWORD too. # NOTE THAT THE CONNECTION BETWEEN THIS SCRIPT AND YOUR SOCKS PROXY MUST BE # TRUSTED, BECAUSE CURRENTLY ALL DATA IS SENT IN THE CLEAR BETWEEN THEM! # In particular, the username and password below will be sent in the clear. # The solution would be to use the GSSAPI authentication method, which many # SOCKS proxies do not support, and which CGIProxy doesn't support yet either. # Special case: If @BANNED_NETWORKS below includes "127" (localhost), and # $SOCKS_PROXY is set to something on localhost, then this script will still # allow itself to connect to $SOCKS_PROXY. This is because most uses of # $SOCKS_PROXY are on localhost, where the connection is normally trusted. 
#$SOCKS_PROXY= ':1080' ; #$SOCKS_USERNAME= '' ; #$SOCKS_PASSWORD= '' ; # If you're running CGIProxy such that the Web server that the user sees is different # from the Web server CGIProxy is running on (though maybe on the same machine), # the SERVER_PORT environment variable might not be set to the port that the # user is connecting to, and so all the generated URLs will have the wrong # port in them. In this case, you can set $USER_FACING_PORT to the port number # that *should* be in the URLs, i.e. the port that the user connects to. # For example, this would be useful when the user connects to nginx on a server where # nginx then calls an internal Apache process to run this script (perhaps to take # advantage of mod_perl). In such a case, the SERVER_PORT set by Apache will be # the port used for internal nginx-to-Apache communication, not the port the user # connects to nginx with. In this case, you would set $USER_FACING_PORT to the # outward-facing port that nginx listens on. #$USER_FACING_PORT= 443 ; #---- FastCGI configuration --------------------- # FastCGI is a mechanism that can speed up CGI-like scripts. It's purely # optional and requires some web server configuration as well, and if you # don't use it you can ignore this section. # FastCGI uses a local Internet socket to communicate between the FastCGI client # (e.g. the web server software) and the FastCGI server (e.g. a CGI script that # has been converted to run as a listening daemon, such as CGIProxy). # Set this to a port number for this script to listen on as a FastCGI script. # You'll need to set it in your HTTP server's configuration file too (e.g. in # httpd.conf or nginx.conf). For details of that, see # http://www.jmarshall.com/tools/cgiproxy/install.html#fastcgi # This used to use a "Unix-domain socket" instead of an Internet socket, but # there was trouble with the FCGI module and Unix-domain sockets, so as of # CGIProxy 2.1.14 we use an Internet socket. 
# Note that this no longer requires a ":" at the start, though that is allowed. $FCGI_SOCKET= 8002 ; # FastCGI uses multiple processes to listen on its socket, where each # process can handle one request at a time. This is a performance tuning # parameter, so the optimal number depends on your server environment # (hardware and software). # If you don't understand this, the default should be fine. You can experiment # with different numbers if performance is an issue. # This can be overridden with the "-n" command-line parameter. $FCGI_NUM_PROCESSES= 100 ; # As a FastCGI process gets used for many requests, it slowly takes more and # more memory, due to the copy-on-write behavior of forked processes. Thus, # it's cleaner if you kill a process and restart a fresh one after it handles # some number of requests. This is a performance tuning parameter, so the # optimal number depends on your server environment (hardware and software). # If you don't understand this, the default should be fine. You can experiment # with different numbers if performance is an issue. # This can be overridden with the "-m" command-line parameter. $FCGI_MAX_REQUESTS_PER_PROCESS= 1000 ; #---- End of FastCGI configuration -------------- #---- Embedded server configuration ------------- # For the embedded server, you need to a) put a certificate and private key, # in PEM format, into the $PROXY_DIR directory, and b) set these two # variables to the two file names. (A "certificate" is the same thing as # a public key.) # You can either pay a certificate authority for a key pair, or you can # generate your own "self-signed" key pair. The disadvantage of using a # self-signed key pair is that your users will see a browser warning about # an untrusted certificate. This is all true of any secure server. $CERTIFICATE_FILE= 'plain-cert.pem' ; $PRIVATE_KEY_FILE= 'plain-key.pem' ; # It's important to use $SECRET_PATH, but you can require a username and # password too. 
All users must login with whatever you set below, using # HTTP Basic authentication. Leave these commented out to disable # password protection. # This is very simple right now. In the future there will likely be # more authentication methods, including support for multiple users. #$EMB_USERNAME= 'free' ; #$EMB_PASSWORD= 'speech' ; #---- End of embedded server configuration ------ #---- Database configuration -------------------- # Database use is optional, and if you don't use one you can ignore this # section. But if you're getting "Bad Request" errors, you can fix it # by using a database; also, see the $USE_DB_FOR_COOKIES option below. # Database use is optional. It's most efficient when this script is running # under mod_perl or FastCGI. # The easiest database to use is SQLite. While normal database engines like # MySQL/MariaDB or Oracle require a constantly running process and some # configuration by the system administrator, SQLite requires none of this-- # it reads and writes directly to database files in your own directory, as # protected by the operating system permissions. Because of its ease of # configuration, SQLite is the default database here. # If you're using a database other than SQLite, create a database user account # for this program to use, or ask your database administrator to do it. Set # $DB_USER and $DB_PASS to the username and password, below. This program # will try to create the required database, named $DB_NAME as set below, but # if your DBA isn't willing to grant the permission to create databases to # the CGIProxy user, then you or the DBA will need to create the database. # This can be done with the SQL command "CREATE DATABASE cgiproxy;" (or # whatever you set $DB_NAME to below). # # If you are using a database of any kind, it must be purged periodically. In # Unix or Mac, do this with a cron job. In Windows, use the Task Scheduler. # In Unix or Mac, the command to purge the database is # "/path/to/script/nph-proxy.cgi purge-db". 
(Replace "/path/to/script/" # with the actual path to the script.) Edit your crontab with "crontab -e", # and add a line like: # "0 * * * * /path/to/script/nph-proxy.cgi purge-db" (without quotes) # to purge the database at the top of every hour, or: # "0 2 * * * /path/to/script/nph-proxy.cgi purge-db" (without quotes) # to purge it every night at 2:00am. # This is the name of the "database driver" for the database software you're using. # Currently supported values are "SQLite", "MySQL" and "Oracle". # The default of "SQLite" is the easiest to use. SQLite lets you have database # functionality by directly reading and writing a database file, without requiring # a full database engine like MySQL/MariaDB or Oracle to run on your server. # Note that it is potentially insecure to use a database if there are other # untrusted people with accounts on the same server, especially if they can read # this script file and the database password below. The easiest way to securely # use a database is to have your own server with no untrusted user having shell # access on it. If this isn't practical, then you need to set file permissions # appropriately on both this script file and any SQLite database file: set # permissions (and file ownership and group ownership) on both files to be # accessible by the web server's userID, but not accessible by anyone else on # the same server. Note that running this on a virtual private server isn't # insecure in this way-- even though a VPS is a shared machine, other people # can't see your files (except the sysadmin). # Set this to "" or comment it out to not use a database. Note that you will # probably see "Bad Request" errors when you accumulate too many cookies; using # a database solves this problem, or you can periodically clear your cookies. 
$DB_DRIVER= 'SQLite' ; # If your database (other than SQLite) is running on a remote server, or on a # non-default port, set this to "dbserver:port", where dbserver is the name # or IP address of your database server, and port is the port it is listening # on. If dbserver is empty (as in ":port"), then it defaults to localhost; # if port is empty (as in "dbserver:" or just "dbserver"), then it defaults # to 3306 for MySQL, or 1521 for Oracle. #$DB_SERVER= "localhost:3306" ; # CGIProxy creates (if possible) and uses its own database. If you want to name # the database something else, change this value. If you need a database # administrator to create the database, tell them this database name. # This value must only contain letters, numbers, and the "_" character. $DB_NAME= 'cgiproxy' ; # These are the username and password of the database account, as described above. # If you're using SQLite, you don't need to set these-- access to the SQLite # database files is controlled by the permissions of the filesystem. #$DB_USER= 'proxy' ; #$DB_PASS= '' ; # If set, then use the server-side database to store cookies. This gets around # the problem of too many total cookies causing "Bad Request" errors. # Set this to 1 to use the database (if it's configured), or to 0 to NOT use # the database. $USE_DB_FOR_COOKIES= 1 ; #---- End of database configuration ------------- # This is the default language to use for all CGIProxy messages, until the user # clicks on a flag in the start form. $DEFAULT_LANG= 'en' ; # If set, then proxy traffic will be restricted to text data only, to save # bandwidth (though it can still be circumvented with uuencode, etc.). # To replace images with a 1x1 transparent GIF, set $RETURN_EMPTY_GIF below. $TEXT_ONLY= 0 ; # set to 1 to allow only text data, 0 to allow all # If set, then prevent all cookies from passing through the proxy. To allow # cookies from some servers, set this to 0 and see @ALLOWED_COOKIE_SERVERS # and @BANNED_COOKIE_SERVERS below. 
You can also prevent cookies with # images by setting $NO_COOKIE_WITH_IMAGE below. # Note that this only affects cookies from the target server. The proxy # script sends its own cookies for other reasons too, like to support # authentication. This flag does not stop these cookies from being sent. $REMOVE_COOKIES= 0 ; # If set, then remove as much scripting as possible. If anonymity is # important, this is strongly recommended! Better yet, turn off script # support in your browser. # On the HTTP level: # . prevent transmission of script MIME types (which only works if the server # marks them as such, so a malicious server could get around this, but # then the browser probably wouldn't execute the script). # . remove Link: headers that link to a resource of a script MIME type. # Within HTML resources: # . remove <script>...</script> . # . remove intrinsic event attributes from tags, i.e. attributes whose names # begin with "on". # . remove <style>...</style> where "type" attribute is a script MIME type. # . remove various HTML tags that appear to link to a script MIME type. # . remove script macros (aka Netscape-specific "JavaScript entities"), # i.e. any attributes containing the string "&{" . # . remove "JavaScript conditional comments". # . remove MSIE-specific "dynamic properties". # To allow scripts from some sites but not from others, set this to 0 and # see @ALLOWED_SCRIPT_SERVERS and @BANNED_SCRIPT_SERVERS below. # See @SCRIPT_MIME_TYPES below for a list of which MIME types are filtered out. # I do NOT know for certain that this removes all script content! It removes # all that I know of, but I don't have a definitive list of places scripts # can exist. If you do, please send it to me. EVEN RUNNING A SINGLE # JAVASCRIPT STATEMENT CAN COMPROMISE YOUR ANONYMITY! Just so you know. # Richard Smith has a good test site for anonymizing proxies, at # http://users.rcn.com/rms2000/anon/test.htm # Note that turning this on removes most popup ads!
:) $REMOVE_SCRIPTS= 0 ; # If set, then filter out images that match one of @BANNED_IMAGE_URL_PATTERNS, # below. Also removes cookies attached to images, as if $NO_COOKIE_WITH_IMAGE # is set. # To remove most popup advertisements, also set $REMOVE_SCRIPTS=1 above. $FILTER_ADS= 0 ; # If set, then don't send a Referer: [sic] header with each request # (i.e. something that tells the server which page you're coming from # that linked to it). This is a minor privacy issue, but a few sites # won't send you pages or images if the Referer: is not what they're # expecting. If a page is loading without images or a link seems to be # refused, then try turning this off, and a correct Referer: header will # be sent. # This is only a problem in a VERY small percentage of sites, so few that # I'm kinda hesitant to put this in the entry form. Other arrangements # have their own problems, though. $HIDE_REFERER= 0 ; # If set, insert a compact version of the URL entry form at the top of each # page. This will also display the URL currently being viewed. # When viewing a page with frames, then a new top frame is created and the # insertion goes there. # If you want to customize the appearance of the form, modify the routine # mini_start_form() near the end of the script. # If you want to insert something other than this form, see $INSERT_HTML and # $INSERT_FILE below. # Users should realize that options changed via the form only take effect when # the form is submitted by entering a new URL or pressing the "Go" button. # Selecting an option, then following a link on the page, will not cause # the option to take effect. # Users should also realize that anything inserted into a page may throw # off any precise layout. The insertion will also be subject to # background colors and images, and any other page-wide settings. $INSERT_ENTRY_FORM= 1 ; # If set, then allow the user to control $REMOVE_COOKIES, $REMOVE_SCRIPTS, # $FILTER_ADS, $HIDE_REFERER, and $INSERT_ENTRY_FORM.
Note that they # can't fine-tune any related options, such as the various @ALLOWED... and # @BANNED... lists. $ALLOW_USER_CONFIG= 1 ; # Use @ALLOWED_SERVERS and @BANNED_SERVERS to restrict which servers a user # can visit through this proxy. Any URL at a host matching a pattern in # @BANNED_SERVERS will be forbidden. In addition, if @ALLOWED_SERVERS is # not empty, then access is allowed *only* to servers that match a pattern # in it. In other words, @BANNED_SERVERS means "ban these servers", and # @ALLOWED_SERVERS (if not empty) means "allow only these servers". If a # server matches both lists, it is banned. # These are each a list of Perl 5 regular expressions (aka patterns or # regexes), not literal host names. To turn a hostname into a pattern, # replace every "." with "\.", add "^" to the beginning, and add "$" to the # end. For example, 'www.example.com' becomes '^www\.example\.com$'. To # match *every* host ending in something, leave out the "^". For example, # '\.example\.com$' matches every host ending in ".example.com". For more # details about Perl regular expressions, see the Perl documentation. (They # may seem cryptic at first, but they're very powerful once you know how to # use them.) # Note: Use single quotes around each pattern, not double quotes, unless you # understand the difference between the two in Perl. Otherwise, characters # like "$" and "\" may not be handled the way you expect. @ALLOWED_SERVERS= () ; @BANNED_SERVERS= () ; # If @BANNED_NETWORKS is set, then forbid access to these hosts or networks. # This is done by IP address, not name, so it provides more certain security # than @BANNED_SERVERS above. # Specify each element as a decimal IP address-- all four integers for a host, # or one to three integers for a network. For example, '127.0.0.1' bans # access to the local host, and '192.168' bans access to all IP addresses # in the 192.168 network. Sorry, no banning yet for subnets other than # 8, 16, or 24 bits.
# IF YOU'RE RUNNING THIS ON OR INSIDE A FIREWALL, THIS SETTING IS STRONGLY
# RECOMMENDED!! In particular, you should ban access to other machines
# inside the firewall that the firewall machine itself may have access to.
# Otherwise, external users will be able to access any internal hosts that
# the firewall can access. Even if that's what you intend, you should ban
# access to any hosts that you don't explicitly want to expose to outside
# users.
# In addition to the recommended defaults below, add all IP addresses of your
# server machine if you want to protect it like this.
# Special case: If @BANNED_NETWORKS below includes "127" (localhost), and
# $SOCKS_PROXY is set to something on localhost, then this script will still
# allow itself to connect to $SOCKS_PROXY. This is because most uses of
# $SOCKS_PROXY are on localhost, where the connection is normally trusted.
# After you set this, YOU SHOULD TEST to verify that the proxy can't access
# the IP addresses you're banning!
# NOTE: According to RFC 1918, network address ranges reserved for private
# networks are 10.x.x.x, 192.168.x.x, and 172.16.x.x-172.31.x.x, i.e. with
# respective subnet masks of 8, 16, and 12 bits. Since we can't currently
# do a 12-bit mask, we exclude the 16 172.x subnets this applies to--
# inefficient, but works for now, until we support subnet masks.
# Also included are 169.254.x.x (link-local, per RFC 3927) and 224.0.0.x (the
# local-network multicast block used by routing protocols, per RFC 5771), as
# recommended by Waldo Jaquith.
# '244.0.0' is also kept in the list: earlier versions of this default list
# had it where '224.0.0' was apparently intended. 244.x.x.x is reserved
# address space, so continuing to ban it is harmless and keeps the list
# backward-compatible with existing installations.
# On some systems, 127.x.x.x all point to localhost, so disallow all of "127".
# This feature is simple now but may be more complete in future releases.
# How would you like this to be extended? What would be useful to you?
@BANNED_NETWORKS= ('127', '192.168', '10', '169.254', '224.0.0', '244.0.0',
                   '172.16', '172.17', '172.18', '172.19',
                   '172.20', '172.21', '172.22', '172.23',
                   '172.24', '172.25', '172.26', '172.27',
                   '172.28', '172.29', '172.30', '172.31' ) ;

# Settings to fine-tune cookie filtering, if cookies are not banned altogether
# (by user checkbox or $REMOVE_COOKIES above).
# Use @ALLOWED_COOKIE_SERVERS and @BANNED_COOKIE_SERVERS to restrict which
# servers can send cookies through this proxy. They work like
# @ALLOWED_SERVERS and @BANNED_SERVERS above, both in how their precedence
# works, and that they're lists of Perl 5 regular expressions. See the
# comments there for details.

# If non-empty, only allow cookies from servers matching one of these patterns.
# Comment this out to allow all cookies (subject to @BANNED_COOKIE_SERVERS).
#@ALLOWED_COOKIE_SERVERS= ('\bslashdot\.org$') ;

# Reject cookies from servers matching these patterns.
@BANNED_COOKIE_SERVERS= (
    '\.doubleclick\.net$',
    '\.preferences\.com$',
    '\.imgis\.com$',
    '\.adforce\.com$',
    '\.focalink\.com$',
    '\.flycast\.com$',
    '\.avenuea\.com$',
    '\.linkexchange\.com$',
    '\.pathfinder\.com$',
    '\.burstnet\.com$',
    '\btripod\.com$',
    '\bgeocities\.yahoo\.com$',
    '\.mediaplex\.com$',
    ) ;

# Settings to fine-tune script filtering, if scripts are not banned altogether
# (by user checkbox or $REMOVE_SCRIPTS above).
# Use @ALLOWED_SCRIPT_SERVERS and @BANNED_SCRIPT_SERVERS to restrict which
# servers you'll allow scripts from. They work like @ALLOWED_SERVERS and
# @BANNED_SERVERS above, both in how their precedence works, and that
# they're lists of Perl 5 regular expressions. See the comments there for
# details.
@ALLOWED_SCRIPT_SERVERS= () ;
@BANNED_SCRIPT_SERVERS= () ;

# Various options to help filter ads and stop cookie-based privacy invasion.
# These are only effective if $FILTER_ADS is set above.
# @BANNED_IMAGE_URL_PATTERNS uses Perl patterns.
If an image's URL # matches one of the patterns, it will not be downloaded (typically for # ad-filtering). For more information on Perl regular expressions, see # the Perl documentation. # Note that most popup ads will be removed if scripts are removed (see # $REMOVE_SCRIPTS above). # If ad-filtering is your primary motive, consider using one of the many # proxies that specialize in that. The classic is from JunkBusters, at # http://www.junkbusters.com . # Reject images whose URL matches any of these patterns. This is just a # sample list; add more depending on which sites you visit. @BANNED_IMAGE_URL_PATTERNS= ( 'ad\.doubleclick\.net/ad/', '\b[a-z](\d+)?\.doubleclick\.net(:\d*)?/', '\.imgis\.com\b', '\.adforce\.com\b', '\.avenuea\.com\b', '\.go\.com(:\d*)?/ad/', '\.eimg\.com\b', '\bexcite\.netscape\.com(:\d*)?/.*/promo/', '/excitenetscapepromos/', '\.yimg\.com(:\d*)?.*/promo/', '\bus\.yimg\.com/[a-z]/(\w\w)/\1', '\bus\.yimg\.com/[a-z]/\d-/', '\bpromotions\.yahoo\.com(:\d*)?/promotions/', '\bcnn\.com(:\d*)?/ads/', 'ads\.msn\.com\b', '\blinkexchange\.com\b', '\badknowledge\.com\b', '/SmartBanner/', '\bdeja\.com/ads/', '\bimage\.pathfinder\.com/sponsors', 'ads\.tripod\.com', 'ar\.atwola\.com/image/', '\brealcities\.com/ads/', '\bnytimes\.com/ad[sx]/', '\busatoday\.com/sponsors/', '\busatoday\.com/RealMedia/ads/', '\bmsads\.net/ads/', '\bmediaplex\.com/ads/', '\batdmt\.com/[a-z]/', '\bview\.atdmt\.com/', '\bADSAdClient31\.dll\b', ) ; # If set, replace banned images with 1x1 transparent GIF. This also replaces # all images with the same if $TEXT_ONLY is set. # Note that setting this makes the response a little slower, since the browser # must still retrieve the empty GIF. $RETURN_EMPTY_GIF= 0 ; # Set this to reject cookies returned with images. This actually prevents # cookies returned with any non-text resource.
# This helps prevent tracking by ad networks, but there are also some # legitimate uses of attaching cookies to images, such as captcha, so # by default this is off. $NO_COOKIE_WITH_IMAGE= 0 ; # Normally, if a user tries to access a banned server or use an unsupported # scheme (protocol), this script will alert the user with a warning page, and # either allow the user to click through to the URL unprotected (i.e. without # using the proxy), or ban access altogether. However, in some VPN-like # installations, it may be more desirable to let users follow links from # protected pages (e.g. within an intranet) that lead to unprotected, # unproxified pages (e.g. pages outside of the intranet), with no breaks in # the browsing experience. (This example assumes the proxy owner intends it # to be used for browsing only the intranet and not the Internet at large.) # Set $QUIETLY_EXIT_PROXY_SESSION to skip any warning message and let the # user surf directly to unproxified pages from proxified pages. Note that # this somewhat changes the meaning of @ALLOWED_SERVERS and @BANNED_SERVERS-- # they're not allowed or banned per se, it's just whether this proxy is # willing to handle their traffic. @BANNED_NETWORKS is unaffected, however, # since the IP ranges it contains often make no sense outside of the LAN. # WARNING: DO *NOT* SET THIS FLAG IF ANONYMITY IS IMPORTANT AT ALL!!! IT IS # NOT MEANT FOR THAT KIND OF INSTALLATION. IF THIS IS SET, THEN USERS WILL # SURF INTO UNPROXIFIED, UNANONYMIZED PAGES WITH NO WARNING, AND THEIR # PRIVACY WILL BE COMPROMISED; THEY MAY NOT EVEN NOTICE FOR A LONG TIME. # THIS IS EXACTLY WHAT ANONYMIZING PROXIES ARE CREATED TO AVOID. $QUIETLY_EXIT_PROXY_SESSION= 0 ; # If set, then modify script content (like JavaScript) as well as possible # such that network accesses go through this proxy script. If not set, then # allow script content to pass unmodified, assuming it's not being removed.
# Currently, JavaScript is the only script content that's proxified. # If this is set, and if you modify proxy_encode() and proxy_decode(), then # you MUST modify the JavaScript routines in $ENCODE_DECODE_BLOCK_IN_JS also. # NOTE: This proxification of script content may not be perfect. It's pretty # good, but it may be possible to construct malicious JavaScript that reveals # your identity to the server. The purpose of this feature is more to allow # scripts to function through the proxy, than to provide bulletproof # anonymity. # The best advice remains: FOR BEST ANONYMITY, BROWSE WITH SCRIPTS TURNED OFF. $PROXIFY_SCRIPTS= 1 ; # If set, then modify ShockWave Flash resources as well as possible such that # network accesses go through this proxy script. If not set, then allow # SWF resources to pass unmodified. # NOTE: This is still experimental, and the modified SWF apps are sometimes # much slower than the unproxified SWF apps. If this is turned on, then # Web pages with SWF may run much more slowly and possibly bog down # your browser, even if the rest of the page is fast. Remember that SWF # apps are pretty common in ads and other places in the page that we tend # to ignore. $PROXIFY_SWF= 1 ; # If this is set, then the URL the user enters in the start form or the top # form will be encoded by _proxy_jslib_proxy_encode() before it's submitted. # This can keep the URL the user visits private. # Note that if you set this, you need to modify proxy_encode() above (along # with proxy_decode() and the two analogous JavaScript routines) if you # want the URL to actually be encoded to something non-obvious. $ENCODE_URL_INPUT= 1 ; # To use an external program to decide whether or not a user at a given IP # address may use this proxy (as opposed to using server configuration), set # $USER_IP_ADDRESS_TEST to either the name of a command-line program that # performs this test, or a queryable URL that performs this test (e.g. a CGI # script). 
# For a command-line program: The program should take a single argument, the # IP address of the user. The output of the program is evaluated as a # number, and if the number is non-zero then the IP address of the user is # allowed; thus, the output is typically either "1" or "0". Note that # depending on $ENV{PATH}, you may need to enter the path here explicitly. # For a queryable URL: Specify the start of the URL here (must begin with # "http://"), and the user's IP address will be appended. For example, the # value here may contain a "?", thus putting the IP address in the # QUERY_STRING; it could also be in PATH_INFO. The response body from the # URL should be a number like for a command line program, above. $USER_IP_ADDRESS_TEST= '' ; # To use an external program to decide whether or not a destination server is # allowed (as opposed to using @ALLOWED_SERVERS and @BANNED_SERVERS above), # set $DESTINATION_SERVER_TEST to either the name of a command-line program # that performs this test, or a queryable URL that performs this test (e.g. a # CGI script). # For a command-line program: The program should take a single argument, the # destination server's name or IP address (depending on how the user enters # it). The output of the program is evaluated as a number, and if the number # is non-zero then the destination server is allowed; thus, the output is # typically either "1" or "0". Note that depending on $ENV{PATH}, you may # need to enter the path here explicitly. # For a queryable URL: Specify the start of the URL here (must begin with # "http://"), and the destination server's name or IP address will be # appended. For example, the value here may contain a "?", thus putting the # name or address in the QUERY_STRING; it could also be in PATH_INFO. The # response body from the URL should be a number like for a command line # program, above. 
$DESTINATION_SERVER_TEST= '' ; # This is one way to handle pages that don't work well, by redirecting to other working # versions of the pages (for example, to a mobile version or another version that # doesn't have much JavaScript). How it works: If the current domain matches one # of the keys of %REDIRECTS, then s/// (string substitution) is done on the URL, # using the match and replacement patterns in the 2-element value array. # The set of sites handled this way is Facebook and Gmail, since they don't # always work well, or are slow, through CGIProxy. If you want to access # them normally, then comment out or remove the line(s) below for that site. # If you want to redirect more sites, you can add records to the %REDIRECTS # hash in the following way: Set the hash key to the name of the server you # want to redirect, and the value to a reference to a 2-element array containing # the left and right sides of an s/// string substitution. If that doesn't make # sense, then try to emulate an example below. # As of version 2.1.7, the full facebook.com site works pretty well, so the # redirection below has been commented out. # ... aaaand, as of version 2.1.8, the full Gmail site works pretty well, so the # redirection below has been commented out. # To improve performance with facebook or other JS-busy sites, users can: # - close other browser windows # - end other CPU-heavy processes on their browsing machine # - reload the page or restart the browser when it gets too slow # - use a browser other than MSIE (it has the most problems) # If Gmail or facebook is still too slow or crashes a lot, you can remove the # leading "#" on the appropriate lines below to automatically redirect to # Gmail's HTML-only site or facebook's mobile site, which may work better.
%REDIRECTS= ( # 'www.facebook.com' => [qr#^https?://www\.facebook\.com#i, 'https://m.facebook.com'], # 'mail.google.com' => [qr#^https?://mail\.google\.com/.*shva=\w*1.*$#i, 'https://mail.google.com/?ui=html'] ) ; # If either $INSERT_HTML or $INSERT_FILE is set, then that HTML text or the # contents of that named file (respectively) will be inserted into any HTML # page retrieved through this proxy. $INSERT_HTML takes precedence over # $INSERT_FILE. $INSERT_FILE is assumed to have contents in UTF-8. # When viewing a page with frames, a new top frame is created and the # insertions go there. # NOTE: Any HTML you insert should not have relative URLs in it! The problem # is that there is no appropriate base URL to resolve them with. So only use # absolute URLs in your insertion. (If you use relative URLs anyway, then # a) if $ANONYMIZE_INSERTION is set, they'll be resolved relative to this # script's URL, which isn't great, or b) if $ANONYMIZE_INSERTION==0, # they'll be unchanged and the browser will simply resolve them relative # to the current page, which is usually worse.) # The frame handling means that it's fairly easy for a surfer to bypass this # insertion, by pretending in effect to be in a frame. There's not much we # can do about that, since a page is retrieved the same way regardless of # whether it's in a frame. This script uses a parameter in the URL to # communicate to itself between calls, but the user can merely change that # URL to make the script think it's retrieving a page for a frame. Also, # many browsers let the user expand a frame's contents into a full window. # [The warning in earlier versions about setting $INSERT_HTML to '' when using # mod_perl and $INSERT_FILE no longer applies. It's all handled elsewhere.] # As with $INSERT_ENTRY_FORM, note that any insertion may throw off any # precise layout, and the insertion is subject to background colors and # other page-wide settings. #$INSERT_HTML= "

<h1>This is an inserted header</h1><hr>


" ; #$INSERT_FILE= 'insert_file_name' ; # If your insertion has links that you don't want anonymized along with the rest # of the downloaded HTML, then set this to 0. Otherwise leave it at 1. $ANONYMIZE_INSERTION= 1 ; # If there's both a URL entry form and an insertion via $INSERT_HTML or # $INSERT_FILE on the same page, the entry form normally goes at the top. # Set this to put it after the other insertion. $FORM_AFTER_INSERTION= 0 ; # Here's an experimental feature that may or may not be useful. It's trivial # to add, so I added it. It was inspired in part by Mike Reiter's and Avi # Rubin's "Crowds", at http://www.research.att.com/projects/crowds/ . # Let me know if you find a use for it. # The idea is that you have a number of mutually-trusting, cooperating # proxies that you list in @PROXY_GROUP(). If that is set, then instead # of rerouting all URLs back through this proxy, the script will choose # one of these proxies at random to reroute all URLs through, for each # run. This could be used to balance the load among several proxies, for # example. Under certain conditions it could conceivably help privacy by # making it harder to track a user's session, but under certain other # conditions it could make it easier, depending on how many people, # proxies, and proxy servers are involved. For each page, both its # included images and followed links will go through the same proxy, so a # clever target server could determine which proxy servers are in each # group. # proxy_encode() and proxy_decode() must be the same for all proxies in the # group. Same goes for pack_flags() and unpack_flags() if you modified them, # and probably certain other routines and configuration options. # Cookies and Basic authentication can't be supported with this, sorry, since # cookies can only be sent back to the proxy that created them. # Set this to a list of absolute URLs of proxies, ending with "nph-proxy.cgi" # (or whatever you named the script). 
Be sure to include the URL of this # proxy, or it will never redirect back through here. Each proxy in the # group should have the same @PROXY_GROUP. # Alternately, you could set each proxy's @PROXY_GROUP differently for more # creative configuration, such as to balance the load unevenly, or to send # users through a "round-robin" cycle of proxies. #@PROXY_GROUP= ('http://www.example.com/~grommit/proxy/nph-proxy.cgi', # 'http://www.fnord.mil/langley/bavaria/atlantis/nph-proxy.cgi', # 'http://www.nothinghere.gov/No/Such/Agency/nph-proxy.cgi', # ) ; # Normally, each cookie includes an expiration time/date, and the cookie stays # in effect until then, even after you exit your browser and restart it # (which normally means the cookie is stored on the hard drive). Any cookie # that has no explicit expiration date is a "session cookie", and stays in # effect only as long as the browser is running, and presumably is forgotten # after that. If you set $SESSION_COOKIES_ONLY=1, then *all* cookies that # pass through this proxy will be changed to session cookies. This is useful # at a public terminal, or wherever you don't want your cookies to remain # after you exit the browser. # NOTE: The clock on the server where this runs must be correct for this # option to work right! It doesn't have to be exact, but don't have it off # by hours or anything like that. The problem is that we must not alter any # cookies set to expire in the past, because that's how sites delete cookies. # If a cookie is being deleted, we DON'T want to turn it into a session # cookie. So this script will not alter any cookies set to expire before the # current time according to the system clock. $SESSION_COOKIES_ONLY= 0 ; # Normally, your browser stores all pages you download in your computer's # hard drive and memory, in the "cache". This saves a lot of time and # bandwidth the next time you view the page (especially with images, which # are bigger and may be shared among several pages). 
However, in some # situations you may not want the pages you've visited to be stored. If # $MINIMIZE_CACHING is set, then this proxy will try its best to prevent any # caching of anything retrieved through it. # NOTE: This cannot guarantee that no caching will happen. All we can do is # instruct the browser not to cache anything. A faulty or malicious browser # could cache things anyway if it chose to. # NOTE: This has nothing to do with your browser's "history list", which may # also store a list of URLs you've visited. # NOTE: If you use this, you will use a lot more bandwidth than without it, # and pages will seemingly load slower, because if a browser can't cache # anything locally then it has to load everything across the network every # time it needs something. $MINIMIZE_CACHING= 0 ; # Set $USER_AGENT to something generic like this if you want to be extra # careful. Conceivably, revealing which browser you're using may be a # slight privacy or security risk. # However, note that some URLs serve different pages depending on which # browser you're using, so some pages will change if you set this. # This defaults to the user's HTTP_USER_AGENT. #$USER_AGENT= 'Mozilla/4.05 [en] (X11; I; Linux 2.0.34 i586)' ; # When handling HTML resources, CGIProxy downloads the entire resource before # modifying it and returning it to the client. However, some operations # (such as time-intensive queries) return the first part of a page while # still generating the last part. On such pages, the user might like to # see that first part without waiting for the entire response, which they # would normally have to do when using CGIProxy. So, if this option is set, # then CGIProxy will return proxified HTML parts as soon as it receives them # from the server. This is less efficient; for example, it means that every # page will have the JavaScript library inserted, even if it's not needed # (though that wouldn't be too bad since the library is normally cached # anyway). 
So, we want to do this only for certain pages and not for all. # Thus, set this to a list of patterns that match URLs you want to handle # this way. The patterns work like @ALLOWED_SERVERS and @BANNED_SERVERS # above, in that they're lists of Perl 5 regular expressions. See the # comments there for details. # The sample webfeat.org pattern is appropriate for libraries who use the # WebFeat service. #@TRANSMIT_HTML_IN_PARTS_URLS= ( # '^https?://search3\.webfeat\.org/cgi-bin/WebFeat\.dll', # ) ; # FTP transfers can happen in either passive or non-passive mode. Passive # mode works better if the client (this script) is behind a firewall. Some # people consider passive mode to be more secure, too. But in certain # network configurations, if this script has trouble connecting to FTP # servers, you can turn this off to try non-passive mode. # See http://cr.yp.to/ftp/security.html for a discussion of security issues # regarding passive and non-passive FTP. $USE_PASSIVE_FTP_MODE= 1 ; # Unlike a normal browser which can keep an FTP session open between requests, # this script must make a new connection with each request. Thus, the # FTP welcome message (e.g. the README file) will be received every time; # there's no way for this script to know if you've been here before. Set # $SHOW_FTP_WELCOME to true to always show the welcome message, or false # to never show it. $SHOW_FTP_WELCOME= 1 ; # Comments may contain HTML in them, which shouldn't be rendered but may be # relevant in some other way. Set this flag if you want the contents of # comments to be proxified like the rest of the page, i.e. proxify URLs, # stylesheets, scripts, etc. $PROXIFY_COMMENTS= 0 ; # Apparently, some censoring filters search outgoing request URIs, but not # POST request bodies. Set this to make the initial input form submit # using POST instead of GET. $USE_POST_ON_START= 1 ; # Apparently, some censoring filters look at titles on HTML pages. Set this # to remove HTML page titles. 
# Note that this does NOT remove titles that are generated by script content, # since those would have no effect on a filter. $REMOVE_TITLES= 0 ; # If set, this option prevents a user from calling the proxy through the # proxy itself, i.e. looping. It's normally a mistake on the user's part, # and a waste of resources. # This isn't foolproof; it just catches the obvious mistakes. It's probably # pretty easy for a malicious user to make the script call itself, or s/he # can always use two proxies to call each other in a loop. This doesn't # account for IP addresses or multiple hostnames for the same server. $NO_BROWSE_THROUGH_SELF= 0 ; # Set this to leave out the "Restart" link at the bottom of error pages, etc. # In some situations this could make it harder for search engines to find the # start page. $NO_LINK_TO_START= 0 ; # For the obscure case when a POST must be repeated because of user # authentication, this is the max size of the request body that this # script will store locally. If CONTENT_LENGTH is bigger than this, # the body's not saved at all-- the first POST will be correct, but # the second will not happen at all (since a partial POST is worse than # nothing). $MAX_REQUEST_SIZE= 16777216 ; # that's 16 Meg to you and me # Though JavaScript is by far the most common kind of script, there are other # kinds too, such as Microsoft's VBScript. This program proxifies JavaScript # content, but not other script content, which means those other scripts # could open privacy holes. Thus, the default behavior of this program is # to remove those other scripts. Set this variable to true if you'd rather # let those scripts through. # How this works with $REMOVE_SCRIPTS and the "remove scripts" user checkbox: # If $ALLOW_UNPROXIFIED_SCRIPTS is false, then unsupported scripts will # always be removed. If it is true, then it is subject to those other # settings, just like supported script types are. 
# For now, this also controls whether unproxified SWF (Flash) apps are allowed # through the proxy. This means that by default, SWF apps are removed # from pages. This is the safest, but may leave some pages looking # incomplete. If you want to display SWF apps, then you need to set either # $PROXIFY_SWF or $ALLOW_UNPROXIFIED_SCRIPTS . This arrangement can change # if there is demand. $ALLOW_UNPROXIFIED_SCRIPTS= 0 ; # Cookies have a URL path associated with them; it determines which URLs on a # server will receive the cookie in requests. If the path is not specified # when the cookie is created, then the path is supposed to default to the # path of the URL that the cookie was retrieved with, according to the # cookie specification from Netscape. Unfortunately, most browsers seem # to ignore the spec and instead give cookies a default path of "/", i.e. # "send this cookie with all requests to this server". So, *sigh*, this # script uses "/" as the default path also. If you want this script to # follow the specification instead, then set this variable to true. $COOKIE_PATH_FOLLOWS_SPEC= 0 ; # Technically, cookies must have a domain containing at least two dots if the # TLD is one of the main non-national TLD's (.com, .net, etc.), and three # dots otherwise. This is to prevent malicious servers from setting cookies # for e.g. the entire ".co.uk" domain. Unfortunately, this prescribed # behavior does not accommodate domains like ".google.de". Thus, browsers # seem to not require three dots, and thus, this script will do the same by # default. Set $RESPECT_THREE_DOT_RULE if you want the strictly correct # behavior instead. $RESPECT_THREE_DOT_RULE= 0 ; # Content Security Policy (CSP) is indicated by the Content-Security-Policy: # HTTP response header, which CGIProxy has both used and supported since # version 2.1.9 . Normally, any attempted violation of it is reported only # in the JavaScript console, i.e. invisible to most users. 
If you want to # show a message when a violation happens (e.g. when testing), set this to # true. $ALERT_ON_CSP_VIOLATION= 0 ; # Some JavaScript-busy sites crash when visiting them through CGIProxy. Increasing # the delay times in Window.setTimeout() and Window.setInterval() makes them not # crash as much, but it also makes certain page actions slower. You can set # %TIMEOUT_MULTIPLIER_BY_HOST for each problematic server, and those timeout # functions on those sites will have their delays multiplied by that amount. For # example, pages on www.facebook.com will have their delay times multiplied by 10 # by default. # Any sites not listed here will not have their delay times changed. %TIMEOUT_MULTIPLIER_BY_HOST= ( 'www.facebook.com' => 10, ) ; # To support video in Flash 9+, this program spawns a specialized RTMP proxy # daemon that listens on a port (1935 if possible) and dies after 10 minutes # of no connections. This is useful, but some sysadmins may not like it. # If you want to prevent the daemon, set $ALLOW_RTMP_PROXY=0 . Note that # Flash 9+ video won't always work if you do so. # As of release 2.1, the RTMP proxy isn't used yet, so turn it off. $ALLOW_RTMP_PROXY= 0 ; } # sub config # WARNING: # EXCEPT UNDER RARE CIRCUMSTANCES, ANY PROXY WHICH HANDLES SSL REQUESTS # SHOULD *ONLY* RUN ON AN SSL SERVER!!! OTHERWISE, YOU'RE RETRIEVING # PROTECTED PAGES BUT SENDING THEM BACK TO THE USER UNPROTECTED. THIS # COULD EXPOSE ANY INFORMATION IN THOSE PAGES, OR ANY INFORMATION THE # USER SUBMITS TO A SECURE SERVER. THIS COULD HAVE SERIOUS CONSEQUENCES, # EVEN LEGAL CONSEQUENCES. IT UNDERMINES THE WHOLE PURPOSE OF SECURE # SERVERS. # THE *ONLY* EXCEPTION IS WHEN YOU HAVE *COMPLETE* TRUST OF THE LINK # BETWEEN THE BROWSER AND THE SERVER THAT RUNS THE SSL-HANDLING PROXY, # SUCH AS ON A CLOSED LAN, OR IF THE PROXY RUNS ON THE SAME MACHINE AS # THE BROWSER. 
# IF YOU ARE ABSOLUTELY SURE THAT YOU TRUST THE USER-TO-PROXY LINK, YOU # CAN OVERRIDE THE AUTOMATIC SECURITY MEASURE BY SETTING THE FLAG BELOW. # CONSIDER THE CONSEQUENCES VERY CAREFULLY BEFORE YOU RUN THIS SSL-ACCESSING # PROXY ON AN INSECURE SERVER!!! $OVERRIDE_SECURITY= 0 ; # If you want to encode the URLs of visited pages so that they don't show # up within the full URL in your browser bar, then use proxy_encode() and # proxy_decode(). These are Perl routines that transform the way the # destination URL is included in the full URL. You can either use # some combination of the example encodings below, or you can program your # own routines. The encoded form of URLs should only contain characters # that are legal in PATH_INFO. This varies by server, but using only # printable chars and no "?" or "#" works on most servers. Don't let # PATH_INFO contain the strings "./", "/.", "../", or "/..", or else it # may get compressed like a pathname somewhere. Try not to make the # resulting string too long, either. # Of course, proxy_decode() must exactly undo whatever proxy_encode() does. # Make proxy_encode() as fast as possible-- it's a bottleneck for the whole # program. The speed of proxy_decode() is not as important. # If you're not a Perl programmer, you can use the example encodings that are # commented out, i.e. the lines beginning with "#". To use them, merely # uncomment them, i.e. remove the "#" at the start of the line. If you # uncomment a line in proxy_encode(), you MUST uncomment the corresponding # line in proxy_decode() (note that "corresponding lines" in # proxy_decode() are in reverse order of those in proxy_encode()). You # can use one, two, or all three encodings at the same time, as long as # the correct lines are uncommented. # Starting in version 2.1beta9, don't call these functions directly. Rather, # call wrap_proxy_encode() and wrap_proxy_decode() instead, which handle # certain details that you shouldn't have to worry about in these functions. 
# IMPORTANT:  If you modify these routines, and if $PROXIFY_SCRIPTS is set
#   below (on by default), then you MUST modify $ENCODE_DECODE_BLOCK_IN_JS
#   below!!  (You'll need to write corresponding routines in JavaScript to do
#   the same as these routines in Perl, used when proxifying JavaScript.)
# Because of the simplified absolute URL resolution in full_url(), there may
#   be ".." segments in the default encoding here, notably in the first path
#   segment.  Normally, that's just an HTML mistake, but please tell me if
#   you see any privacy exploit with it.
# Note that a few sites have embedded applications (like applets or Shockwave)
#   that expect to access URLs relative to the page's URL.  This means they
#   may not work if the encoded target URL can't be treated like a base URL,
#   e.g. that it can't be appended with something like "../data/foo.data"
#   to get that expected data file.  In such cases, the default encoding below
#   should let these sites work fine, as should any other encoding that can
#   support URLs relative to it.

# proxy_encode($URL): returns the form of $URL that is embedded in this
#   script's PATH_INFO.  With the default settings only the "://" that follows
#   the scheme is collapsed to "/"; a string with no leading "scheme://" is
#   returned unchanged.
sub proxy_encode {
    my $URL= shift ;

    $URL=~ s{^([\w+.-]+)://}{$1/} ;                # http://xxx -> http/xxx
#    $URL=~ s/(.)/ sprintf('%02x',ord($1)) /ge ;   # each char -> 2-hex
#    $URL=~ tr/a-zA-Z/n-za-mN-ZA-M/ ;              # rot-13

    return $URL ;
}

# proxy_decode($enc_URL): inverse of proxy_encode().  The optional steps are
#   listed in the reverse order of those in proxy_encode(), and the same set
#   must be uncommented in both routines.
sub proxy_decode {
    my $enc_URL= shift ;

#    $enc_URL=~ tr/a-zA-Z/n-za-mN-ZA-M/ ;        # rot-13
#    $enc_URL=~ s/([\da-fA-F]{2})/ sprintf("%c",hex($1)) /ge ;
    $enc_URL=~ s{^([\w+.-]+)/}{$1://} ;          # http/xxx -> http://xxx

    return $enc_URL ;
}

# Encode cookies before they're sent back to the user.
# The return value must only contain characters that are legal in cookie
#   names and values, i.e. only printable characters, and no ";", ",", "=",
#   or white space.
# cookie_encode() is called twice for each cookie: once to encode the cookie
#   name, and once to encode the cookie value.  The two are then joined with
#   "=" and sent to the user.
# cookie_decode() must exactly undo whatever cookie_encode() does.
# Also, cookie_encode() must always encode a given input string into the
#   same output string.  This is because browsers need the cookie name to
#   identify and manage a cookie, so the name must be consistent.
# This is not a bottleneck like proxy_encode() is, so speed is not critical.
# IMPORTANT:  If you modify these routines, and if $PROXIFY_SCRIPTS is set
#   below (on by default), then you MUST modify $ENCODE_DECODE_BLOCK_IN_JS
#   below!!  (You'll need to write corresponding routines in JavaScript to do
#   the same as these routines in Perl, used when proxifying JavaScript.)

# cookie_encode($cookie): returns $cookie with every non-word character
#   replaced by "%" plus its two-digit lowercase hex code.
sub cookie_encode {
    my $cookie= shift ;

#    $cookie=~ s/(.)/ sprintf('%02x',ord($1)) /ge ;   # each char -> 2-hex
#    $cookie=~ tr/a-zA-Z/n-za-mN-ZA-M/ ;              # rot-13
    $cookie=~ s/(\W)/ sprintf('%%%02x', ord($1)) /ge ;   # simple URL-encoding

    return $cookie ;
}

# cookie_decode($enc_cookie): inverse of cookie_encode(); turns each "%xx"
#   (either hex case) back into the character with that code.
sub cookie_decode {
    my $enc_cookie= shift ;

    $enc_cookie=~ s/%([\da-fA-F]{2})/ chr(hex($1)) /ge ;   # URL-decode
#    $enc_cookie=~ tr/a-zA-Z/n-za-mN-ZA-M/ ;          # rot-13
#    $enc_cookie=~ s/([\da-fA-F]{2})/ sprintf("%c",hex($1)) /ge ;

    return $enc_cookie ;
}

# If $PROXIFY_SCRIPTS is true, and if you modify the routines above that
#   encode cookies and URLs, then you need to modify $ENCODE_DECODE_BLOCK_IN_JS
#   here.  Explanation:  When proxifying JavaScript, a library of JavaScript
#   functions is used.  In that library are a few JavaScript routines that do
#   the same as their Perl counterparts in this script.  Four of those routines
#   are proxy_encode(), proxy_decode(), cookie_encode(), and cookie_decode().
#   Thus, unfortunately, when you write your own versions of those Perl routines
#   (or modify what's already there), you also need to write (or modify) these
#   corresponding JavaScript routines to do the same thing.  Put the routines in
#   this long variable $ENCODE_DECODE_BLOCK_IN_JS, and it will be included in
#   the JavaScript library when needed.  Prefix the function names with
#   "_proxy_jslib_", as below.
# The commented examples in the JavaScript routines below correspond exactly to
#   the commented examples in the Perl routines above.  Thus, if you modify the
#   Perl routines by merely uncommenting the examples, you can do the same in
#   these JavaScript routines.  (JavaScript comments begin with "//".)
# [If you don't know Perl:  Note that everything up until the line "EOB" is one
#   long string value, called a "here document".  $ENCODE_DECODE_BLOCK_IN_JS is
#   set to the whole thing.]
# Every hex code produced here must be zero-padded to two digits, exactly like
#   sprintf('%02x') in the Perl routines:  the decoders on both sides only
#   accept two-digit codes, so an unpadded "%9" (for a tab) would never
#   round-trip.

$ENCODE_DECODE_BLOCK_IN_JS= <<'EOB' ;

function _proxy_jslib_proxy_encode(URL) {
    URL= URL.replace(/^([\w\+\.\-]+)\:\/\//, '$1/') ;
//    URL= URL.replace(/(.)/g, function (s,p1) { var c= p1.charCodeAt(0) ; return (c<16 ? '0' : '')+c.toString(16) } ) ;
//    URL= URL.replace(/([a-mA-M])|[n-zN-Z]/g, function (s,p1) { return String.fromCharCode(s.charCodeAt(0)+(p1?13:-13)) }) ;
    return URL ;
}

function _proxy_jslib_proxy_decode(enc_URL) {
//    enc_URL= enc_URL.replace(/([a-mA-M])|[n-zN-Z]/g, function (s,p1) { return String.fromCharCode(s.charCodeAt(0)+(p1?13:-13)) }) ;
//    enc_URL= enc_URL.replace(/([\da-fA-F]{2})/g, function (s,p1) { return String.fromCharCode(eval('0x'+p1)) } ) ;
    enc_URL= enc_URL.replace(/^([\w\+\.\-]+)\//, '$1://') ;
    return enc_URL ;
}

function _proxy_jslib_cookie_encode(cookie) {
//    cookie= cookie.replace(/(.)/g, function (s,p1) { var c= p1.charCodeAt(0) ; return (c<16 ? '0' : '')+c.toString(16) } ) ;
//    cookie= cookie.replace(/([a-mA-M])|[n-zN-Z]/g, function (s,p1) { return String.fromCharCode(s.charCodeAt(0)+(p1?13:-13)) }) ;
    cookie= cookie.replace(/(\W)/g, function (s,p1) { var c= p1.charCodeAt(0) ; return '%'+(c<16 ? '0' : '')+c.toString(16) } ) ;
    return cookie ;
}

function _proxy_jslib_cookie_decode(enc_cookie) {
    enc_cookie= enc_cookie.replace(/%([\da-fA-F]{2})/g, function (s,p1) { return String.fromCharCode(eval('0x'+p1)) } ) ;
//    enc_cookie= enc_cookie.replace(/([a-mA-M])|[n-zN-Z]/g, function (s,p1) { return String.fromCharCode(s.charCodeAt(0)+(p1?13:-13)) }) ;
//    enc_cookie= enc_cookie.replace(/([\da-fA-F]{2})/g, function (s,p1) { return
String.fromCharCode(eval('0x'+p1)) } ) ; return enc_cookie ; } EOB #-------------------------------------------------------------------------- # End of normal user configuration. #-------------------------------------------------------------------------- sub usage { print < \$num_processes, 'max-requests|m=i' => \$max_requests, 'port|p=i' => \$port_arg, 'quiet|q' => \$quiet, 'old-config=s' => \$old_conf_file, # only used for "install" command 'help|h|?' => \$wants_help) or die "bad options-- try '$ZERO -?' for help\n" ; usage() if $wants_help or $cmd eq '' ; # Read config file if it exists, else run config(). Don't do during install. $config_file= File::Spec->catfile($PROXY_DIR, 'cgiproxy.conf') ; my($config_version, $script_location) ; if ($cmd eq 'install') { if (-e $old_conf_file) { ($config_version, $script_location, $PROXY_ID)= read_config_file($old_conf_file) ; } else { config() ; } } else { if (-e $config_file) { ($config_version, $script_location, $PROXY_ID)= read_config_file($config_file) ; } else { config() ; } } # Set this before either set_constants() or one_run() . $ENV{SERVER_PORT}= $USER_FACING_PORT if $USER_FACING_PORT ; # Start the FastCGI process manager. if ($cmd eq 'start-fcgi') { $num_processes||= $FCGI_NUM_PROCESSES ; $max_requests||= $FCGI_MAX_REQUESTS_PER_PROCESS ; # $quiet is messy to support, since FCGI and FCGI::ProcManager write a # lot to STDERR, and we still want to let die() write to our original # STDERR. So, redirect STDERR to /dev/null while saving the original # STDERR for die(). There may be a better approach, but the FCGI # modules aren't well-documented. 
:( if ($quiet) { no warnings 'once' ; open(OLDERR, '>&', 'STDERR') or die "can't dup STDERR: $!\n" ; use warnings ; open(STDERR, '>', File::Spec->devnull()) or die "can't open /dev/null: $!\n" ; $SIG{__DIE__}= sub { # pointless to "open() or die", since STDERR is still /dev/null open(STDERR, '>&', 'OLDERR') if $quiet and !$^S ; die @_ ; } ; } my $failed ; eval { require FCGI } ; $failed||= $@ ; eval { require FCGI::ProcManager } ; $failed||= $@ ; die <new( { n_processes => $num_processes, max_requests => $max_requests, pm_title => $zero } ) ; my $socket= FCGI::OpenSocket($FCGI_SOCKET, 10) ; my $request= FCGI::Request($STDIN, $STDOUT, \*STDERR, \%ENV, $socket) ; $proc_mgr->pm_manage() ; while ($request->Accept>=0) { $proc_mgr->pm_pre_dispatch() ; # required for FCGI::ProcManager set_constants() unless $HAS_SET_CONSTANTS; eval { one_run() } ; # warn $@ if $@ ; # jsm-- should do anything else here? $proc_mgr->pm_post_dispatch() ; # required for FCGI::ProcManager } FCGI::CloseSocket($socket); # Use the embedded server (daemon). } elsif ($cmd eq 'start-server') { $port_arg||= 443 ; eval { require Net::SSLeay } ; # don't check during compilation die "Running CGIProxy as a daemon requires the Net::SSLeay module.\n" if $@ ; $Net::SSLeay::ssl_version= 12 ; # start with TLS 1.2, then try lower as needed $RUN_METHOD= 'embedded' ; # We need the port before calling set_constants(), which complicates this. my($LOCK_FH, $port, $pid)= create_server_lock('http.run') ; if ($LOCK_FH) { my($HTTPS_LISTEN, $err) ; ($HTTPS_LISTEN, $port, $err)= new_server_socket($port_arg) ; die "Error opening listening socket: $err\n" if $err ; &set_ENV_UNCHANGING($port) ; %ENV= %ENV_UNCHANGING ; # needed for init set_constants() ; if ($RUN_AS_GROUP_ID and $>==0) { $(= $)= $RUN_AS_GROUP_ID ; die "Can't change group ID to $RUN_AS_GROUP_ID: $!" if $! ; } if ($RUN_AS_USER_ID and $>==0) { $<= $>= $RUN_AS_USER_ID ; die "Can't change user ID to $RUN_AS_USER_ID: $!" if $! 
; } $pid= spawn_generic_server($HTTPS_LISTEN, $LOCK_FH, \&handle_http_request, 0, 1) ; } my $hostname= hostfqdn() ; $hostname=~ s/\.$// ; # bug in hostfqdn() may leave trailing dot my $portst= $port==443 ? '' : ":$port" ; print "URL of this proxy: https://$hostname$portst/$SECRET_PATH/\n\nProcess ID: $pid\n" ; # This needs to be done periodically, to clear out old cookies and sessions. # Best to put it in a cron job. } elsif ($cmd eq 'purge-db') { set_constants() ; purge_db() ; # Keep this as a separate option, in case the installer wants to install # CPAN modules as root but do other initialization as the user. } elsif ($cmd eq 'install-modules') { set_constants() ; install_modules() ; # Initialize the system as securely as possible. How well we can do this # depends on if we're the root user, script owner, in the script's group, # or none of those. } elsif ($cmd eq 'init') { print <catfile($PROXY_DIR, 'cgiproxy.conf') ; write_config_file($config_file, $installed_script) ; # Add any required cron jobs if able, else tell user. add_cron_jobs($installed_script) if $DB_DRIVER ne '' or $REPORT_USAGE ; phone_home('', $PROXY_VERSION, $REPORT_USAGE) ; print "*** CGIProxy successfully installed. ***\n\n" ; print <', File::Spec->catfile($PROXY_DIR, 'releases', "cgiproxy.$latest_version.tar.gz")) or die "Can't save cgiproxy.$latest_version.tar.gz to $PROXY_DIR/releases/ : $!\n" ; print $tarballfh $tarball ; close($tarballfh) ; # This is used when upgrading, so that new settings can be set. # Everything that might need to be done by the new script is done here. # Note that updating the database is messy, and is not supported yet. It will be. } elsif ($cmd eq 'upgrade-inner') { set_constants() ; create_directory_structure() ; # Run config() and then read_config_file(), so that anything new not set # in the config file will be set by config(). 
$config_file= File::Spec->catfile($PROXY_DIR, 'cgiproxy.conf') ; config() ; my($config_version, $script_location, $proxy_id)= read_config_file($config_file) ; $script_location||= shift ; $PROXY_ID= $proxy_id || random_string(20, 40) ; write_config_file($config_file, $script_location) ; # uses $PROXY_ID install_modules() ; phone_home($config_version, $PROXY_VERSION, $REPORT_USAGE) ; } elsif ($cmd eq 'uninstall') { set_constants() ; uninstall($script_location, $quiet) ; } elsif ($cmd eq 'configure' or $cmd eq 'config') { die <catfile($PROXY_DIR, 'cgiproxy.conf') ; if (-e $config_file) { read_config_file($config_file) ; } else { config() ; } } # Set this before either set_constants() or one_run() . # This is messy and should be cleaned up. $ENV{SERVER_PORT}= $USER_FACING_PORT if $USER_FACING_PORT ; set_constants() unless $HAS_SET_CONSTANTS ; eval { one_run() } ; # We'd act on $@, but it does what we need below anyway. } EXIT: # Catch-all-- if any handles are still open, close them here. Some error # handling relies on this happening. Also cancel existing alarm. # These are basically for mod_perl, and unneeded if running as a CGI script. close(S) ; untie(*S) ; eval { alarm(0) } ; # use eval{} to avoid failing where alarm() is missing exit unless $RUN_METHOD eq 'mod_perl' ; # mod_perl scripts must not exit #-------------------------------------------------------------------------- # DONE!! #-------------------------------------------------------------------------- #-------------------------------------------------------------------------- # Set or adjust all variables that remain constant for all runs. #-------------------------------------------------------------------------- # set_constants() is somewhat overloaded-- it adjusts config variables, as well as # sets other globals for multiple runs. 
# set_constants($before_wizard)
#   Sets or adjusts every global that stays fixed across runs:  MIME type
#   lists, database settings, copies of SERVER_PORT and SCRIPT_NAME, proxy
#   settings, caching headers, and the derived MIME type regexes.
#   When $before_wizard is true, the RUN_AS user/group fix-up is skipped.
#   Sets $HAS_SET_CONSTANTS to 1 on completion; callers test that flag so
#   that this routine runs only once.
sub set_constants {
    my($before_wizard)= @_ ;   # hack to accommodate this overloading

    # First are a few variables that were in the config section, but aren't really
    #   config variables.

    # This must be an array of languages that run right-to-left.  Normally
    #   only the 2-character codes are needed.
    @RTL_LANG= qw( ar fa ) ;

    # This lists all MIME types that could identify a script, and which will be
    #   filtered out as well as possible if removing scripts:  HTTP responses with
    #   Content-Type: set to one of these will be nixed, certain HTML which links
    #   to one of these types will be removed, style sheets with a type here will
    #   be removed, and other odds and ends.
    # These are used in matching, so can't contain special regex characters.
    # This list is also used for the $PROXIFY_SCRIPTS function.
    # This list contains all script MIME types I know of, but I can't guarantee
    #   it's a complete list.  It's largely taken from the examples at
    #     http://www.robinlionheart.com/stds/html4/scripts.html
    #   That page describes only the first four below as valid.
    # The page at ftp://ftp.isi.edu/in-notes/iana/assignments/media-types/media-types
    #   lists all media (MIME) types registered with the IANA, but unfortunately
    #   many script types (especially proprietary ones) have not registered with
    #   them, and that list doesn't specify which types are script content anyway.
    @SCRIPT_MIME_TYPES= ('application/x-javascript', 'application/x-ecmascript',
                         'application/x-vbscript',   'application/x-perlscript',
                         'application/javascript',   'application/ecmascript',
                         'text/javascript',  'text/ecmascript', 'text/jscript',
                         'text/livescript',  'text/vbscript',   'text/vbs',
                         'text/perlscript',  'text/tcl',
                         'text/x-scriptlet', 'text/scriptlet',
                         'application/hta',  'application/x-shockwave-flash',
                        ) ;

    # All MIME types in @SCRIPT_MIME_TYPES and @OTHER_TYPES_TO_REGISTER will be
    #   "registered".  Registration helps the script remember which MIME type is
    #   expected by a page when downloading embedded URLs, e.g. style sheets.  Any
    #   MIME types that need special treatment should be listed here if they're not
    #   already in @SCRIPT_MIME_TYPES.
    # If you write a handler for a new MIME type in proxify_block(), and that type
    #   isn't already listed in @SCRIPT_MIME_TYPES, then add it here.
    # The Perl code in this program supports up to 64 registered MIME types, but
    #   the JS _proxy_jslib_pack_flags() and _proxy_jslib_unpack_flags() routines
    #   only support 26.  Thus, fix the JS code if there are ever more than 26 types.
    # "x-proxy/xhr" is a special case-- it's used to support the JavaScript class
    #   XMLHttpRequest .  Data downloaded through that should not be proxified,
    #   even if it's HTML data; it's proxified later when it's added to a document.
    #   Using the "x-proxy/xhr" type is part of avoiding that first proxification.
    # "x-proxy/worker" is another special case, only used when starting a Worker
    #   object-- if set, it will prepend jslib to the start of the downloaded
    #   JS content.
    @OTHER_TYPES_TO_REGISTER= ('text/css', 'x-proxy/xhr', 'x-proxy/worker',
                               'text/cache-manifest', 'text/html-import') ;

    # These are MIME types that we *may* try to rewrite in proxify_block(), e.g.
    #   to send all URLs back through this script.  If a type isn't on this list,
    #   then we know for certain it should be sent back to the user unchanged,
    #   which saves time.
    # If you write a handler for a new MIME type in proxify_block(), then add the
    #   type here.
    # text/html is not on this list because currently it's handled specially.
    @TYPES_TO_HANDLE= ('text/css', 'x-proxy/worker', 'text/cache-manifest',
                       'application/x-javascript', 'application/x-ecmascript',
                       'application/javascript',   'application/ecmascript',
                       'text/javascript',  'text/ecmascript',
                       'text/livescript',  'text/jscript',
                       'application/x-shockwave-flash',
                       'image/svg+xml', 'application/mathml+xml',
                      ) ;

    # This is a list of all file extensions that will be disallowed if
    #   $TEXT_ONLY is set.  It's an inexact science.  If you want to ban
    #   other file extensions, you can add more to this list.  Note that
    #   removing extensions from this list won't necessarily allow those
    #   files through, since there are other ways $TEXT_ONLY is implemented,
    #   such as only allowing MIME types of text/* .
    # The format of this list is one long string, with the extensions
    #   separated by "|".  This is because the string is actually used as
    #   a regular expression.  Don't worry if you don't know what that means.
    # Extensions are roughly taken from Netscape's "Helper Preferences" screen
    #   (but that was in 1996).  A more complete list might be made from a
    #   mime.types file.
    $NON_TEXT_EXTENSIONS=
          'gif|jpeg|jpe|jpg|tiff|tif|png|bmp|xbm'   # images
        . '|mp2|mp3|wav|aif|aiff|au|snd'            # audios
        . '|avi|qt|mov|mpeg|mpg|mpe'                # videos
        . '|gz|Z|exe|gtar|tar|zip|sit|hqx|pdf'      # applications
        . '|ram|rm|ra|swf' ;                        # others

    # Set $RUN_AS_USER, $RUN_AS_USER_ID, $RUN_AS_GROUP, and $RUN_AS_GROUP_ID
    #   based on settings of $RUN_AS_USER and $RUN_AS_GROUP, which can start as
    #   any of empty, numeric IDs, or user/group names.
    # They default to owner and group of this script file.
    # Better to set $RUN_AS_USER and $RUN_AS_GROUP explicitly.
    # Don't do this on Windows.
    fix_run_as_user_group()  if $^O ne 'MSWin32' and !$before_wizard ;

    # Use local::lib if so configured.
    # local::lib->import() seems to have a bug where it actually removes the
    #   passed path from @INC, rather than adding it (?).  Running import()
    #   twice seems to do what we need.  :/
    if ($LOCAL_LIB_DIR ne '') {
        fix_local_lib_dir() ;   # ensure is absolute path
        push(@INC, File::Spec->catdir($LOCAL_LIB_DIR, qw(lib perl5))) ;   # to find local::lib
        eval {
            require local::lib ;
            local::lib->import($LOCAL_LIB_DIR) ;
            local::lib->import($LOCAL_LIB_DIR) ;
        } ;   # ignore errors
    }

    # Set %RTL_LANG from @RTL_LANG .
    @RTL_LANG{@RTL_LANG}= (1) x @RTL_LANG ;

    # Allow installer to set $DB_DRIVER="MySQL" in config.
    $DB_DRIVER= 'mysql'  if lc($DB_DRIVER) eq 'mysql' ;

    &HTMLdie("\$DB_NAME must only contain letters, numbers, and \"_\".")
        if $DB_DRIVER and !($DB_NAME=~ /^\w+\z/) ;

    $DB_FULLPATH= File::Spec->catfile($PROXY_DIR, 'sqlite', $DB_NAME)
        if $DB_DRIVER eq 'SQLite' ;

    if ($DB_SERVER ne '') {
        # A bracketed IPv6 address may come with or without a port, i.e.
        #   "[::1]:3306" or just "[::1]".  The ":port" part is optional in
        #   the pattern so that the port-less form still yields its host.
        my($db_host, $db_port)= $DB_SERVER=~ /\[/
            ? $DB_SERVER=~ /^\[([^\]]*)\](?::(.*))?/
            : split(/:/, $DB_SERVER) ;
        $db_host= (defined($db_host) and $db_host ne '')  ? ";host=$db_host"  : '' ;
        $db_port= (defined($db_port) and $db_port ne '')  ? ";port=$db_port"  : '' ;
        ($DB_HOSTPORT= $db_host . $db_port)=~ s/^;// ;
    } else {
        $DB_HOSTPORT= '' ;
    }

    # These are used in rfc1123_date() and date_is_after().
    @MONTH=   qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec) ;
    @WEEKDAY= qw(Sun Mon Tue Wed Thu Fri Sat Sun) ;
    %UN_MONTH= map { lc($MONTH[$_]), $_+1 }  0..$#MONTH ;   # look up by month name, 1-based

    @ALPHANUMERIC= (0..9, 'a'..'z', 'A'..'Z', '_') ;   # matches \w in patterns

    # Create the sets of regular expressions we'll need if we proxify scripts.
    # So far, the only script type we proxify is JavaScript.
    &set_RE_JS  if $PROXIFY_SCRIPTS ;

    # Next, make copies of any constant environment variables, and fix as needed.

    # SERVER_PORT and SCRIPT_NAME will be constant, and are used in several places.
    #   Besides, we need SCRIPT_NAME fixed before setting $THIS_SCRIPT_URL.
    # SCRIPT_NAME should have a leading slash, but the old CGI "standard" from
    #   NCSA was unclear on that, so some servers didn't give it a leading
    #   slash.  Here we ensure it has a leading slash.
    # Exception: If SCRIPT_NAME is empty, then we're using a daemon, so leave it empty.
    # Apache has a bug where SCRIPT_NAME is wrong if the PATH_INFO has "//" in it;
    #   it's set to the script name plus all of PATH_INFO up until its final "//".
    #   To work around this, truncate SCRIPT_NAME at the first place it matches $0.
    #   PATH_INFO is also changed to collapse all multiple slashes into a single
    #   slash, which is not worked around here.  This bug should be fixed in
    #   Apache 2.0.55 and later.
    # Some servers provide $0 as a complete path rather than just the filename,
    #   so extract the filename.
    $ENV{SCRIPT_NAME}=~ s#^/?#/#  if $ENV{SCRIPT_NAME} ne '' ;
    if ($ENV{SERVER_SOFTWARE}=~ /^Apache\b/i) {
        my($zero)= $ZERO=~ m#([^/]*)$# ;
        # Truncate only when the file name really occurs in SCRIPT_NAME.  If
        #   it doesn't (e.g. the script is reached under an aliased name),
        #   keep SCRIPT_NAME as the server gave it rather than wiping it out.
        $ENV{SCRIPT_NAME}= $1
            if $zero ne '' and $ENV{SCRIPT_NAME}=~ /^(.*?\Q$zero\E)/ ;
    }
    $ENV_SERVER_PORT= $ENV{SERVER_PORT} ;
    $ENV_SCRIPT_NAME= $ENV{SCRIPT_NAME} ;

    # The nginx server sets SCRIPT_NAME to the entire request-URI, so fix it.
    # Must do this only on $ENV_SCRIPT_NAME and not $ENV{SCRIPT_NAME}, because
    #   later we'll need the latter to get PATH_INFO.  :P
    # NOTE(review): unlike the Apache case above, a failed match here still
    #   clears $ENV_SCRIPT_NAME; that is left as-is because the untruncated
    #   request-URI would not be a usable script name either.
    if ($ENV{SERVER_SOFTWARE}=~ /^nginx\b/i) {
        if ($RUN_METHOD eq 'fastcgi') {
            $ENV_SCRIPT_NAME= '/' . $SECRET_PATH ;
        } else {
            my($zero)= $ZERO=~ m#([^/]*)$# ;
            ($ENV_SCRIPT_NAME)= $ENV_SCRIPT_NAME=~ /^(.*?\Q$zero\E)/  if $zero ne '' ;
        }
    }

    # If we're running as the embedded server, use $SECRET_PATH .
    $ENV_SCRIPT_NAME= '/' . $SECRET_PATH
        if $RUN_METHOD eq 'embedded' or $RUN_METHOD eq 'fastcgi' ;

    # Next, adjust config variables as needed, or create any needed constants from
    #   them.

    # For the external tests, create hashes of parsed URLs if the tests are CGI calls.
    # Note that the socket names must each be unique!
    if ($USER_IP_ADDRESS_TEST=~ m#^(https?)://([^/?:]*):?(\d*)(.*)#i) {
        my $port= $3 ne ''  ? $3  : lc($1) eq 'https'  ? 443  : 80 ;
        @{$USER_IP_ADDRESS_TEST_H}{qw(scheme host port path socket open)}=
            (lc($1), lc($2), $port, $4, 'S_USERTEST', 0) ;
    }
    if ($DESTINATION_SERVER_TEST=~ m#^(https?)://([^/?:]*):?(\d*)(.*)#i) {
        my $port= $3 ne ''  ? $3  : lc($1) eq 'https'  ? 443  : 80 ;
        @{$DESTINATION_SERVER_TEST_H}{qw(scheme host port path socket open)}=
            (lc($1), lc($2), $port, $4, 'S_DESTTEST', 0) ;
    }

    # If $RUNNING_ON_SSL_SERVER is '', then guess based on SERVER_PORT.
    # Don't do this during installation.
    $RUNNING_ON_SSL_SERVER= ($ENV_SERVER_PORT!=80)
        if $RUNNING_ON_SSL_SERVER eq '' and $RUN_METHOD ne '' ;

    # Or, if we're a daemon, then it's always true.
    $RUNNING_ON_SSL_SERVER= 1  if $RUN_METHOD eq 'embedded' ;

    # $DB_DRIVER is required for $USE_DB_FOR_COOKIES to be true.
    $USE_DB_FOR_COOKIES= 0  unless $DB_DRIVER ne '' ;

    # Set this constant based on whether the server is IIS, because we have to
    #   test it later for every run to work around a bug in IIS.  A constant here
    #   saves time when using mod_perl.
    $RUNNING_ON_IIS= ($ENV{'SERVER_SOFTWARE'}=~ /IIS/) ;

    # FastCGI doesn't support NPH scripts.  :P
    $NOT_RUNNING_AS_NPH= 1  if $RUN_METHOD eq 'fastcgi' ;

    # Create @NO_PROXY from $NO_PROXY for efficiency.
    @NO_PROXY= split(/\s*,\s*/, $NO_PROXY) ;

    # Base64-encode $PROXY_AUTH and $SSL_PROXY_AUTH if they're not encoded already.
    $PROXY_AUTH=     &base64($PROXY_AUTH)      if $PROXY_AUTH=~ /:/ ;
    $SSL_PROXY_AUTH= &base64($SSL_PROXY_AUTH)  if $SSL_PROXY_AUTH=~ /:/ ;

    # Allow $SOCKS_PROXY to be set to just port number, with or without leading ":".
    $SOCKS_PROXY= (substr($SOCKS_PROXY, 0, 1) eq ':')  ? "127.0.0.1$SOCKS_PROXY"
                : $SOCKS_PROXY=~ /^\d+$/               ? "127.0.0.1:$SOCKS_PROXY"
                :                                        $SOCKS_PROXY ;

    # Guarantee URLs in @PROXY_GROUP have no trailing slash.
    foreach (@PROXY_GROUP) { s#/$## }

    # Create $NO_CACHE_HEADERS depending on $MINIMIZE_CACHING setting; it is placed
    #   in every response.  Note that in all the "here documents" we use for error
    #   messages, it has to go on the same line as another header to avoid a blank
    #   line in the response.
    $NO_CACHE_HEADERS= $MINIMIZE_CACHING
        ? "Cache-Control: no-store\015\012Pragma: no-cache\015\012"
        : '' ;

    # Canonicalize all MIME types to lowercase.
    for (@SCRIPT_MIME_TYPES)        { $_= lc }
    for (@OTHER_TYPES_TO_REGISTER)  { $_= lc }

    # Create @ALL_TYPES and %MIME_TYPE_ID, which are inverses of each other.
    # This is useful e.g. to identify the MIME type expected in a given download,
    #   in a one-character flag.  That's why we limit this to 64 types for now.
    # $ALL_TYPES[0] is '', so we can test e.g. "if $MIME_TYPE_ID{$id} ..." .
    @ALL_TYPES= ('', @SCRIPT_MIME_TYPES, @OTHER_TYPES_TO_REGISTER) ;
    &HTMLdie("Too many MIME types to register.")  if @ALL_TYPES > 64 ;
    @MIME_TYPE_ID{@ALL_TYPES}= 0..$#ALL_TYPES ;

    # Regex that matches a script MIME type.
    $SCRIPT_TYPE_REGEX= '(' . join("|", @SCRIPT_MIME_TYPES) . ')' ;

    # Regex that tells us whether we handle a given MIME type.
    for (@TYPES_TO_HANDLE) { s/(\W)/\\$1/g }
    $TYPES_TO_HANDLE_REGEX= '(' . join("|", @TYPES_TO_HANDLE) . ')' ;

    # Only need to run this routine once
    $HAS_SET_CONSTANTS= 1 ;

    # End of initialization of constants.

} # sub set_constants {

#--------------------------------------------------------------------------
#    Global constants are now set.
#--------------------------------------------------------------------------

#--------------------------------------------------------------------------
#    Do any initialization that is required for every run.
#--------------------------------------------------------------------------

# What used to be the "main" code has now been divided up between init() and
#   one_run() .

sub one_run {

# OK, let's time this thing
#my $starttime= time ;
#my($sutime,$sstime)= (times)[0,1] ;

# This is needed to run an NPH script under mod_perl.
# Other stuff needed for mod_perl:
#   must use at least Perl 5.004, or STDIN and STDOUT won't behave correctly;
#   cannot use exit();
#   must initialize or reset all vars;
#   regex's with /o option retain state between calls, so be careful;
#   typeglobbing of *STDIN doesn't work, so must pass filehandles as strings.
local($|)= 1 ;

# In mod_perl, global variables are retained between calls, so they must
#   be initialized correctly.  In this program, (most) UPPER_CASE variables
#   are persistent constants, i.e. they aren't changed after they're
#   initialized above (in the $HAS_BEGUN block).  We also assume that no
#   lower_case variables are set before here.  It's a little hacky and possibly
#   error-prone if user customizations don't follow these conventions, but it's
#   fast and simple.
# So, if you're using mod_perl and you make changes to this script, don't # modify existing UPPER_CASE variables after the $HAS_BEGUN block above, # don't set lower_case variables before here, and don't use UPPER_CASE # variables for anything that will vary from run to run. # BUG IN PERL 5.24.1 and 5.24.2: "reset" can cause a segmentation fault, untrappable by # eval/die . Which other versions of Perl have this bug? I've heard # conflicting reports. 5.22.1 also, so assume all 5.22.x versions have it. if ($]==5.024001 or $]==5.024002 or int($]*1000)==5022) { no strict 'refs' ; /^[a-z]/ && (undef $$_, undef @$_, undef %$_) foreach sort keys %:: ; # like "reset 'a-z'", but slower } else { reset 'a-z' ; } $URL= '' ; # (almost) only uppercase variable that varies from run to run $now= time ; # have to do it a second time here, after reset :P $csp_is_supported= &csp_is_supported() ; # Set $THIS_HOST to the best guess how this script was called-- use the # Host: request header if available; otherwise, use SERVER_NAME. # We don't bother with a $THIS_PORT, since it's more reliably set to the port # through which the script was called. SERVER_NAME is much more likely to # be different from the hostname that the user sees, since one server may # handle many domains or have many hostnames. # This has to be calculated every run, since there may be multiple hostnames. if ($ENV{'HTTP_HOST'} ne '') { ($THIS_HOST)= $ENV{'HTTP_HOST'}=~ /\[/ ? $ENV{'HTTP_HOST'}=~ m#^(?:[\w+.-]+://)?\[([^\]]*)\]# : $ENV{'HTTP_HOST'}=~ m#^(?:[\w+.-]+://)?([^:/?]*)# ; $THIS_HOST= $ENV{'SERVER_NAME'} if $THIS_HOST eq '' ; } else { $THIS_HOST= $ENV{'SERVER_NAME'} ; } # Build the constant $THIS_SCRIPT_URL from environment variables. Only include # SERVER_PORT if it's not 80 (or 443 for SSL). $THIS_SCRIPT_URL= $RUNNING_ON_SSL_SERVER ? 'https://' . $THIS_HOST . ($ENV_SERVER_PORT==443 ? '' : ':' . $ENV_SERVER_PORT) . $ENV_SCRIPT_NAME : 'http://' . $THIS_HOST . ($ENV_SERVER_PORT==80 ? '' : ':' . 
$ENV_SERVER_PORT) . $ENV_SCRIPT_NAME ; # This script uses whatever version of HTTP the client is using. So far # only 1.0 and 1.1 are supported. ($HTTP_VERSION)= $ENV{'SERVER_PROTOCOL'}=~ m#^HTTP/(\d+\.\d+)#i ; $HTTP_VERSION= '1.1' unless $HTTP_VERSION=~ /^1\.[01]$/ ; # Hack to support non-NPH installation-- luckily, the format of a # non-NPH response is almost exactly the same as an NPH response. # The main difference is the first word in the status line-- something # like "HTTP/1.x 200 OK" can be simulated with "Status: 200 OK", as # long as the server supports the Status: CGI response header. So, # we set that first word to either "HTTP/1.x" or "Status:", and use # it for all responses throughout the script. # NOTE: This is not the only difference between an NPH and a non-NPH # response. For example, the Location: header has different semantics # between the two types of responses. This hack is only an approximation # that we hope works most of the time. It's better to install the script # as an NPH script if possible (which it almost always is). # Technically, the HTTP version in the response is supposed to be the highest # version supported by the server, even though the rest of the response may # be in the format of an earlier version. Unfortunately, CGI scripts do # not have access to that value; it's a hole in the CGI standard. $HTTP_1_X= $NOT_RUNNING_AS_NPH ? 'Status:' : "HTTP/$HTTP_VERSION" ; # Fix submitted by Alex Freed: Under some unidentified conditions, # instances of nph-proxy.cgi can hang around for many hours and drag the # system. So until we figure out why that is, here's a 10-minute timeout. # Please write me with any insight into this, since I can't reproduce the # problem. Under what conditions, on what systems, does it happen? # 9-9-1999: One theory is that it's a bug in older Apaches, and is fixed by # upgrading to Apache 1.3.6 or better. 
# Julian Haight reports seeing the same problem with other scripts on
#   Apache 1.3.3, and it cleared up when he upgraded to Apache 1.3.6.  Let me
#   know if you can confirm this.
# alarm() is missing on some systems (such as Windows), so use eval{} to
#   avoid failing when alarm() isn't available.
# As of version 2.1:  We now only do this if we're running on Apache that is
#   earlier than version 1.3.6, to allow large downloads for everyone else.
# The version test compares (major, minor, patch) against (1, 3, 6) in order:
#   each <=> yields -1, 0, or 1, and the "or" chain returns the first non-zero
#   result, so a negative value means "older than 1.3.6".  A missing patch
#   level leaves $3 undefined, which compares numerically as 0.
if ($ENV{'SERVER_SOFTWARE'}=~ m#^Apache/(\d+)\.(\d+)(?:\.(\d+))?#i) {
    if (($1<=>1 or $2<=>3 or $3<=>6) < 0) {
        $SIG{'ALRM'} = \&timeexit ;
        eval { alarm(600) } ;     # use where it works, ignore where it doesn't
    }
}

# Exit upon timeout.  If you wish, add code to clean up and log an error.
# Installed above as the SIGALRM handler; it takes no arguments and simply
#   jumps to the ONE_RUN_EXIT label, which is not visible in this part of
#   the file.
sub timeexit { goto ONE_RUN_EXIT }


# Fix any environment variables that the server may have set wrong.
# Note that some constant environment variables are copied to variables above,
#   and fixed there.

# The IIS server doesn't set PATH_INFO correctly-- it sets it to the entire
#   request URI, rather than just the part after the script name.  So fix it
#   here if we're running on IIS.  Thanks to Dave Moscovitz for the info!
# The script name is quoted with \Q...\E so that characters such as "." or "+"
#   in the path are matched literally rather than as regex metacharacters,
#   the same way the nginx fix below does it.
$ENV{'PATH_INFO'} =~ s/^\Q$ENV_SCRIPT_NAME\E// if $RUNNING_ON_IIS ;

# The nginx server also doesn't set PATH_INFO, or even SCRIPT_NAME, correctly--
#   it sets SCRIPT_NAME to the entire request URI, and PATH_INFO to nothing.  So fix it.
# $ENV_SCRIPT_NAME has earlier been set correctly.
# PATH_INFO becomes a copy of SCRIPT_NAME with the leading $ENV_SCRIPT_NAME
#   removed; $ENV{SCRIPT_NAME} itself is left untouched.
($ENV{PATH_INFO}= $ENV{SCRIPT_NAME})=~ s/^\Q$ENV_SCRIPT_NAME\E//
    if $ENV{SERVER_SOFTWARE}=~ /^nginx\b/i and $ENV{PATH_INFO} eq '' ;

# PATH_INFO may or may not be URL-encoded when we get it; it seems to vary
#   by server.  This script assumes it's still encoded.  Thus, if it's not,
#   we need to re-encode it.
# The only time this seems to come up is when spaces are in URLs, correctly
#   represented in the URL as %20 but decoded to " " in PATH_INFO.  Thus,
#   this hack only focuses on space characters.  It's a hack that I'm not at
#   all comfortable with.
:P # Very yucky business, this encoding thing. if ($ENV{'PATH_INFO'}=~ / /) { $ENV{'PATH_INFO'} =~ s/%/%25/g ; $ENV{'PATH_INFO'} =~ s/ /%20/g ; } # Protect with $SECRET_PATH when appropriate. if ($RUN_METHOD eq 'embedded' and !($ENV{'PATH_INFO'}=~ s#^/\Q$SECRET_PATH\E(/|$)#$1#)) { select((select($STDOUT), $|=1)[0]) ; # unbuffer the socket print $STDOUT "HTTP/1.1 404 Not Found\015\012\015\012" ; die "exiting" ; } # Copy often-used environment vars into scalars, for efficiency $env_accept= $ENV{'HTTP_ACCEPT'} || '*/*' ; # may be modified later # PATH_INFO consists of path segments of the language and flags, followed by the encoded # target URL. For example, PATH_INFO might be something like # "/en/20/http/www.example.com". The actual format of the flag segment # is defined in the routine pack_flags(). # Thanks to Mike Harding for the idea of using another flag for the # $is_in_frame parameter, instead of using two parallel scripts. # Extract flags and encoded URL from PATH_INFO. ($lang, $packed_flags, $encoded_URL)= $ENV{'PATH_INFO'}=~ m#^/([^/]*)/?([^/]*)/?(.*)# ; $lang=~ s/[^\w-]//g ; # language codes can only have alphanumeric and "-" $lang= $DEFAULT_LANG if $lang eq '' ; # Set "dir" attribute based on %RTL_LANG . $dir= $RTL_LANG{$lang} ? ' dir="rtl"' : '' ; # Set all $e_xxx variables ("effective-xxx") and anything else from flag # segment of PATH_INFO. If user config is not allowed or if flag segment # is not present, then set $e_xxx variables from hard-coded config variables # instead (but still set anything else as needed from PATH_INFO). if ( $ALLOW_USER_CONFIG && ($packed_flags ne '') ) { ($e_remove_cookies, $e_remove_scripts, $e_filter_ads, $e_hide_referer, $e_insert_entry_form, $is_in_frame, $expected_type)= &unpack_flags($packed_flags) ; } else { # $is_in_frame is set in any case. It indicates whether the current # request will be placed in a frame. 
($e_remove_cookies, $e_remove_scripts, $e_filter_ads, $e_hide_referer, $e_insert_entry_form, $is_in_frame, $expected_type)= ($REMOVE_COOKIES, $REMOVE_SCRIPTS, $FILTER_ADS, $HIDE_REFERER, $INSERT_ENTRY_FORM, (&unpack_flags($packed_flags))[5..6] ) ; } # Set any other $e_xxx variables not from flag segment [none currently]. # Flags are now set, and $encoded_URL now contains only the encoded target URL. # Create a one-flag test for whether we're inserting anything into THIS page. # This must happen after user flags are read, just above. $doing_insert_here= !$is_in_frame && ( $e_insert_entry_form || ($INSERT_FILE ne '') || ($INSERT_HTML ne '') ) ; # One user reported problems with binary files on certain other OS's, and # this seemed to fix it. Supposedly, either this or the "binmode S" # statements below the newsocketto() calls work, or all; I'm putting all in. # Tell me anything new you figure out about this. binmode $STDOUT ; #-------------------------------------------------------------------------- # parse URL, make checks, and set various globals #-------------------------------------------------------------------------- # Calculate $url_start for use later in &full_url() and elsewhere. It's an # integral part of &full_url(), placed here for speed, similar to the # variables set in &fix_base_vars. # $url_start is the first part of every proxified URL. A complete proxified # URL is made by appending &wrap_proxy_encode(URL) (and possibly a #fragment) to # $url_start. $url_start normally consists of the current script's URL # (or one from @PROXY_GROUP), plus a flag segment in PATH_INFO, complete # with trailing slash. For example, a complete $url_start might be # "http://www.example.com/path/nph-proxy.cgi/010110A/" . # $url_start_inframe and $url_start_noframe are used to force the frame flag # on or off, for example when proxifying a link that causes frames to be # entered or exited. Otherwise, most links inherit the current frame state. 
# $script_url is used later for Referer: support, and whenever a temporary # copy of $url_start has to be generated. # In earlier versions of CGIProxy, $url_start was called $this_url, which is # really what it was originally. Its semantics had drifted somewhat since # then, so they have been cleaned up, and $url_start is now more descriptive. # Set $url_start to a random element of @PROXY_GROUP, if that is set. if (@PROXY_GROUP) { # srand is automatically called in Perl 5.004 and later. It might be # desirable to seed based on the URL, so that multiple requests for # the same URL go through the same proxy, and may thus be cached. #srand( unpack('%32L*', $ENV{'PATH_INFO'}) ) ; # seed with URL+flags $script_url= $PROXY_GROUP[ rand(scalar @PROXY_GROUP) ] ; } else { $script_url= $THIS_SCRIPT_URL ; } # Create $url_start and any needed variants: "$script_url/flags/" $url_start_inframe= url_start_by_flags($e_remove_cookies, $e_remove_scripts, $e_filter_ads, $e_hide_referer, $e_insert_entry_form, 1, '') ; $url_start_noframe= url_start_by_flags($e_remove_cookies, $e_remove_scripts, $e_filter_ads, $e_hide_referer, $e_insert_entry_form, 0, '') ; $url_start= $is_in_frame ? $url_start_inframe : $url_start_noframe ; # If there's no $encoded_URL, then start a browsing session. &show_start_form() if $encoded_URL eq '' ; # Decode the URL. $URL= &wrap_proxy_decode($encoded_URL) ; # Set the query string correctly, from $ENV{QUERY_STRING} and what's already # in $URL. # The query string may exist either within the encoded URL or in the containing # URL, as $ENV{QUERY_STRING}. If the former, then the query string was # (definitely?) in a referenced URL, while the latter most likely implies a # GET form input. # With Flash apps adding e.g. "?range=100-1000" to proxified URLs, both # query strings may be valid, so append $ENV{'QUERY_STRING'} to the end # of the URL appropriately. # Note that Netscape does not pass any query string data that is part of the # URL in the
attribute, which is probably correct behaviour. # For this program to act exactly the same, it would need to strip the # query string when updating all URLs, way below. $URL.= ($URL=~ /\?/ ? '&' : '?') . $ENV{'QUERY_STRING'} if $ENV{'QUERY_STRING'} ne '' ; # Parse the URL, using a regex modelled from the one in RFC 2396 (URI syntax), # appendix B. # This assumes a hierarchical scheme; it won't work for e.g. mailto: # "authority" is the combination of host, port, and possibly other info. # Note that $path here will also contain any query component; it's more like # the request URI. # Note that $URL is guaranteed to be an absolute URL with no "#" fragment, # though this does little error-checking. Note also that the old ";" # parameters are now included in the path component. ($scheme, $authority, $path)= ($URL=~ m#^([\w+.-]+)://([^/?]*)(.*)$#i) ; $scheme= lc($scheme) ; $path= "/$path" if $path!~ m#^/# ; # if path is '' or contains only query # If so configured, handle session ID cookies. # This all has to be done before calling xproxy() below, because some is # used for cookie management. if ($USE_DB_FOR_COOKIES) { # Attempt to get session ID cookies from HTTP_COOKIE . # $session_id may not exist (it's a session cookie only), so accept # either 2 or 3 session cookies. ($scookie_names, $session_usage, $session_id_persistent, $session_id)= get_session_cookies() ; # If we didn't get any session ID cookies, generate non-colliding cookie names. if (!@$scookie_names) { do { $scookie_names= [ random_string(1,3), random_string(1,3), random_string(1,3) ] ; } until ($scookie_names->[0] ne $scookie_names->[1] and $scookie_names->[0] ne $scookie_names->[2] and $scookie_names->[1] ne $scookie_names->[2]) ; @$scookie_names= sort @$scookie_names ; } my $secure_clause= $RUNNING_ON_SSL_SERVER ? ' secure;' : '' ; connect_to_db() ; # Having this string in the USER_AGENT always indicates a mobile browser, # I think (unless of course the user has changed it). 
# This only needs to be set when creating sessions, since sessions are # unique per browser. my $is_mobile= $ENV{HTTP_USER_AGENT}=~ /Mobi/ ? 1 : 0 ; # Now that we're using a database, we need session IDs. 20 random alphanumeric # characters means one collision in roughly 10^18 simultaneous uses. # One session ID is itself a session-length cookie, and is used to store # session cookies and anything else we need to expire when the session ends; # another cookie is persistent, and is used to store all persistent cookies; # another cookie is to measure unique users by month, and has no cookies # associated with it but lasts two months. # verify_session() verifies that the session exists and hasn't expired. if ($session_usage=~ /^\w{20,50}$/ and verify_session($session_usage)) { update_session_record($session_usage, $lang) ; } else { $session_usage= random_string(20,50) ; create_session_record($session_usage, 1, $is_mobile, $lang) ; } if ($session_id_persistent=~ /^\w{20,50}$/ and verify_session($session_id_persistent)) { update_session_record($session_id_persistent, $lang) ; } else { $session_id_persistent= random_string(20, 50) ; create_session_record($session_id_persistent, 0, $is_mobile, $lang) ; } # Set usage and persistent session ID cookies with every response. # The usage session ID lasts 2 months after last use, so that we can count # unique users every month. # The persistent session ID lasts one hour after last use (should time be configurable?). # For cookies, the domain defaults to the origin server, i.e. this proxy server. $session_cookies= "Set-Cookie: $scookie_names->[0]=$session_usage; expires=" . &rfc1123_date($now+86400*62, 1) . "; path=$ENV_SCRIPT_NAME/;$secure_clause HttpOnly\015\012" ; $session_cookies.= "Set-Cookie: $scookie_names->[1]=$session_id_persistent; expires=" . &rfc1123_date($now+3600, 1) . "; path=$ENV_SCRIPT_NAME/;$secure_clause HttpOnly\015\012" ; # Create and return non-persistent session ID cookie, if needed. 
if ($session_id=~ /^\w{20,50}$/ and verify_session($session_id)) { update_session_record($session_id, $lang) ; } else { # If we didn't get the session cookie back, generate a new one whose # name sorts after the other names. # Hacky and sometimes inefficient. # Note that there will always be at least one possible string gt $scookie_names->[1], # since it was originally sorted as before $scookie_names->[2] . $scookie_names->[2]= random_string(1,3) until $scookie_names->[2] gt $scookie_names->[1] ; $session_id= random_string(20, 50) ; $session_cookies.= "Set-Cookie: $scookie_names->[2]=$session_id; " . "path=$ENV_SCRIPT_NAME/;$secure_clause HttpOnly\015\012" ; create_session_record($session_id, 0, $is_mobile, $lang) ; } } # Magic here-- if $URL uses special scheme "x-proxy", immediately call the # general-purpose xproxy() routine. &xproxy($URL) if $scheme eq 'x-proxy' ; &return_iframe_wrapper_doc($URL) if !$is_in_frame ; # Generate the expected (?) network error for these URLs. goto ONE_RUN_EXIT if $scheme eq 'chrome-extension' ; # Set $is_html if $path (minus query) ends in .htm or .html . # MSIE has a bug (and privacy hole) whereby URLs with QUERY_STRING ending # in .htm or .html are mistakenly treated as HTML, and thus could have # untranslated links, # or tags. This is most likely what the HTML author expects # anyway, though it violates the HTML spec. In this script, we should # over-proxify rather than under-proxify, so we'll end those blocks on # those end tags as browsers (erroneously) do. # Worse, Konqueror allows the string "" inside JS literal strings, # i.e. doesn't end the script block on them. Netscape does end the block # there, and both browsers end style blocks on embedded strings. # Because it's a given that we can't anonymize scripts completely, but # we do want to anonymize HTML completely, we'd rather accidentally # treat script content as HTML than the other way around. So err on # ending the " regardless of whether it's in a string. 
# (We'd end on " blocks, conditional comments, # intrinsic event attributes ("on___" attributes), script macros, and # the MSIE-specific "dynamic properties". These can be removed or # proxified, depending on the settings of $scripts_are_banned_here and # $PROXIFY_SCRIPTS. # Script content can also exist elsewhere when its MIME type is explicitly # given (for example, in a ') ; # Used for