<?php
#############################################################################
# A simple robots.txt honeypot system.
#
# Copyright March 2008 by Christian J. Robinson <infynity at onewest dot net>
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
# more details:
# HTML: http://www.gnu.org/copyleft/gpl.html
# Text: http://www.gnu.org/copyleft/gpl.txt
#############################################################################
# $Id: robots_txt_abuse.php,v 1.4 2008/04/02 09:25:55 infynity Exp $
#
# $Log: robots_txt_abuse.php,v $
# Revision 1.4 2008/04/02 09:25:55 infynity
# *** empty log message ***
#
# Revision 1.3 2008/03/31 13:35:14 infynity
# Resolve the IP to a hostname for the email
#
# Revision 1.2 2008/03/31 12:45:33 infynity
# Tweak usage information
#
#############################################################################
### Configuration:
# Path of the log file; one offending IP address is appended to it per
# request, and it is read back to count offenses.  This file /must/ be
# writable by the Apache process:
$abusefile = 'private/robots_abuse';
# Recipient of the notification emails.  This variable must be set, but it can
# be set to a blank value to suppress email notifications.  Keep it a simple
# quoted literal: the source viewer blanks the value by pattern-matching this
# assignment before displaying the script.
$email = '';
# This variable also must be set.  An IP may request the honeypot URIs this
# many times (the current request included) and still receive the warning
# message; from request number $grace + 1 onward the message changes to "you
# will now be blocked":
$grace = 2;
### END Configuration
/* Usage Information:
In order to make this functional, you need to put some URIs that you *NEVER*
reference *ANYWHERE* else into your site's /robots.txt file, such as:
# Catch underhanded crawling:
Disallow: /foobarbaz.html
Disallow: /bazbarfoo.php
Disallow: /fooquux/
Then you place lines like these in your .htaccess file in the same directory
as the robots.txt file:
# Catch underhanded crawling (abuse of robots.txt):
RewriteEngine on
Rewriterule ^foobarbaz /robots_txt_abuse.php
Rewriterule ^bazbarfoo /robots_txt_abuse.php
Rewriterule ^fooquux /robots_txt_abuse.php
Note that the patterns being matched are only partially complete, which will
work fine. Be sure you change "/robots_txt_abuse.php" to the actual location
of this script.
Once you decide to block a site (this script doesn't do it automatically,
despite the implication that it does in the messages it sends), you can place
a rule like this in the same .htaccess mentioned above:
# ...And block those that are repeat offenders:
RewriteCond %{REMOTE_ADDR} =111.111.111.111 [OR]
RewriteCond %{REMOTE_ADDR} =222.222.222.222 [OR]
RewriteCond %{REMOTE_ADDR} =333.333.333.333
RewriteCond %{REQUEST_URI} !/errors/
RewriteRule "" - [F,L]
(Be sure to change "111.111.111.111", etc. to the actual IPs you intend to
block.)
This will cause your Apache server to send a 403 (access forbidden) response
to every attempt to access any page by the indicated IP addresses. Note the
line that checks for REQUEST_URI not matching "/errors/"; you should change
this to whatever directory your error documents are in so you don't get a 403
error on the error document itself.
You can also block based on REMOTE_HOST, HTTP_USER_AGENT, etc. and use regular
expressions for matching. See:
http://httpd.apache.org/docs/mod/mod_rewrite.html#RewriteCond
Warning: Never use this script's filename as one of your honeypot lines--it
won't work.
*/
# Append one address to the abuse log, one address per line.
#
#   $addr      - the address to record (written verbatim, followed by "\n")
#   $abusefile - path of the log file; it is opened in append mode
#
# Returns nothing.  If the log cannot be opened the hit is silently dropped
# (apart from the warning fopen() itself raises) instead of passing FALSE on
# to fwrite()/fclose(), which is a fatal TypeError on PHP 8.
function log_addr($addr, $abusefile)
{
  $abuse = fopen($abusefile, 'a');
  if ($abuse === false)
    return;

  fwrite($abuse, $addr . "\n");
  fclose($abuse);
}
# Count how many lines of the abuse log are exactly equal to $addr.
#
#   $addr      - the address to look for
#   $abusefile - path of the log file (one address per line)
#
# Returns the number of matching lines; 0 if the log cannot be opened.
# Trailing whitespace (including the newline) is stripped from each line
# before it is compared.
function count_offenses($addr, $abusefile)
{
  $abuse = fopen($abusefile, 'r');
  if ($abuse === false)
    return 0;

  $count = 0;
  # Compare against FALSE explicitly: a line that is merely falsy (such as a
  # final "0" with no newline) must not end the loop early.
  while (($line = fgets($abuse, 1024)) !== false)
    if (rtrim($line) === $addr)
      ++$count;

  fclose($abuse);
  return $count;
}
# Mail a notification about a honeypot hit.
#
#   $who     - recipient address; an empty string disables the notification
#   $address - the offending IP address
#   $count   - how many times this IP has been seen
#   $grace   - the grace value; the message points out when $count exceeds it
#   $uri     - the URI that was requested
#
# Returns nothing.  The result of mail() is not checked.
function email_warn($who, $address, $count, $grace, $uri)
{
  if ($who === '') return;

  $headers = "From: nobody <nobody@dev.null>\n";
  $subject = 'Abuse of robots.txt';

  # gethostbyaddr() hands back the unmodified IP when the lookup fails and
  # FALSE on malformed input; only mention a hostname when we really got one,
  # so the message never contains an empty " ()".
  $hostname = gethostbyaddr($address);
  if (is_string($hostname) && $hostname !== '' && $hostname !== $address)
    $hostname = " ($hostname)";
  else
    $hostname = '';

  $body = "The IP $address$hostname has been observed to be accessing a file" .
    " that is *only* referenced to in your robots.txt.\n\n" .
    "This has happened from this IP $count time";
  if ($count != 1) $body .= 's';  # pluralize "time"
  if ($count > $grace)
    $body .= ", which *exceeds* the grace value of $grace";
  $body .= ".\n\nThe attempted access was:\n$uri\n";
  $body .= "\n\n*** THIS IS AN AUTOMATICALLY GENERATED MESSAGE -- DO NOT REPLY ***\n";
  $body = wordwrap($body);

  mail($who, $subject, $body, $headers);
}
# Send this script's own source, syntax-highlighted, as an HTML page and stop.
#
# The value assigned to $email in the configuration section is blanked out of
# the displayed source so the address is not published.  Reads the global
# $email and $_SERVER['SCRIPT_NAME']; never returns (calls exit).
function print_source()
{
  global $email;

  # __FILE__ is always this script, wherever it is installed; building the
  # path from getcwd() and SCRIPT_NAME only works from the document root.
  $file = file_get_contents(__FILE__);
  if ($file === false)
    $file = '';

  # preg_quote() with the delimiter escapes every character that is special
  # inside the pattern; quotemeta() misses several (e.g. "/", "{", "|").
  $file = preg_replace(
    '/(\$email\s*=\s*(["\']))'.preg_quote($email, '/').'(\2;)/',
    '\1\3',
    $file, 1
  );

  $title = htmlspecialchars($_SERVER['SCRIPT_NAME'], ENT_QUOTES, 'UTF-8');

  header("Content-Type: text/html; charset=UTF-8");
  print '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"'."\n".
    ' "http://www.w3.org/TR/html4/loose.dtd">'."\n";
  print "<HTML>\n<HEAD>\n".
    "<TITLE>Source of $title</TITLE>\n".
    "</HEAD>\n<BODY BGCOLOR=\"white\">\n";
  highlight_string($file);
  print "\n</BODY>\n</HTML>\n";
  exit;
}
# A request for this script under its own name (not via a rewritten honeypot
# URI) shows the source instead of logging anything; print_source() exits.
if ($_SERVER['REQUEST_URI'] === $_SERVER['SCRIPT_NAME'])
  print_source();

# REQUEST_URI comes straight from the client, so it must be escaped before it
# is echoed into the HTML page below (otherwise it is a reflected XSS hole).
# The charset matches the one this page is served with.
$request_uri = htmlspecialchars($_SERVER['REQUEST_URI'], ENT_QUOTES, 'ISO-8859-1');

$warning = <<<EOF
Your <A HREF="http://en.wikipedia.org/wiki/IP_address">IP address</A>
($_SERVER[REMOTE_ADDR]) has been logged. If this IP appears in the log more
than once, you may be subject to a permanent block.
EOF;

$blocked = <<<EOF
Your <A HREF="http://en.wikipedia.org/wiki/IP_address">IP address</A>
($_SERVER[REMOTE_ADDR]) has shown up in the log multiple times, which means
you've been repeatedly warned. You will now be permanently blocked.
EOF;

# Record this hit first, so that the count below includes it.
log_addr($_SERVER['REMOTE_ADDR'], $abusefile);
$count = count_offenses($_SERVER['REMOTE_ADDR'], $abusefile);

if ($count > $grace)
  $which = $blocked;
else
  $which = $warning;

# The mail is plain text, so the URI is passed along unescaped.
email_warn(
  $email,
  $_SERVER['REMOTE_ADDR'],
  $count,
  $grace,
  "http://$_SERVER[SERVER_NAME]$_SERVER[REQUEST_URI]"
);

header("Content-Type: text/html; charset=iso-8859-1");
echo <<<EOF
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<HTML>
<HEAD>
<TITLE>WARNING: You are abusing this site's robots.txt!</TITLE>
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=iso-8859-1">
</HEAD>
<BODY>
<H1><FONT COLOR="#DD0000">WARNING!</FONT> You are abusing this site's robots.txt!</H1>
<P>
The URI which you tried to access (<TT>$request_uri</TT>) is
referenced <I>only</I> in <A HREF="/robots.txt">this site</a>'s robots.txt
file. This means that you almost certainly found this page by following
that reference, which is a gross misuse of the purpose of
<A HREF="http://www.robotstxt.org/">robots.txt</A>.
</P>
<P>
$which
</P>
</BODY>
</HTML>\n
EOF;
# vim: set ts=2 sw=2 et: ?>