GoogleBot RSS feed script

The Python script

This python script will go through your apache access log and make an rss feed of the latest crawls by GoogleBot, or any other string you want to keep an eye on. Change the maximum number of entries displayed to suit your needs. The time_adjust variable will add the amount in hours to the displayed log time. This is useful if the server is not in your timezone. It might be a good idea to password protect this feed, you probably don't want everyone to see it.

python script: googlebot.rss

#!/usr/bin/python

import os, cgi, time, re, string
#import cgitb; cgitb.enable()

# Location of the apache access log file on server
logfile = '/var/log/httpd/access_log'

#host without slashes or http:// (will only be used if environment var is not available
host = 'www.floatinginspace.za.org'

# Maximum number of entries
max = 20

# Case insensitive string to filter for in logs
search_string = 'googlebot/'

# adjust time from GMT 
time_adjust = '2'

RSS_TEMPLATE = """<?xml version="1.0" encoding="utf-8"?>
<?xml-stylesheet type="text/css" href="http://www.floatinginspace.za.org/rss.css" ?>
<rss version="2.0">
    <channel>
        <title>GoogleBot log entries for %(HOST)s</title>
        <link>http://%(HOST)s/</link>
        <description>Latest crawls by GoogleBot</description>
        <lastBuildDate>%(DATE)s</lastBuildDate>
        <language>en</language>
%(ITEMS)s

    </channel>
</rss>"""

ITEM_TEMPLATE = '''
        <item>
            <title>%(URL)s on %(LOCALDATE)s</title>
            <link>http://%(HOST)s%(URL)s</link>
            <pubDate>%(DATE)s</pubDate>
            <description>%(DATA)s</description>
        </item>'''

LOG_TIME_FORMAT = "%d/%b/%Y:%H:%M:%S"

datereg = re.compile('\[([0-9]+/.../[0-9][0-9][0-9][0-9]:[0-9][0-9]:[0-9][0-9]:[0-9][0-9]) (.[0-9][0-9][0-9][0-9])\]')
logreg = re.compile('"(GET|HEAD) (\S+) \S+" \d+')

def log2local(str, adj):
    apachedate = datereg.search(str)
    dated = apachedate.group(1)
    timezoned = apachedate.group(2)

    logtime = time.strptime(dated, LOG_TIME_FORMAT) #local time in timetuple
    ucttime = time.mktime(logtime) #utc time in seconds

    heretz = adj*60*60 #the timezone you want eg. SAST = +2
    mytime = ucttime + heretz

    newdate = '[%s %+05d]' %(time.strftime(LOG_TIME_FORMAT, time.gmtime(mytime)),adj*100)
    newdatestr = datereg.sub(newdate, str)
    return newdatestr

def getlogentrydate(str):
    apachedate = datereg.search(str)
    dated = apachedate.group(1)
    timezoned = apachedate.group(2)

    logtime = time.strptime(dated, LOG_TIME_FORMAT) #local time in timetuple
    ucttime = time.mktime(logtime) #utc time in seconds
    return float(ucttime)

def getlogurl(str):
    logurl = logreg.search(str).group(2)
    return logurl

def print_rss (items, date):
    if os.getenv('HTTP_USER_AGENT', 'N/A').find('Mozilla') >= 0:
        print "content-type: application/xml\nCache-Control: no-cache\n"
    else:
        print "content-type: application/rss+xml\nCache-Control: no-cache\n"
    print RSS_TEMPLATE % {
        'SCRIPT_NAME':os.getenv('SCRIPT_NAME',' '),
        'DATE':date, 'ITEMS':items,
        'HOST':os.getenv('HTTP_HOST',host)
        }

################################ start of main program ################################

str = ''
items = []
latest = 0.0
output = []

form = cgi.FieldStorage()
arg = form.getfirst('search', search_string)
adjt = form.getfirst('adjust', time_adjust)

norm = string.maketrans('', '') #builds list of all characters
# remove shell control characters just in case 
arg = string.translate(arg, norm, r'\;&|()')
adjt = string.translate(adjt, norm, r'\;&|()')

#first of two methods to get logfile
lfile = open(logfile,'r')
loglines = lfile.readlines()
lfile.close()
for line in loglines:
    if line.lower().find(arg) >= 0:
        output.append(line)

#second method
#cmd = 'grep -i %s %s 2>&1' %(arg, logfile)
#output = os.popen(cmd).readlines()

for x in output:
    entrytime = getlogentrydate(x)
    if entrytime > latest:
        latest = entrytime
    items.append(  ITEM_TEMPLATE %{
            'DATA':cgi.escape(log2local(x,int(adjt)).rstrip()),
            'DATE':time.strftime('%a, %d %b %Y %H:%M:%S GMT',time.gmtime(entrytime)),
            'LOCALDATE':time.strftime('%a, %d %b %Y %H:%M:%S ',time.gmtime(entrytime+60*60*int(adjt))) + '%+05d' %(int(adjt)*100),
            'HOST':os.getenv('HTTP_HOST',host),
            'URL':getlogurl(x) }  )

items.reverse()
print_rss( ''.join(items[:max]) , time.strftime('%a, %d %b %Y %H:%M:%S GMT',time.gmtime(latest))   )