#!/usr/bin/python
################################################################################
# Copyright (c) 2005 Francois du Toit.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
################################################################################
import os, cgi, time, re, string
#import cgitb; cgitb.enable()
# Location of the apache access log file on server
logfile = '/var/log/httpd/access_log'
#host without slashes or http:// (will only be used if environment var is not available
host = 'www.floatinginspace.za.org'
# Maximum number of entries
max = 20
# Case insensitive string to filter for in logs
search_string = 'googlebot/'
# adjust time from GMT
time_adjust = '2'
RSS_TEMPLATE = """
GoogleBot log entries for %(HOST)s
http://%(HOST)s/
Latest crawls by GoogleBot%(DATE)sen
%(ITEMS)s
"""
ITEM_TEMPLATE = '''
%(URL)s on %(LOCALDATE)s
http://%(HOST)s%(URL)s
%(DATE)s%(DATA)s'''
LOG_TIME_FORMAT = "%d/%b/%Y:%H:%M:%S"
datereg = re.compile('\[([0-9]+/.../[0-9][0-9][0-9][0-9]:[0-9][0-9]:[0-9][0-9]:[0-9][0-9]) (.[0-9][0-9][0-9][0-9])\]')
logreg = re.compile('"(GET|HEAD) (\S+) \S+" \d+')
def log2local(str, adj):
apachedate = datereg.search(str)
dated = apachedate.group(1)
timezoned = apachedate.group(2)
logtime = time.strptime(dated, LOG_TIME_FORMAT) #local time in timetuple
ucttime = time.mktime(logtime) #utc time in seconds
heretz = adj*60*60 #the timezone you want eg. SAST = +2
mytime = ucttime + heretz
newdate = '[%s %+05d]' %(time.strftime(LOG_TIME_FORMAT, time.gmtime(mytime)),adj*100)
newdatestr = datereg.sub(newdate, str)
return newdatestr
def getlogentrydate(str):
apachedate = datereg.search(str)
dated = apachedate.group(1)
timezoned = apachedate.group(2)
logtime = time.strptime(dated, LOG_TIME_FORMAT) #local time in timetuple
ucttime = time.mktime(logtime) #utc time in seconds
return float(ucttime)
def getlogurl(str):
logurl = logreg.search(str).group(2)
return logurl
def print_rss (items, date):
if os.getenv('HTTP_USER_AGENT', 'N/A').find('Mozilla') >= 0:
print "content-type: application/xml\nCache-Control: no-cache\n"
else:
print "content-type: application/rss+xml\nCache-Control: no-cache\n"
print RSS_TEMPLATE % {
'SCRIPT_NAME':os.getenv('SCRIPT_NAME',' '),
'DATE':date, 'ITEMS':items,
'HOST':os.getenv('HTTP_HOST',host)
}
################################ start of main program ################################
str = ''
items = []
latest = 0.0
output = []
form = cgi.FieldStorage()
arg = form.getfirst('search', search_string)
adjt = form.getfirst('adjust', time_adjust)
norm = string.maketrans('', '') #builds list of all characters
# remove shell control characters just in case
arg = string.translate(arg, norm, r'\;&|()')
adjt = string.translate(adjt, norm, r'\;&|()')
#first of two methods to get logfile
lfile = open(logfile,'r')
loglines = lfile.readlines()
lfile.close()
for line in loglines:
if line.lower().find(arg) >= 0:
output.append(line)
#second method
#cmd = 'grep -i %s %s 2>&1' %(arg, logfile)
#output = os.popen(cmd).readlines()
for x in output:
entrytime = getlogentrydate(x)
if entrytime > latest:
latest = entrytime
items.append( ITEM_TEMPLATE %{
'DATA':cgi.escape(log2local(x,int(adjt)).rstrip()),
'DATE':time.strftime('%a, %d %b %Y %H:%M:%S GMT',time.gmtime(entrytime)),
'LOCALDATE':time.strftime('%a, %d %b %Y %H:%M:%S ',time.gmtime(entrytime+60*60*int(adjt))) + '%+05d' %(int(adjt)*100),
'HOST':os.getenv('HTTP_HOST',host),
'URL':getlogurl(x) } )
items.reverse()
print_rss( ''.join(items[:max]) , time.strftime('%a, %d %b %Y %H:%M:%S GMT',time.gmtime(latest)) )