"""
A custom grammar to parse and clean the logs left by the twistd web server.
"""

import os
import re
import shutil
import sys
import time

from gavo import base
from gavo.grammars.customgrammar import CustomRowIterator
from gavo.web import root

from gavo import api

sys.path.append(os.path.join(base.caches.getRD("logs/logs").resdir, "bin"))

import pseudonymize


# Renderer names (the last path segment of a request URI, minus any query
# string); RowIterator only assigns a service to requests ending in one
# of these.
serviceRenderers = set([
	"form",
	"custom",
	"siap.xml",
	"scs.xml",
	"soap",
	"tap",
	"qp",
	"dlget",
	"dlmeta",
])

# Client addresses whose requests are never logged; HostInfos seeds its
# set of robot hosts from this list.
ignoredIPs = [
	"129.206.110.59",
	"147.142.207.22",
	"129.206.119.232",
	"127.0.0.1",
	"129.206.110.245",
	"129.206.110.156",
	"129.206.110.85",
]

# Requests whose URI matches this pattern anywhere are skipped on import.
ignorePattern = re.compile(
	r"\.php"     # most likely attack bots
	"|regrss/"   # too many of those
)


class HostInfos(object):
	"""is a container for data we know about certain hosts.

	Its main purpose is to decide, via hostIsRobot, whether a log row comes
	from a host we do not want to count.

	Class attributes:

	* robotClassBs -- first two octets of addresses always treated as robots.
	* robotSigs -- substrings that mark a user agent string as a robot.

	Instance attributes:

	* robotsHosts -- a set of IPs treated as robots; it starts out as
	  ignoredIPs and grows by every host seen requesting /robots.txt.
	"""
	robotClassBs = set(["11.104", "74.6", "65.55", "67.195",
		"66.249", "202.160"])
	# Every signature needs its own trailing comma: adjacent string literals
	# are silently concatenated into a single (never-matching) signature.
	robotSigs = ["Googlebot", "NOLOG", "nagios", "PubSub", "kanla",
		"Haosou", "Baiduspider", "bingbot", "SocialRankIOBot", "PaperLiBot",
		"Yahoo! Slurp", "AhrefsBot", "DaCHS regression"]

	def __init__(self):
		self.robotsHosts = set(ignoredIPs)

	def hostIsRobot(self, logRow):
		"""returns True if logRow looks like it was caused by a robot.

		logRow is a mapping with the keys ip, uri, and agent (agent may be
		None or empty).

		As a side effect, a host requesting /robots.txt is remembered in
		robotsHosts, so its later requests are classified as robot requests,
		too.
		"""
		ip = logRow["ip"]
		classB = ".".join(ip.split(".")[:2])
		if classB in self.robotClassBs:
			return True

		# Whoever asks for robots.txt is a robot from now on (including
		# for this very request).
		if logRow["uri"]=="/robots.txt":
			self.robotsHosts.add(ip)
		if ip in self.robotsHosts:
			return True

		agent = logRow["agent"]
		if not agent:
			return False
		return any(sig in agent for sig in self.robotSigs)


class DataPack(object):
	"""The state shared by everything taking part in one import session.

	Attributes:

	* hostInfos -- a HostInfos instance, used to tell which hosts we do
	  not want to log.
	* pseudo -- the pseudonymizer (a pseudonymize.Pseudonymizer).
	* thisYear, thisMonth -- year and month of the local time at which the
	  DataPack was created.
	* destPath -- the directory "delete" below the resdir of the grammar's
	  RD; it is created on construction if it does not exist yet.
	"""
	def __init__(self, grammar):
		self.hostInfos = HostInfos()
		self.pseudo = pseudonymize.Pseudonymizer()

		# the first two items of a struct_time are year and month
		now = time.localtime()
		self.thisYear, self.thisMonth = now[0], now[1]

		deleteDir = os.path.join(grammar.rd.resdir, "delete")
		self.destPath = deleteDir
		if not os.path.exists(deleteDir):
			os.makedirs(deleteDir)


def makeDataPack(grammar):
	"""returns a new DataPack built for grammar.

	RowIterator reads the result as self.grammar.dataPack; presumably the
	custom grammar machinery calls this function and stores what it returns
	there -- verify against the DaCHS custom grammar documentation.
	"""
	pack = DataPack(grammar)
	return pack


# Matches one request line of the web log.  The named groups (ip, timestamp,
# request, status, length, agent) become the keys of the raw row; all other
# fields are matched but thrown away.
_logPat = re.compile(
	"(?:[^ ]+ ){3}"                  # three leading space-terminated fields
	"(?P<ip>[^ ]+) [^ ]+ [^ ]+"      # client address plus two skipped fields
	r' \[(?P<timestamp>[^]]+)\]'     # bracketed timestamp
	r' "(?P<request>[^"]+)"'         # quoted request line
	r' (?P<status>\d+) '             # numeric status
	r'(?P<length>[^ ]*) [^ ]* '      # length, then one skipped field
	r'"(?P<agent>[^"]*)"')           # quoted user agent, may be empty

class RowIterator(CustomRowIterator):
	"""A row iterator yielding one row per request worth logging.

	The source token is the path of a web log file.  Lines not matching
	_logPat, requests with methods outside of _validMethods, URIs matching
	ignorePattern, and requests attributed to robots are dropped.

	Class attributes:

	* keepInputs -- if true, the source file is left alone; otherwise it
	  is moved into the data pack's destPath once all its lines have been
	  read.
	* _validMethods -- the request methods that are imported.
	"""
	keepInputs = False

	_validMethods = set(["GET", "POST", "HEAD", "PUT", "COPY"])

	def _processMatch(self, row):
		"""adds the derived columns to row and returns it.

		row is the groupdict of a _logPat match.  The keys added are method,
		uri (cut to 200 characters), service, renderer, acc_month, acc_year,
		and handle (the pseudonymized ip).  service remains None unless the
		renderer is one of serviceRenderers.
		"""
		dp = self.grammar.dataPack
		try:
			method, uri, _ = row["request"].split(" ")
		except ValueError:
			# These happen when the request has lost memory of what it's done
			# and it's writing something like "no uri yet".
			method, uri = "GET", "UNKNOWN_URI"

		service = None
		# uri[1:] drops the leading slash; str.split always returns at least
		# one element, so there is no need to guard against an empty list.
		parts = uri[1:].split("/")
		if parts[0] in root.ArchiveService.redirects:
			parts = root.ArchiveService.redirects[parts[0]].split("/")
		elif parts[0] in root.ArchiveService.mappings:
			parts = root.ArchiveService.mappings[parts[0]]+parts[1:]

		# The renderer is the last path segment without the query string.
		# It must stay that way for service requests, too; overwriting it
		# with parts[-1] would drag the query string into the renderer column.
		renderer = parts[-1].split("?")[0]
		if renderer in serviceRenderers:
			service = parts[0]
		if service=="__system__" and len(parts)>1:
			service = parts[1]

		row["method"], row["uri"] = method, uri[:200]
		row["service"], row["renderer"] = service, renderer
		row["acc_month"], row["acc_year"] = dp.thisMonth, dp.thisYear
		row["handle"] = dp.pseudo.pseudo(row["ip"])
		return row

	def _saveForEmergencies(self):
		"""moves the source file to the first unused web.log.<n> in the
		data pack's destPath, unless keepInputs is set.
		"""
		if self.keepInputs:
			return

		i = 0
		destPath = self.grammar.dataPack.destPath
		while True:
			destName = os.path.join(destPath, "web.log.%d"%i)
			if not os.path.exists(destName):
				break
			i += 1
		shutil.move(self.sourceToken, destName)

	def _iterRows(self):
		"""yields the processed rows for all loggable lines of the source.

		The source file is only moved away when the iteration has run to
		the end of the file.
		"""
		self.lineNumber = 0
		hostInfos = self.grammar.dataPack.hostInfos
		with open(self.sourceToken) as f:
			# lineNumber is updated per line so getLocator can report where
			# we are; it is zero-based.
			for self.lineNumber, line in enumerate(f):
				mat = _logPat.match(line)
				if not mat:
					continue
				row = self._processMatch(mat.groupdict())
				if row["method"] not in self._validMethods:
					continue
				if ignorePattern.search(row["uri"]):
					continue
				if not hostInfos.hostIsRobot(row):
					yield row

		# The stuff saved here is removed by a cron job in the logs/logs RD
		self._saveForEmergencies()

	def getLocator(self):
		"""returns a string saying which (zero-based) line was read last.
		"""
		return "line %s"%self.lineNumber

if __name__=="__main__":
	# NOTE(review): DataPack.__init__ reads grammar.rd.resdir, so passing
	# None here ends in an AttributeError.  This looks like leftover
	# debugging code -- confirm before relying on it.
	from gavo import api
	dp = makeDataPack(None)
