"""
A custom grammar to parse and clean the logs left by a DaCHS server
with logFormat=combined.

This simply sticks all archived web.log-s into the database and removes
them.  That's also why its source token isn't a file name, because the
server might rotate files away when we're still dealing with them.
We don't want to do any locking either, and hence we're just fetching the
files as they come.

Extra complication: We want to pseudonymise IP addresses by-month into
handles.  The salt rotation is effected when you directly execute this
file (which is done from a cron job in accesslogs/q).
"""

import glob
import io
import os
import re
import time

from gavo import base
from gavo import rsc
from gavo import utils
from gavo.grammars.customgrammar import CustomRowIterator

from gavo import api

# sanity limit: don't try more than that times in a row to empty
# out the logs dir.  On a very busy site you may need to raise this.
SANITY_LIMIT = 200

# The RD-relative path of the per-month pad for IP address shrouding
OTP_PATH = "otp.do-not-backup"

# URIs matching this pattern are dropped before ingestion; note that the
# two adjacent string literals are concatenated into one alternation.
ignorePattern = re.compile(r"\.php" # most likely attack bots
	"|regrss/"      # too many of those
)


class Pseudonymizer(object):
	"""maps strings (IP addresses) to opaque handles using a secret pad.

	The 16-byte pad is kept in the resdir-relative file otp_path; it is
	created on first use and can be rotated by calling regenerate()
	(done monthly from a cron job; see the module docstring).
	"""
	def __init__(self, rd, otp_path):
		# Resolve the resdir-relative path once, so that reading (here)
		# and writing (regenerate) operate on the same file; previously,
		# regenerate wrote to a cwd-relative path that was never read back.
		self.otp_path = rd.getAbsPath(otp_path)
		try:
			with open(self.otp_path, "rb") as f:
				self.otp = f.read(16)
		except FileNotFoundError:
			# OTP doesn't exist yet, generate one
			self.otp = self.regenerate()
	
	def pseudo(self, val):
		"""returns an int handle for val, salted with the current pad.

		NOTE(review): python's hash() is salted per process (PYTHONHASHSEED),
		so handles are only stable within one import session -- confirm this
		is acceptable for the by-month aggregation.
		"""
		return hash(self.otp[:8]+val.encode("utf-8", "backslashreplace"
			)+self.otp[8:])
	
	def regenerate(self):
		"""writes a fresh 16-byte random pad to otp_path and returns it.

		os.urandom is used rather than reading /dev/random directly; it is
		the recommended CSPRNG source and cannot block.
		"""
		randomData = os.urandom(16)
		with open(self.otp_path, "wb") as f:
			f.write(randomData)
		return randomData


# Default content for robots-sigs.txt: user agent substrings marking a
# client as a robot, one per line (see HostInfos below).
_DEFAULT_ROBOT_SIGS = """Googlebot
NOLOG
nagios
PubSub
kanla
Haosou
Baiduspider
bingbot
SocialRankIOBot
PaperLiBot
Yahoo! Slurp
AhrefsBot
IVOA-test
"""

# Default content for robots-hosts.txt: IPs treated as robots and hence
# never logged (here: localhost in v4 and v6).
_DEFAULT_IGNORED_IPS = """127.0.0.1
::1
"""

def readWithDefault(rd, srcName, defaultContent):
	"""returns the content of srcName (resdir-relative),
	creating it if possible with defaultContent.

	Errors other than the file not existing are propagated.
	"""
	path = rd.getAbsPath(srcName)
	try:
		with open(path, "r", encoding="utf-8") as f:
			return f.read()
	except FileNotFoundError:
		# The file doesn't exist: try persisting the default so operators
		# can edit it later, and return the default either way.
		try:
			with open(path, "w", encoding="utf-8") as f:
				f.write(defaultContent)
		except OSError:
			# don't worry if we can't write to the resdir, just carry on.
			pass
		return defaultContent
	

class HostInfos:
	"""a container for data we know about certain hosts.

	This keeps a set of IPs considered robots (robotsHosts) and a list
	of user agent substrings identifying robots (robotsSigs), both
	seeded from editable resdir files with sensible defaults.
	"""
	def __init__(self, rd):
		# iterSimpleText yields (lineNumber, text) pairs; we only keep the text.
		self.robotsHosts = set(l[1] for l in
			utils.iterSimpleText(
				io.StringIO(
					readWithDefault(rd, "robots-hosts.txt", _DEFAULT_IGNORED_IPS))))
		self.robotsSigs = [sig for ln, sig in
			utils.iterSimpleText(
				io.StringIO(
					readWithDefault(rd, "robots-sigs.txt", _DEFAULT_ROBOT_SIGS)))]

	def hostIsRobot(self, logRow):
		"""returns True if logRow looks like it comes from a robot.

		As a side effect, any IP retrieving /robots.txt is remembered as
		a robot for the rest of the session.
		"""
		if logRow["uri"]=="/robots.txt":
			self.robotsHosts.add(logRow["ip"])
		if logRow["ip"] in self.robotsHosts:
			return True
		if logRow["agent"]:
			# Fixed: this used to iterate the (always empty) class attribute
			# robotSigs, so agent-signature matching never triggered.
			for sig in self.robotsSigs:
				if sig in logRow["agent"]:
					return True
		return False


class DataPack(object):
	"""A collection of data shared for an entire import session.

	In particular, this contains the HostInfos noting which IPs we do
	not want to log, and the pseudonymizer.
	"""
	def __init__(self, grammar):
		rd = grammar.rd
		self.hostInfos = HostInfos(rd)
		self.pseudo = Pseudonymizer(rd, OTP_PATH)
		now = time.localtime()
		self.thisYear, self.thisMonth = now[0], now[1]

		# GeoIP is optional; without it, all origins come out as None.
		try:
			import GeoIP
		except ImportError:
			self.toCountry = lambda ip: None
		else:
			self.toCountry = GeoIP.new(
				GeoIP.GEOIP_MEMORY_CACHE).country_code_by_addr


def makeDataPack(grammar):
	"""returns a DataPack for grammar.

	This presumably is the hook the custom grammar machinery calls to
	create per-session state (see CustomRowIterator docs to confirm).
	"""
	return DataPack(grammar)


# Matches one line of the combined log format; the first three
# space-separated fields are skipped (presumably a timestamp prefix
# written before the client IP -- verify against an actual web.log).
# Named groups: ip, timestamp, request, status, length, agent.
_logPat = re.compile("(?:[^ ]+ ){3}(?P<ip>[^ ]+) [^ ]+ [^ ]+"
	r' \[(?P<timestamp>[^]]+)\] "(?P<request>[^"]+)" (?P<status>\d+) '
	r'(?P<length>[^ ]*) [^ ]* "(?P<agent>[^"]*)"')

class RowIterator(CustomRowIterator):
	"""iterates over rawdicts parsed from rotated web.log files in logDir.

	Files are processed one by one and deleted after their rows have been
	ingested; that is why the grammar's source token is not a file name
	(see module docstring).
	"""
	# locator state, updated while parsing (see getLocator)
	curFile, lineNumber = "<not parsing yet>", "<not parsing yet>"

	# requests with any other HTTP method are discarded
	_validMethods = set(["GET", "POST", "HEAD", "PUT", "COPY"])

	def _processMatch(self, row):
		"""adds derived keys (method, uri, acc_month, acc_year, handle,
		origin) to a groupdict row from _logPat and returns it.
		"""
		dp = self.grammar.dataPack
		try:
			method, uri, _ = row["request"].split(" ")
		except ValueError:
			# These happen when the request has lost memory of what it's done
			# and it's writing something like "no uri yet".
			method, uri = "GET", "UNKNOWN_URI"
		# truncate over-long URIs (presumably to fit a DB column -- confirm)
		row["method"], row["uri"] = method, uri[:200]
		row["acc_month"], row["acc_year"] = dp.thisMonth, dp.thisYear
		# store a pseudonymised per-month handle rather than the raw IP
		row["handle"] = dp.pseudo.pseudo(row["ip"])
		row["origin"] = dp.toCountry(row["ip"])
		return row

	def _iterForSource(self, srcFile, srcName):
		"""yields rawdicts from the twisted combined log in srcName.

		Bad records are silently ignored.
		"""
		# NOTE(review): enumerate starts at 0, so getLocator reports
		# 0-based line numbers -- confirm that is intended.
		self.lineNumber, self.curFile = 0, srcName

		for self.lineNumber, ln in enumerate(srcFile):
			# logs should be UTF-8, but fall back to latin-1 (which cannot
			# fail) for the occasional junk line
			try:
				ln = ln.decode("utf-8")
			except UnicodeDecodeError:
				ln = ln.decode("iso-8859-1")

			mat = _logPat.match(ln)
			if not mat:
				continue
			row = self._processMatch(mat.groupdict())
			if row["method"] not in self._validMethods:
				continue
			if ignorePattern.search(row["uri"]):
				continue
			if not self.grammar.dataPack.hostInfos.hostIsRobot(row):
				yield row

		# make sure all rows are written before the source file goes away
		yield rsc.FLUSH
		# TODO: We probably want to be able to yield.rsc.COMMIT, too.
		utils.stealVar("data").runScripts(
			"sourceDone", sourceToken=srcName)
		os.unlink(srcName)

	def _iterRows(self):
		"""yields rawdicts from all rotated web.log.N files in logDir,
		removing each file after it has been ingested.
		"""
		srcDir = base.getConfig("logDir")
		# bounded loop rather than while True: see SANITY_LIMIT above
		for i in range(SANITY_LIMIT):
			matchesLeft = glob.glob(os.path.join(srcDir, "web.log.[0-9]*"))
			if not matchesLeft:
				break
			try:
				# pop() picks an arbitrary (glob-ordered) file; order does
				# not matter since every file is processed eventually
				nextName = matchesLeft.pop()
				with open(nextName, "rb") as f:
					yield from self._iterForSource(f, nextName)
			except IOError as ex:
				if ex.errno==2:
					# file was rotated away while we were preparing.  Just try again.
					continue
				raise
	def getLocator(self):
		"""returns a human-readable position (file, line) for error messages."""
		return f"{self.curFile}, {self.lineNumber}"


if __name__=="__main__":
	# Rotate the IP-shrouding pad; run monthly from a cron job (see module
	# docstring) so handles cannot be linked across months.
	p = Pseudonymizer(api.getRD("accesslogs/q"), OTP_PATH)
	p.regenerate()
