"""
Pseudonymization for access data.

To pseudonymize, we use something like a one-time pad (OTP): 16 bytes, of
which the first 8 are prepended and the last 8 are appended to whatever data
is passed to pseudo before computing a hash.  That hash is the pseudonym of
the month for the respective IP address.

The current OTP is held in a file, etc/otp.  It is changed every month from
data in /dev/random.  Ideally, it would not be backed up...

It is generated by calling this script.  This should be done *before* the
first run of loganalyze.py in a given month.
"""

import hashlib

from gavo import api

# What api.getRD returns for "logs/logs"; in this module it is only used to
# turn "etc/otp" into an absolute path via getAbsPath.  Note that this lookup
# runs at import time.
RD = api.getRD("logs/logs")

class Pseudonymizer(object):
	"""A producer of pseudonyms that are salted with the current OTP.

	On construction, the 16 OTP bytes are read from the file that
	RD.getAbsPath("etc/otp") points to.  The first 8 bytes serve as a prefix
	and the last 8 bytes as a suffix of whatever is hashed in pseudo.

	A ValueError is raised if the OTP file holds fewer than 16 bytes, since a
	short OTP would silently weaken (or, if empty, remove) the salting.
	"""
	otpLength = 16

	def __init__(self):
		with open(RD.getAbsPath("etc/otp"), "rb") as f:
			self.otp = f.read(self.otpLength)
		if len(self.otp)!=self.otpLength:
			raise ValueError("OTP file must contain at least %d bytes, found %d"%(
				self.otpLength, len(self.otp)))

	def pseudo(self, val):
		"""returns an integer pseudonym for the string val.

		val is encoded as UTF-8 (with backslashreplace, so lone surrogates
		do not raise), wrapped between the two OTP halves, and hashed.  The
		result is a signed 64-bit integer, i.e., it stays within the range
		the builtin hash() yields on 64-bit platforms.

		The same val and OTP always give the same pseudonym, regardless of
		the interpreter process computing it.
		"""
		salted = (self.otp[:8]
			+val.encode("utf-8", "backslashreplace")
			+self.otp[8:])
		# Do not use the builtin hash() here: for bytes, it is salted
		# per interpreter process (unless PYTHONHASHSEED is fixed), so
		# pseudonyms would not match between separate runs within a month.
		return int.from_bytes(
			hashlib.sha256(salted).digest()[:8], "big", signed=True)


def _testCollisionsConsec(nMax=2**30):
	"""Empirically looks for pseudonym collisions on consecutive IP addresses.

	The integers 0..nMax-1 are rendered as dotted quads and passed through
	Pseudonymizer.pseudo.  Each pseudonym that has been seen before is
	reported on stdout; a progress counter is written to stderr every 500
	addresses.

	The collision rate is printed when the run completes or when it is
	interrupted with a KeyboardInterrupt.

	All pseudonyms seen are kept in a set, so memory use grows with the
	number of addresses tested.
	"""
	import sys

	def makeIP(anInt):
		# dotted-quad rendering of the lower 32 bits of anInt
		return "%d.%d.%d.%d"%((anInt>>24)&0xff, (anInt>>16)&0xff,
			(anInt>>8)&0xff, (anInt)&0xff)

	hashes = set()
	p = Pseudonymizer()
	nColl = 0
	# Count completed iterations separately from the loop index so the
	# rate can be computed even when we are interrupted right away.
	nTested = 0
	try:
		for i in range(nMax):
			r = p.pseudo(makeIP(i))
			if r in hashes:
				print("Collision after %d numbers"%i)
				nColl += 1
			hashes.add(r)
			nTested += 1
			if not i%500:
				sys.stderr.write("%010d\r"%i)
				sys.stderr.flush()
	except KeyboardInterrupt:
		# An interrupt just ends the run early; we still report below.
		pass

	if nTested:
		print("Collision rate:", nColl/float(nTested))
	else:
		print("Collision rate: no addresses tested")



def makeOTP():
	"""Writes a fresh 16-byte OTP, read from /dev/random, to etc/otp.

	The target is whatever RD.getAbsPath("etc/otp") points to; any previous
	content of that file is overwritten.

	An IOError is raised (and the existing OTP is left untouched) if
	/dev/random yields fewer than 16 bytes.
	"""
	# Use a context manager so the handle is released even if read() raises.
	with open("/dev/random", "rb") as randomSrc:
		randomData = randomSrc.read(16)

	# Never replace the OTP with a truncated one.
	if len(randomData)!=16:
		raise IOError("Could only read %d of 16 bytes from /dev/random"%
			len(randomData))

	with open(RD.getAbsPath("etc/otp"), "wb") as f:
		f.write(randomData)


# Running this module as a script replaces the current OTP with a fresh one
# (see the module docstring for when to do that).
if __name__=="__main__":
	makeOTP()
