"""
A one-time hack to copy catalog inputs not processed from arigfh to
the cats directory.

This assumes that the data found in /work/www-intranet/katkat/daten/notarigfh
in 2010 is in tmp/ and that the cats subdirs already have been created.

You also need the mapping file written by katkatgrammar.
"""

from __future__ import with_statement

import glob
import re
import os
import sys
import tarfile


def getMapping():
	"""returns a dictionary built from the tab-separated file res/mapping.

	For every data line, the third column becomes the key and the second
	column the value.  Lines starting with # and lines consisting only of
	whitespace are ignored.  A data line with fewer than three columns
	still raises an IndexError, as such a file is considered broken.
	"""
	mapping = {}
	# use a context manager so the file is closed deterministically
	with open("res/mapping") as f:
		for ln in f:
			if ln.startswith("#"):
				continue
			stripped = ln.strip()
			if not stripped:
				# blank lines carry no mapping; don't choke on them
				continue
			fields = stripped.split("\t")
			mapping[fields[2]] = fields[1]
	return mapping


def getData(tarName):
	"""returns the content of the single member of the tar archive tarName.

	The archive is opened with transparent compression detection ("r:*").

	An AssertionError is raised if the archive does not contain exactly
	one member, a ValueError if that member is not something data can be
	read from (e.g., a directory).  The archive is closed in either case.
	"""
	tf = tarfile.open(tarName, "r:*")
	try:
		members = tf.getmembers()
		# raise explicitly rather than using assert so that the check
		# is not lost when python runs with -O
		if len(members)!=1:
			raise AssertionError("Expected exactly one member in %s, found %d"%(
				tarName, len(members)))
		src = tf.extractfile(members[0])
		if src is None:
			# extractfile returns None for members that are neither
			# regular files nor links
			raise ValueError("Member %s of %s is not a regular file"%(
				members[0].name, tarName))
		return src.read()
	finally:
		tf.close()


# matches the characters (parentheses and slashes) that are replaced
# when a catalog name is turned into a directory name
fsClean = re.compile(r"[()/]")


if __name__=="__main__":
	# For every tmp/<key>.tar.gz with <key> present in the mapping, write
	# the content of the archive's single member to
	# cats/katkat/<cleaned catalog name>/data.
	#sys.exit("You probably don't want to run this.")
	mapping = getMapping()
	for name in glob.glob("tmp/*.tar.gz"):
		# cut off the 7 characters of the ".tar.gz" extension
		key = os.path.basename(name)[:-7]
		if key not in mapping:
			# this should be "MAKULA", files obsoleted.  Most are :-)
			continue
		# parentheses and slashes in catalog names become dashes
		destDir = "cats/katkat/%s"%(fsClean.sub("-", mapping[key]))
		data = getData(name)
		if not os.path.isdir(destDir):
			os.mkdir(destDir)
		# data is the raw byte content of the tar member, so write it in
		# binary mode; text mode would mangle line endings on some
		# platforms and does not accept bytes on python 3
		with open(os.path.join(destDir, "data"), "wb") as f:
			f.write(data)
