"""
A grammar to parse the original katkat.data file from ARIGFH.

This grammar can also be called standalone.  It then emits a mapping
between arigfh ids, file ids, and tape storage locations, which is
in res/mapping.
"""

import os
import re
import warnings

from gavo.grammars.customgrammar import CustomRowIterator


FS_CLEAN = re.compile("[()/]")


def parseBars(rec):
	"""Returns the fields of the bar-separated record *rec* as a list,
	with surrounding whitespace removed from each field.
	"""
	return list(map(str.strip, rec.split("|")))


class KatkatParser(object):
	"""A parser for the katkat format.

	Katkat is a sequence of lines of the format:
	NNNNNN|LLLLL|DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD
	Where the N make up a running number, the L a key and the D the
	payload.  This induces a list of key-value sequences.

	There may be several lines for a given key in each record (defined
	by constant running number).  The semantics of such repeated keys
	depends on the key.

	The payload fields have internal structure, typically indicated by
	vertical bars.  We do these parses as well as the interpretation of
	the key value lists in the shipout method based on _parse_KEY methods.
	The parse methods manipulate the self.vals dictionary in place.  They
	should leave the original keys alone and add keys of their own.

	The actual record delivery is done by calling a function passed to
	the parser.  That function is outputBuffer.append in _iterRows.  To
	make this yet more funky, emitRecord is called from parse_FILE
	since a single FILE field may generate several records.  Therefore,
	you must always parse FILE last.

	Katkat.data has junk at the end.  It is the row iterator's job to
	detect this and stop feeding.  It must call the finish() method to
	emit the last record.
	"""
	# HDWL number -> Teleki number, filled by _parse_CODES.
	# NOTE(review): this is a class attribute, i.e., shared between all
	# instances -- presumably intentional so "siehe HDWL" cross
	# references resolve even across parser instances; confirm before
	# turning it into an instance attribute.
	_hdwlToTeleki = {}

	def __init__(self, emitRecord, resdir):
		# emitRecord -- a callable accepting one finished record dict
		self.emitRecord = emitRecord
		# resdir -- resource directory root; used to locate per-catalog
		# files below cats/
		self.resdir = resdir
		# running number of the record currently being accumulated
		self.curNumber = None
		# raw key -> list of payload strings; the _parse_* methods also
		# add their derived keys here
		self.vals = {}

	def _parse_BIBLI(self, input):
		"""Joins the BIBLI lines into a single bibliography string in
		self.vals["source"].
		"""
		# The rules for joining lines are chaotic; I didn't make them up.
		res = [""]  # seed element so res[-1] is defined on first iteration
		for seg in input:
			if seg[2]==" ":
				# blank third character: apparently starts a new entry;
				# set it off with a semicolon
				res.append("; "+seg)
			else:
				if res[-1].endswith("-"):
					# continuation after a hyphenated line break: drop
					# the continuation's leading blanks
					res.append(seg.lstrip())
				else:
					res.append(seg)
		self.vals["source"] = "".join(res).strip()

	def _parse_REM(self, input):
		"""Joins the REM(ark) lines into self.vals["remarks"]."""
		self.vals["remarks"] = " ".join(input)

	# a Teleki/Wielen number given directly in the CODES payload
	_telekiPat = re.compile(r"(Teleki|Wielen)\s*(\d+)")
	# German "siehe" = "see": a cross reference to another HDWL entry
	_siehePat = re.compile(r"siehe HDWL (?:Nr. )?([\d.]+)")

	def _parse_CODES(self, input):
		"""Sets self.vals["teleki"] from the CODES lines (None when no
		Teleki number can be made out).

		Direct matches are additionally recorded in the class-wide
		_hdwlToTeleki map so later "siehe HDWL" cross references can be
		resolved.
		"""
		self.vals["teleki"] = None
		allCodes = " ".join(input)
		mat = self._telekiPat.search(allCodes)
		if mat:
			self.vals["teleki"] = int(mat.group(2))
			# index under the normalized (stripped, no trailing dot)
			# HDWL number, matching the lookup below
			self._hdwlToTeleki[self.curNumber.strip().rstrip(".")
				] = self.vals["teleki"]
		else:
			mat = self._siehePat.search(allCodes)
			if mat:
				try:
					self.vals["teleki"] = self._hdwlToTeleki[mat.group(1).rstrip(".")]
				except KeyError:
					# cross reference to an entry we have not seen;
					# the record then simply keeps teleki=None
					pass # tough luck
					#warnings.warn("No Teleki for %s"%mat.group(1))
			else:
				pass

	def _parse_AGF_1(self, input):
		"""Collects per-file arigfh metadata into the self.vals["agfData"]
		dict, mapping file id -> (kkid, nid, ntotal).
		"""
		# arigfh lines (there's only 1 here, no 2)
		# this is probably: tXpY|file id|master cat ver?|#id|?|#total
		for ln in input:
			kkid, fid, _, nid, _, ntotal, _ = parseBars(ln)
			agfData = self.vals.setdefault("agfData", {})
			agfData[fid] = (kkid, nid, ntotal)
	
	def _parse_HDWL1(self, input):
		"""Extracts equinox and epoch range from the (single) HDWL1 line."""
		assert len(input)==1
		# baseId, minDec and maxDec are unpacked but currently unused
		_, _, baseId, eq, minEpoch, maxEpoch, _, minDec, maxDec, _ = \
			parseBars(input[0])
		self.vals["minEpoch"] = minEpoch
		self.vals["maxEpoch"] = maxEpoch
		self.vals["equinox"] = eq

	# the FILE types that actually produce output records
	_interestingFtypes = set(["ORIG", "SONDER", "ZUSATZ"])

	def _parse_FILE(self, input):
		"""Builds and emits one output record per interesting FILE line.

		This is the method that actually calls emitRecord, which is why
		shipout must run it after all other _parse_* methods.
		"""
		self.vals["hdwl"] = self.curNumber.strip()
		for rec in input:
			try:
				ftype, tape, tlabel, name, fid, numRecs, _ = parseBars(rec)
			except ValueError:  # some FILE lines are malformed.  Ignore these
				continue
			if ftype in self._interestingFtypes:
				# NOTE: rec is rebound here from the input line to the
				# output record; the loop variable isn't needed any more
				rec = self.vals.copy()
				rec["fileid"] = fid
				if tlabel:
					rec["tapelabel"] = "%s-%03d"%(tape, int(tlabel))
				else:
					rec["tapelabel"] = None
				rec["kkid"], rec["nid"], rec["nrows"] = (
					self.vals.get("agfData", {}).get(fid, (None, None, None)))
				# attach relative paths for data.txt/fields.pdf when the
				# corresponding files exist below <resdir>/cats/katkat
				resBase = os.path.join(self.resdir, "cats", "katkat")
				resDir = FS_CLEAN.sub("-", fid)
				katdata = os.path.join(resDir, "data.txt")
				if os.path.exists(os.path.join(resBase, katdata)):
					rec["katdata"] = katdata
				katfields = os.path.join(resDir, "fields.pdf")
				if os.path.exists(os.path.join(resBase, katfields)):
					rec["katfields"] = katfields

				# for catalogs with teleki numbers, see if there's a lies.f
				if rec.get("kkid"):
					liesPath = os.path.join(self.resdir, "cats", "arigfh",
						rec["kkid"].lower(), "lies.f")
					if os.path.exists(liesPath):
						rec["liesf"] = os.path.join(
							"arigfh", "q", "cone", "static", rec["kkid"].lower(), "lies.f")

				self.emitRecord(rec)

	def shipout(self):
		"""Parses whatever has accumulated in self.vals, emits the
		resulting record(s), and resets the accumulator.

		The keys are processed in a fixed order; FILE must come last
		because its parser is what calls emitRecord.
		"""
		if not self.vals:
			return
		for key in ["BIBLI", "CODES", "AGF_1", "REM", "HDWL1", "FILE"]:
			val = self.vals.get(key)
			if val is not None:
				getattr(self, "_parse_"+key)(val)
		self.vals = {}

	def feedLine(self, inLine):
		"""Accumulates one raw input line, shipping out the previous
		record when the running number changes.
		"""
		# fixed columns: 0-5 running number, 7-11 key, 13+ payload;
		# columns 6 and 12 hold separators (presumably "|") and are
		# skipped
		runningNumber, key, payload = inLine[:6], inLine[7:12], inLine[13:]
		# keys like "AGF 1" become AGF_1 so they map to method names
		key = key.strip().replace(" ", "_")
		if runningNumber!=self.curNumber:
			self.shipout()
			self.curNumber = runningNumber
		self.vals.setdefault(key, []).append(payload)

	def finish(self):
		"""Emits the record still being accumulated; must be called once
		after the last feedLine.
		"""
		self.shipout()


class RowIterator(CustomRowIterator):
	"""source token must be a resdir-relative path to katkat.data.

	That's data/katkat.data, but let's keep that in the RD.  If that says
	something else than I, it is right.
	"""
	def _iterRows(self):
		"""Yields one dict per katkat record parsed from self.sourceToken.

		KatkatParser pushes finished records into outputBuffer (possibly
		several per input line, or none); we drain that buffer after
		every line fed and once more after finish().
		"""
		outputBuffer = []
		katkatParser = KatkatParser(outputBuffer.append, self.grammar.rd.resdir)
		# use a with block so the source file is closed even when a
		# consumer abandons this generator half-way through
		with open(self.sourceToken) as srcFile:
			for line in srcFile:
				if line.startswith("******"):  # end of records; junk follows
					break
				katkatParser.feedLine(line[:-1])
				for rec in outputBuffer:
					yield rec
				outputBuffer[:] = []
		# flush the record still sitting in the parser
		katkatParser.finish()
		for rec in outputBuffer:
			yield rec


if __name__=="__main__":
	from gavo import api
	RowIterator.debug = True
	ri = RowIterator(api.getRD("arigfh/katkat").getById("grammar"),
		"data/katkat.data")
	print "# generated by bin/katkatgrammar.py"
	for item in ri:
		print "%(kkid)s\t%(fileid)s\t%(tapelabel)s"%item
