"""
An "abstract" custom grammar for parsing the "gbin" files.

See README for what they actually are.

Trouble is: Gbin is a rather, erm, funky format that apparently consists of a
concatenation of zip streams embedded in java serialized objects.  Lacking
useful docs, I'm guessing.  The executable part of this grammar is about this
guessing.

The row iterator peels off the scaffolding layers and then passes on the
objects in the lists obtained from the gbin to makeRecord(obj) methods.

These have to be defined by concrete grammars (e.g., quasarsgrammar).
That's fairly easy in that bin/getschema.py spits out code that creates
such a record.
"""

import re
import sys
import zipfile
from cStringIO import StringIO

sys.path.append("/data/gavo/inputs/gums/res")
import javaobj

from gavo import base
from gavo.grammars.customgrammar import CustomRowIterator


class RowIterator(CustomRowIterator):
	"""A row iterator over the objects serialized within a gbin file.

	This class does not define makeRecord(obj) itself; it has to be
	provided by the concrete grammar deriving from this class.
	"""
	def _iterRows(self):
		"""yields self.makeRecord(obj) for every object found in the source.

		The file named by self.sourceToken is read completely, cut into
		the embedded zip streams by iterContainedFiles, and the single
		member of each zip is deserialized with javaobj.

		A failure while processing one zip stream is reported through
		base.ui.notifyError, and processing continues with the next stream.
		"""
		# gbin is binary data; read it as such so no platform ever gets to
		# translate line endings on us.
		with open(self.sourceToken, "rb") as f:
			bytestream = f.read()

		for zipstream in iterContainedFiles(bytestream):
			try:
				zf = zipfile.ZipFile(file=StringIO(zipstream))
				names = zf.namelist()
				assert len(names)==1
				data = zf.open(names[0]).read()
				stuff = javaobj.loads(data)
				for obj in stuff:
					yield self.makeRecord(obj)
			except Exception:
				# Deliberately not a bare except: since there is a yield inside
				# the try block, a bare except would also trap GeneratorExit
				# (raised when the consumer closes this generator early) as well
				# as KeyboardInterrupt and SystemExit.
				base.ui.notifyError("Parse error in GUMS source %s"%self.sourceToken)


def iterContainedFiles(bytestream):
	"""iterates over byte sequences corresponding to zip files within bytestream.

	I have no idea how these are actually located inside the gbin files;
	there are, at least, not just java serialized sequences in there.  So,
	what I'm doing is locate 0xaced (start of JSO stream), 27 bytes (JSO armor),
	PK (start of zip file) and cut up the file accordingly.  Horrible?
	Yes.  But I *really* cannot be bothered to actually reverse engineer
	*this* bizarre shit.

	Each chunk yielded starts at the "PK" of one marker and extends up to
	(but not including) the 0xaced of the next marker; the last chunk
	extends to the end of bytestream.  Nothing is yielded when bytestream
	contains no marker at all.
	"""
	lastMatch = None
	# re.DOTALL is essential here: the 25 bytes between 0xaced and PK are
	# arbitrary binary data, and without the flag "." refuses to match 0x0a,
	# which would make us silently miss every marker containing that byte.
	for mat in re.finditer("\xac\xed.{25}PK", bytestream, re.DOTALL):
		if lastMatch is not None: # first chunk is yielded on second iteration
			# end()-2 backs up over the "PK", which belongs to the zip stream.
			yield bytestream[lastMatch.end()-2:mat.start()]
		lastMatch = mat
	if lastMatch is not None:
		yield bytestream[lastMatch.end()-2:]


if __name__=="__main__":
	# Debugging aid: prints, for each zip stream found in the gbin file
	# named by the first command line argument, the len() of what javaobj
	# deserializes from it.
	# The input is binary, so read it in binary mode.
	with open(sys.argv[1], "rb") as f:
		bytestream = f.read()
	for zipstream in iterContainedFiles(bytestream):
		zf = zipfile.ZipFile(file=StringIO(zipstream))
		names = zf.namelist()
		assert len(names)==1
		data = zf.open(names[0]).read()
		stuff = javaobj.loads(data)
		#code.interact(local=locals())
		print(len(stuff))
