"""
Generate ADS tagged format (as good as our source lets us) from
our catalog database.

There is a lot of ad-hoc parsing going on; I try to explain it in the
places where it happens.
"""

import logging
import re

# Send everything from DEBUG upwards to adssources.log; filemode "w"
# truncates the file, so the log only ever describes the most recent run.
# NOTE(review): this is done before importing gavo, presumably so that this
# configuration wins over any logging set-up gavo performs -- confirm.
logging.basicConfig(filename="adssources.log",
	level=logging.DEBUG, filemode="w")

from gavo import api

from katkatgrammar import FS_CLEAN


class Error(Exception):
	"""Raised when a source string defeats the parsing heuristics in here.

	main() catches this once per row, logs it, and goes on with the next row.
	"""


# Pattern for a single author name in "initials surname(s)" configuration,
# including whatever terminates the name.
#   init -- one or more initials; each is a capital letter, optionally
#     followed by one of h, a, u, p (so "Ch." counts as one initial), and a
#     dot, with optional whitespace after it
#   sur -- a capitalised surname made of lower-case letters and apostrophes,
#     optionally continued by further capitalised parts that are joined
#     with a hyphen or a single blank
# The name must be followed by a comma or a dot, or by an "et al" remark
# that itself ends with a comma; that terminator is part of the match, so
# mat.end() points behind it.
NAMEPAT = re.compile(
	r"\s*(?P<init>(?:[A-Z][haup]?\.\s*)+)"
	r"(?P<sur>(?:[A-Z][a-z']+)([- ][A-Z][a-z']+)*)"
	r"([,.]|,?\s*et\.? al\.?,)")


def logWithRow(msg, record):
	"""Log msg at INFO level on the root logger, with record appended in
	parentheses.

	Both arguments are rendered through %s, so record may be any object
	with a sensible string representation.
	"""
	line = "%s (%s)"%(msg, record)
	logging.info(line)


def extractYear(source):
	"""Return the publication year found in source.

	Heuristic: the year is the last run of four digits in source (runs are
	taken left to right without overlap).

	Returns a pair (year as a string, index in source where it starts).

	Raises Error if source contains no four digits in a row.
	"""
	lastHit = None
	# walk through all hits, keeping only the final one
	for lastHit in re.finditer(r"\d\d\d\d", source):
		pass
	if lastHit is None:
		raise Error("No year in '%s'"%source)
	return lastHit.group(), lastHit.start()


def extractNames(source):
	"""Parse the author list at the start of source into ADS author syntax.

	NAMEPAT is matched over and over, each time starting where the previous
	match ended, until it no longer matches.  Every match yields one
	"surname, initials" entry; this reordering is why we cannot just copy
	the text from katkat.

	Returns a pair (authors joined with "; ", index in source right behind
	the last match).

	Raises Error if not even one name could be matched.
	"""
	authors = []
	cursor = 0
	mat = NAMEPAT.match(source, cursor)
	while mat:
		cursor = mat.end()
		surname, initials = mat.group("sur"), mat.group("init")
		# only record matches that actually carry a surname
		if surname:
			authors.append("%s, %s"%(surname.strip(), initials.strip()))
		mat = NAMEPAT.match(source, cursor)
	if not authors:
		raise Error("No names in '%s'"%source)
	return "; ".join(authors), cursor


def findMatchingParen(source, parenPos):
	"""Return the index of the "(" that balances the ")" at parenPos.

	source is scanned backwards from parenPos, counting closing parens up
	and opening parens down; the scan stops at the first position where the
	count is back at zero.  If the character at parenPos is not a paren at
	all, the count is zero immediately and parenPos itself is returned.

	The scan includes index 0, so a source starting with the matching "("
	is handled.

	Raises Error if the start of source is reached without the parens
	balancing, or if parenPos is negative (which is what str.rfind returns
	when there is no closing paren at all).
	"""
	level = 0
	# for a negative parenPos this range is empty and we fall through to
	# the raise below
	for pos in range(parenPos, -1, -1):
		char = source[pos]
		if char=="(":
			level -= 1
		elif char==")":
			level += 1
		if level==0:
			return pos
	raise Error("No matching paren")


def romanToInt(input):
	"""Return the integer value of the roman numeral input.

	Case is ignored.  A digit that is directly followed by a larger one is
	subtracted, every other digit is added.  There is no check that the
	numeral is well-formed; the empty string yields 0.

	Raises KeyError for characters that are not roman digits.
	"""
	digitValues = {
		'M': 1000, 'D': 500, 'C': 100, 'L': 50, 'X': 10, 'V': 5, 'I': 1}
	digits = [digitValues[char] for char in input.upper()]
	total = 0
	# pair each digit with its successor; the last digit gets 0 as
	# successor, so it is always added
	for current, following in zip(digits, digits[1:]+[0]):
		if following>current:
			total -= current
		else:
			total += current
	return total


def arabize(input):
	"""Return input as a string of arabic digits.

	If int() accepts input, it is returned unchanged.  Otherwise input is
	taken to be a roman numeral and the string form of its value is
	returned.
	"""
	try:
		int(input)
	except ValueError:
		# not a decimal number, so it has to be a roman numeral
		converted = str(romanToInt(input))
	else:
		converted = input
	return converted


def interpretBibsource(bibsource, codes):
	"""Append ADS (tag, value) pairs derived from bibsource to codes.

	bibsource is the text from the last pair of parentheses of a source.

	* If it contains "Vol.", it is taken to be a serial: what is in front
	  of " Vol. " goes to J (without a trailing comma), the volume number
	  goes to V (roman numerals are converted to arabic), and if a
	  "p. <number>" follows the volume, that number goes to P.
	* Otherwise it is taken to be a standalone publication, and all of
	  bibsource goes to J.

	codes is modified in place; nothing is returned.

	Raises Error if bibsource contains "Vol." but not in a form we can
	take apart.
	"""
	if "Vol." not in bibsource:
		# nothing with a vol inside; it's probably a standalone publication.
		# Dump all in the J field
		codes.append(('J', bibsource))
		return

	# What's in front of the Vol is the serial title
	mat = re.match(r"(.*),? Vol\. (\d+|[IVXLC]+)", bibsource)
	if not mat:
		raise Error("Weird bibsource with volume '%s'"%bibsource)
	jnl, vol = mat.groups()
	# the greedy (.*) swallows a comma in front of " Vol." before the ",?"
	# gets a chance to match it, so we need to drop it by hand
	if jnl.endswith(","):
		jnl = jnl[:-1]
	codes.append(('J', jnl))
	codes.append(('V', arabize(vol)))

	# If there's a visible page in there, use it
	mat = re.search(r"Vol\. (\d+|[IVXLC]+).*p\. ([ACL]?\d+)", bibsource)
	if mat:
		codes.append(('P', mat.group(2)))


def addBibstuff(source, namesEnd, codes):
	"""Append the title (T) and the bibliographic tags to codes.

	Heuristic: the bibliographic information is what is inside the last
	pair of parentheses of source; interpretBibsource takes that apart.
	The title is whatever stands between namesEnd (the index where the
	author list ends) and the opening paren of that pair.

	codes is modified in place; nothing is returned.
	"""
	closeAt = source.rfind(")")
	openAt = findMatchingParen(source, closeAt)
	codes.append(('T', source[namesEnd:openAt].strip()))
	interpretBibsource(source[openAt+1:closeAt].strip(), codes)


def extractSource(row):
	"""Return the content of row's source field, slightly cleaned.

	The cleaning consists of cutting off the "abbreviations:" mess, i.e.,
	everything from the first occurrence of "abbreviations:" on.  A source
	without that marker is returned unchanged.

	Raises Error if the source field is None.
	"""
	source = row["source"]
	if source is None:
		raise Error("No source")
	# partition leaves source alone if the marker is not in there
	return source.partition("abbreviations:")[0]


def main():
	"""Write one ADS tagged record per HDW in arigfh/katkat to stdout.

	A record consists of lines of the form "%<tag> <value>" and is followed
	by an empty line.  The tags written are A (authors), Y (year),
	G (always KATKAT), E (link to the HDW on dc.g-vo.org), T (title), and
	whatever interpretBibsource makes of the bibliographic information.

	Rows for which one of the parsing heuristics raises Error are logged
	together with their message and produce no output.
	"""
	shippedHDWs = set()
	table = api.TableForDef(api.getRD("arigfh/katkat").getById("katkat"))
	for row in table.iterQuery(table.tableDef, ""):
		try:
			# HDWL is something like source.volume.  We don't want multiple
			# entries for multiple volumes, so we skip all sub-volumes.
			hdw = int(row["hdwl"].split(".")[0])
			if hdw in shippedHDWs:
				continue
			# Note that the HDW counts as shipped even if parsing fails
			# below; later sub-volumes are not tried as a replacement.
			shippedHDWs.add(hdw)

			source = extractSource(row)
			# we only need the year itself, not where it was found
			year, _ = extractYear(source)
			names, namesEnd = extractNames(source)

			codes = [
				("A", names),
				("Y", year),
				("G", "KATKAT"),
				("E", "http://dc.g-vo.org/arigfh/katkat/byhdw/qp/%s"%hdw),
			]

			addBibstuff(source, namesEnd, codes)

			# A single-argument print(...) works as statement and as
			# function; the extra newline is the empty line closing the
			# record.
			print("\n".join("%%%s %s"%t for t in codes)+"\n")
		except Error as ex:
			logWithRow(str(ex), row)


# Only produce the export when run as a script, not when imported.
if __name__=="__main__":
	main()
