"""
A custom grammar to parse the HTML source descriptions for the
maser data.
"""

from gavo.grammars.customgrammar import CustomRowIterator

import os
import re
import urllib.parse

import bs4


def scrubHTML(text):
	"""Return *text* with HTML tags stripped, whitespace collapsed, and
	character entities defanged.

	Tags are replaced by a single blank, runs of whitespace are collapsed
	to one blank, and an entity like ``&amp;`` becomes `` amp`` (blank
	plus entity name).  The result is stripped of leading/trailing blanks.

	Fixes over the previous version: the parameter no longer shadows the
	``str`` builtin, and the regexes are raw strings (``"\\s+"`` as a
	plain string raises an invalid-escape warning on modern Python).
	"""
	return re.sub(r"&([^;]*);", r" \1",
		re.sub(r"\s+", " ",
			re.sub(r"<[^>]*>", " ", text))).strip()


def extractFromModifications(key, value):
	"""Yield a single (lowercased key, serialized content) pair for a
	"Modifications"-style table cell.

	*value* is a bs4 element; its children are serialized back to markup
	and any HTML comments are removed from the result.
	"""
	serialized = "".join(str(child) for child in value.contents)
	yield key.lower(), re.sub("<!--.*?-->", "", serialized)

# "Comments" and "Input Tables" cells have the same shape and get the
# same treatment.
extractFromComments = extractFromModifications
extractFromInputTables = extractFromModifications


def _interpretFunkyTable(table):
# TODO: with non OH-masers, we'll need to do something about this.
	if table is None:
		return

	rows = table.findAll("tr")
	yield "sourcesObserved", re.search(
		r"=(\d+)", rows[0].find("td").string).group(1)
	detectedRow = rows[1].findAll("td")
	yield "det1612", detectedRow[1].string.strip()
	yield "det1665", detectedRow[2].string.strip()
	try:
		yield "det1667", detectedRow[3].string.strip()
	except IndexError:
		pass

	notDetectedRow = rows[2].findAll("td")
	yield "ndet1612", notDetectedRow[1].string.strip()
	yield "ndet1665", notDetectedRow[2].string.strip()
	try:
		yield "ndet1667", notDetectedRow[3].string.strip()
	except IndexError:
		pass

def extractFromDescription(key, value):
	"""Yield key/value pairs pulled from a "Description" table cell.

	Yields a textual reference built from three fixed children, a
	bibcode taken from the last path element of the cell's link target,
	a coordinate source if a "Coordinates:" child is present, and
	whatever _interpretFunkyTable produces from an embedded table.
	"""
	parts = value.contents
	yield "textref", ", ".join(
		fragment.strip() for fragment in (parts[0], parts[2], parts[4]))
	# the bibcode sits after the last "=" in the link's URL-encoded href
	href = value.find("a")["href"]
	yield "bibcode", urllib.parse.unquote(
		re.sub(".*=", "", href)).split("/")[-1]
	for part in parts:
		content = part.string
		if content and content.strip().startswith("Coordinates:"):
			yield "cooSrc", content.split(":")[1].strip()
	yield from _interpretFunkyTable(value.find("table"))


def extractRow(trElement):
	"""Yield key/value pairs from one top-level table row.

	Rows with anything but exactly two cells are skipped.  The first
	cell's text (minus its trailing character, normally a colon, and
	with blanks removed) names an ``extractFrom<Key>`` function in this
	module; rows without a matching extractor are skipped, too.
	"""
	tds = trElement.findAll("td", recursive=False)
	if len(tds)!=2:
		return
	key = tds[0].string[:-1].replace(" ", "")
	value = tds[1]
	# Look up the extractor first rather than wrapping everything in
	# try/except KeyError: the old blanket handler also swallowed
	# KeyErrors raised *inside* an extractor, silently dropping rows.
	extractor = globals().get("extractFrom"+key)
	if extractor is None:
		return
	yield from extractor(key, value)


def getDictForFile(path):
	"""Return a dict of metadata extracted from the HTML file at *path*.

	The data lives in a table identified by either width="475" or, as a
	fallback, height="100%"; each of its top-level rows is fed through
	extractRow and the resulting pairs are collected into a dict.
	"""
	with open(path) as srcFile:
		soup = bs4.BeautifulSoup(srcFile)

	# two page layouts are in use; try the more common one first
	# (explicit "is None": bs4 tags have len(), so an empty-but-found
	# tag would be falsy and a plain "or" would misfire)
	dataTable = soup.find("table", width="475")
	if dataTable is None:
		dataTable = soup.find("table", height="100%")

	return dict(pair
		for rowElement in dataTable.findAll("tr", recursive=False)
		for pair in extractRow(rowElement))


class RowIterator(CustomRowIterator):
	"""A row iterator yielding one rawdict per source HTML file."""

	def _iterRows(self):
		row = getDictForFile(self.sourceToken)
		# ref_key is the source's last directory plus the file stem,
		# e.g. ".../refs/foo/bar.html" -> "foo/bar"
		stem = os.path.splitext(self.sourceToken)[0]
		row["ref_key"] = "/".join(stem.split("/")[-2:])
		yield row

	def getLocator(self):
		return "File %s"%self.sourceToken
