"""
This script computes a partition for Gaia-like data that lets people
exhaustively query the whole data set without ever touching more than
MAX_ROWS rows at a time (within reason).

Note that not all corner cases are properly handled; in particular,
MAX_ROWS may be exceeded if a single input bin already contains more
than MAX_ROWS objects.  Decrease ID_DIVISOR in that case (a smaller
divisor yields more, finer bins).
"""

import os

from astropy import table
import pyvo

# Upper bound on the number of rows any single partition should contain.
MAX_ROWS = 5000000

# the divisor here is max(source_id)/10000 -- but it's probably wiser
# to just use 2e14 or so.
ID_DIVISOR = 691752899328320

# ADQL query producing one (bin, count) row per source_id bucket.
QUERY = f"""
select round(source_id/{ID_DIVISOR}) as bin, count(*) as ct
from gaia.dr1
group by bin
"""


def get_bin_sizes():
	"""returns an ordered sequence of (bin_center, num_objects) pairs.

	The histogram is read from a local partitions.vot file when one
	exists; otherwise QUERY is run against the GAVO TAP service, which
	takes on the order of an hour.
	"""
	try:
		with open("partitions.vot", "rb") as src:
			bins = table.Table.read(src)
	except IOError:
		# No cached file -- go to the service (slow).
		print("Fetching partitions from source; this will take a while"
			" (provide partitions.vot to avoid re-querying)")
		service = pyvo.dal.TAPService("http://dc.g-vo.org/tap")
		bins = service.run_async(QUERY, maxrec=1000000).table

	return sorted((row["bin"], row["ct"]) for row in bins)


def get_partition_limits(bin_sizes, max_rows=None, id_divisor=None):
	"""returns a list of limits of source_id ranges exhausting the whole
	catalog.

	bin_sizes is what get_bin_sizes returns (and it must be sorted by
	bin center).  For an empty bin_sizes, [0] is returned.

	max_rows and id_divisor default to the module-level MAX_ROWS and
	ID_DIVISOR; pass them explicitly to partition against different
	settings.
	"""
	if max_rows is None:
		max_rows = MAX_ROWS
	if id_divisor is None:
		id_divisor = ID_DIVISOR

	# Guard against empty input: the trailing append below would
	# otherwise raise NameError on the unbound loop variable.
	if not bin_sizes:
		return [0]

	limits, cur_count = [0], 0
	for bin_center, bin_count in bin_sizes:
		if cur_count+bin_count>max_rows:
			# close the current partition at the lower edge of this bin
			limits.append(int(bin_center*id_divisor-id_divisor/2))
			cur_count = 0
		cur_count += bin_count
	# final limit just past the upper edge of the last bin
	limits.append(int(bin_center*id_divisor+id_divisor/2+2))
	return limits


if __name__ == "__main__":
	# Compute and show the partition limits for the whole catalog.
	bin_sizes = get_bin_sizes()
	print(get_partition_limits(bin_sizes))
