"""
Harvest DASCH plate metadata from their API.

We read the ids we need from a local file and request the metadata records
in batches of BATCH_SIZE.  The results are written into the data
folder.

Make sure that any data already present locally is imported (completely) before
running or else the script will re-download it.

API docs: https://docs.api.starglass.cfa.harvard.edu/
"""

import csv
import os
import pathlib
import time

import requests

from gavo import api
from gavo import utils

# Number of plate ids requested per API call.
BATCH_SIZE = 500
# The DaCHS resource descriptor this harvester belongs to.
RD = api.getRD("dasch/q")
# Directory the harvested JSON batch files are written into.
TARGET_DIR = pathlib.Path(RD.getAbsPath("data/harvested"))
# DASCH/StarGlass plate metadata endpoint (see API docs link in module docstring).
API_URL = "https://api.starglass.cfa.harvard.edu/public/plates"


def getUpstreamIds():
	"""Return the plate ids from the first column of data/plates.csv.

	The file's header row is skipped.
	"""
	with open(RD.getAbsPath("data/plates.csv"), "r", encoding="utf-8") as f:
		rows = csv.reader(f)
		# discard the header line
		next(rows)
		return [row[0] for row in rows]


def getKnownIds():
	"""Return the set of dasch_id values already in the dasch.plates table.

	When the table cannot be queried (most likely because it has not been
	imported yet), an empty set is returned instead.
	"""
	try:
		with api.getTableConn() as conn:
			rows = conn.query("select dasch_id from dasch.plates")
			return {row[0] for row in rows}
	except api.DBError:
		# table probably not yet imported
		return set()


def retrieveByIds(idsToGet):
	"""Fetch metadata for the plate ids in idsToGet and write the JSON
	response into TARGET_DIR.

	idsToGet must be a non-empty sequence; the destination file is named
	after the batch's first and last id.

	Raises requests.HTTPError on a non-2xx response and
	requests.Timeout if the server stalls -- previously an error page
	would have been silently saved as a .json batch file, which (given
	the import-before-rerun workflow described in the module docstring)
	would never get re-fetched.
	"""
	resp = requests.get(API_URL,
		params={"ids": ",".join(idsToGet)},
		# generous timeout so a dead server doesn't hang the harvest forever
		timeout=300)
	# fail loudly instead of writing an HTML/JSON error body to disk
	resp.raise_for_status()
	destName = "{}-{}.json".format(idsToGet[0], idsToGet[-1])
	print(f"Getting {destName}")
	with open(TARGET_DIR/destName, "w", encoding="utf-8") as f:
		f.write(resp.text)


def main():
	"""Download metadata for all plates not yet in the local database,
	BATCH_SIZE ids per request, pausing between requests to be polite.
	"""
	utils.ensureDir(TARGET_DIR, setGroupTo=api.getConfig("gavogroup"))
	knownIds = getKnownIds()
	stillMissing = [plateId
		for plateId in getUpstreamIds() if plateId not in knownIds]
	while stillMissing:
		currentBatch = stillMissing[:BATCH_SIZE]
		stillMissing = stillMissing[BATCH_SIZE:]
		retrieveByIds(currentBatch)
		time.sleep(10)


if __name__ == "__main__":
	main()
