# This script goes through the access and source url items in the RDs
# in this directory, accesses them and spits out the ones that don't work for
# one reason or another.

import glob
import re
import requests
import warnings

# Regrettably, people put up tutorials on https and then mess up
# their certificates or have them signed by folks we don't trust.
# For now, I'd say that's none of our business.
# The HEAD requests below use verify=False, which makes urllib3's
# connectionpool module emit InsecureRequestWarning on every request;
# silence those by module name so the output stays readable.
warnings.filterwarnings("ignore", module=".*connectionpool")


def iter_source_urls(rd_name, label):
	"""yields DocRegExt source urls found in an RD file.

	I'm not parsing the RDs (properly) just yet.  So, if people define
	the meta in a separate element, this will return junk.

	Let's write them in meta source format so this script doesn't have to
	depend on DaCHS.

	rd_name is the path of the file to scan, label is the literal text
	(e.g. "sourceURL:") whose following non-whitespace token is yielded.
	"""
	# re.escape keeps a label containing regex metacharacters (a dot,
	# say) from being misread as a pattern; the previous code pasted
	# the label into the regex verbatim.  Compiling once also avoids
	# re-parsing the pattern per match.
	pattern = re.compile(r"{}\s*(\S*)".format(re.escape(label)))
	with open(rd_name) as f:
		for mat in pattern.finditer(f.read()):
			yield mat.group(1)


def get_status_code(url):
	"""returns the HTTP status returned by a HEAD request to url.

	Network-level failures (timeouts, DNS errors, refused connections,
	bad certificates, ...) are swallowed and reported as None.
	"""
	try:
		# Follow redirects so a 301/302 to a working page counts as
		# good; verify=False matches the warning filter at the top.
		response = requests.head(
			url, timeout=10, allow_redirects=True, verify=False)
	except requests.exceptions.RequestException:
		return None
	return response.status_code


def main():
	"""checks all accessURL and sourceURL metas in the local RDs and
	prints the ones that do not answer a HEAD request with 200.
	"""
	checks = [
		("Access URLs bad:", "accessURL:"),
		("Source URLs bad:", "sourceURL:")]
	for head, pattern in checks:
		print("\n>>>>>>>>> {} <<<<<<<<<<\n".format(head))
		for rd_file in glob.glob("*.rd"):
			for url in iter_source_urls(rd_file, pattern):
				code = get_status_code(url)
				# code is None when the request failed outright;
				# anything but 200 is worth reporting.
				if code != 200:
					print("{} ({})".format(url, code))


# Run the link check only when executed as a script, not on import.
if __name__=="__main__":
	main()
