<resource schema="accesslogs">
	<meta name="title">Access logs for \getConfig{web}{siteName}</meta>
	<meta name="description">
		This service provides aggregated access statistics.  We go to some
		length to try and filter out robot accesses, but there will certainly
		be quite a bit of contamination.  If interested in accesses to
		specific services, use REs against the uri field (which contains
		the path part of the request URI).
	</meta>


	<execute every="2000000" title="vacuum logs table">
		<job>
			<code>
				with base.getAdminConn() as conn:
					conn.execute("vacuum analyze \schema.accs")
			</code>
		</job>
	</execute>

	<execute at="2:30" title="ingest last day's accesses" id="c_ingest">
		<job>
			<code>
				execDef.spawn(["dachs", "imp", "accesslogs/q"])
			</code>
		</job>
	</execute>

	<execute at="m1 5:00" title="Generate OTP for IP shrouding" id="genotp">
		<job>
			<code>
				execDef.spawnPython("res/logparser.py")
			</code>
		</job>
	</execute>

	<table id="accs" onDisk="True">
		<index columns="acc_year"/>
		<index columns="acc_month"/>
		<index columns="acc_time" cluster="True"/>
		<index columns="handle"/>
		<index columns="uri">uri text_pattern_ops</index>

		<column name="acc_month" type="smallint" required="True"
			tablehead="Month"
			description="Month the access is logged for (handles for
				different months cannot be compared)"/>
		<column name="acc_year" type="smallint" required="True"
			tablehead="Year"
			description="Year the access is logged for."/>
		<column name="acc_time" type="timestamp"
			tablehead="Time"
			description="Local time of the access."/>
		<column name="handle" type="bigint" required="True"
			tablehead="Handle"
			description="Identifier for the IP the access came from.
				This is a hash of the IP plus monthly changing random data"/>
		<column name="uri" type="text"
			tablehead="URI"
			description="The URI accessed (without scheme and authority)."/>
		<column name="status" type="smallint" required="True"
			tablehead="Status"
			description="The server status returned"/>
		<column name="resplen" type="bigint" tablehead="#Bytes"
			description="Response length.">
			<values nullLiteral="0"/>
		</column>
		<column name="agent" type="text"
			tablehead="Agent"
			description="User agent used."/>
		<column name="method" type="text" tablehead="Method"
			description="HTTP method of the access.">
			<values>
				<option>GET</option>
				<option>POST</option>
				<option>HEAD</option>
				<option>PUT</option>
				<option>COPY</option>
			</values>
		</column>
		<column name="origin" type="text"
			tablehead="Origin"
			description="Country of origin of this request as per python-geoip."/>
	</table>

	<data id="import" updating="True">
		<customGrammar module="res/logparser"/>
		<sources item="archived web.log-s"/>
		<make table="accs">
			<script type="sourceDone" lang="python" name="commit changes"
				notify="False">
				# we want to commit so what is moved out of logs really
				# ends up in the db (by and large).  The sourceDone
				# signal is emitted manually by the grammar, as the
				# normal sources mechanism is ignored (see logparser.py for why
				# that is).
				table.connection.commit()
			</script>

			<rowmaker idmaps="*">
				<map dest="resplen" nullExcs="ValueError">int(@length)</map>
				<map dest="acc_time">@timestamp</map>
			</rowmaker>
		</make>

	</data>


	<service id="stats" allowed="form,api" defaultRenderer="form">
		<meta name="_intro" format="rst">This service lets your query for
		service usage on the Data Center.   See the service info on what data
		is exposed here.

		No field is required -- leave out year and month for all available
		data, leave service blank for all recorded accesses.
		</meta>


		<fancyQueryCore id="statscore" queriedTable="accs" timeout="20">
			<query>
				SELECT acc_year+(acc_month-1)/12. as monspec,
					count(*) AS hosts,
					sum(hostAccs) AS accesses
				FROM (
					SELECT acc_month, acc_year, handle,
						count(handle) as hostAccs
					FROM accesslogs.accs %s
					GROUP BY acc_month, acc_year, handle) AS q
				GROUP BY acc_month, acc_year
				ORDER BY acc_year, acc_month
			</query>
			<condDesc buildFrom="uri"/>
			<condDesc>
				<inputKey original="acc_month" showItems="8" multiplicity="multiple"
					required="False">
					<values>
					<LOOP listItems="1 2 3 4 5 6 7 8 9 10 11 12">
						<events>
							<option>\item</option>
						</events>
					</LOOP>
					</values>
				</inputKey>
			</condDesc>
			<condDesc buildFrom="acc_year"/>
			<condDesc buildFrom="method"/>
			<outputTable namePath="accesses">
				<outputField name="monspec" tablehead="Start"
					description="Characteristic date as fractional year"
					verbLevel="1" unit="yr" displayHint="sf=2"/>
				<outputField name="hosts" tablehead="#Hosts" type="integer"
					description="Number of distinct hosts in the month."
					verbLevel="1" required="True"/>
				<outputField name="accesses" tablehead="#Accesses" type="integer"
					description="Total number of accesses in the month."
					verbLevel="1" required="True"/>
			</outputTable>
		</fancyQueryCore>
	</service>

	<regSuite title="accesslogs regression" sequential="True">
		<regTest title="accesslogs form present">
			<!-- really, we're just using that to compute a reasonable URI
				for the next test -->
			<setup imports="subprocess"/>
			<url>stats/form</url>
			<code>
				self.assertHasStrings(
					"Access logs for {}".format(base.getConfig("web", "siteName")))

				# on a test site, the execs up in the RD may not run.
				subprocess.call(["dachs", "--ui=null", "imp", rd.sourceId])

				# make the next request records for ~the previous month
				aMonthAgo = datetime.datetime.now()-datetime.timedelta(days=20)
				self.followUp.url.content_ = (self.url.content_+"?"
					+urllib.parse.urlencode(
						{"acc_month": aMonthAgo.month, "acc_year": aMonthAgo.year,
							"__nevow_form__": "genForm"}))
			</code>
		</regTest>

		<regTest title="There are some accesses in the month 20 days ago">
			<url>overridden in the previous test</url>
			<code><![CDATA[
				self.assertHasStrings("Matched: 1", "#Hosts")
				self.assertTrue(re.search(br"<td>[\d.]+</td>", self.data),
					"No result td-s?")
			]]></code>
		</regTest>
	</regSuite>
</resource>
