From f8a5720535e8801fcce3dcb8d78fddf84ffb56f0 Mon Sep 17 00:00:00 2001
From: Will Woods
Date: Oct 13 2020 16:17:00 +0000
Subject: add 'countme' stuff to web-data-analysis role


This should automate running the "countme" scripts every day to parse new log
data and publish updated totals.

Here's what I've added to the ansible role:

* install package deps for `mirrors-countme`
* make "countme" user with home /srv/countme
* clone 'prod' branch of https://pagure.io/mirrors-countme to /srv/countme
  * if changed: pip install /srv/countme/mirrors-countme
* make web subdir /var/www/html/csv-reports/countme
* make local data dir /var/lib/countme
* install `countme-update.sh` to /usr/local/bin
* install `countme-update.cron` to /etc/cron.d
  * runs /usr/local/bin/countme-update.sh daily, as user `countme`

That should make sure `countme-update.sh` runs every day. That script works
like this:

1. Run `countme-update-rawdb.sh`
   * parse new mirrors.fp.o logs in /var/log/hosts/proxy*
   * write data to /var/lib/countme/raw.db
2. Run `countme-update-totals.sh`
   * parse raw data from /var/lib/countme/raw.db
   * write updated totals to /var/lib/countme/totals.{db,csv}
3. Track changes in updated totals
   * set up /var/lib/countme as git repo (if needed)
   * commit new `totals.csv` (if changed)
4. Make updated totals public
   * Copy totals.{db,csv} to /var/www/html/csv-reports/countme

For safety's sake, I've tried to set up everything so it runs as the `countme`
user rather than running everything as `root`. This might be an unnecessary
complication but it seemed like the right thing to do.

Similarly, keeping totals.csv in a git repo isn't _required_, but it seemed
like a good idea to keep historical records in case we want/need to change the
counting algorithm or something.

I checked the YAML with ansible-lint and tested that all the scripts work as
expected when run as `wwoods`, so unless I've missed something this should do
the trick.

---

diff --git a/roles/web-data-analysis/files/countme-update.cron b/roles/web-data-analysis/files/countme-update.cron
new file mode 100644
index 0000000..3134e0f
--- /dev/null
+++ b/roles/web-data-analysis/files/countme-update.cron
@@ -0,0 +1 @@
+0 09 * * * countme /usr/local/bin/countme-update.sh > /dev/null
diff --git a/roles/web-data-analysis/files/countme-update.sh b/roles/web-data-analysis/files/countme-update.sh
new file mode 100644
index 0000000..b51b51d
--- /dev/null
+++ b/roles/web-data-analysis/files/countme-update.sh
@@ -0,0 +1,125 @@
+#!/bin/bash
+#
+# countme-update.sh: update the countme databases and publish new totals.
+#
+#   1. Run $UPDATE_RAWDB to update $RAW_DB
+#   2. Run $UPDATE_TOTALS to rebuild $TOTALS_DB and $TOTALS_CSV from $RAW_DB
+#   3. Commit $TOTALS_CSV to the git repo in $LOCAL_DATA_DIR, if it changed
+#   4. Copy $TOTALS_DB and $TOTALS_CSV into $PUBLIC_DATA_DIR
+
+# Where do we keep our local/internal data?
+LOCAL_DATA_DIR=/var/lib/countme
+RAW_DB=$LOCAL_DATA_DIR/raw.db
+TOTALS_DB=$LOCAL_DATA_DIR/totals.db
+TOTALS_CSV=$LOCAL_DATA_DIR/totals.csv
+
+# Where do we put the public-facing data?
+PUBLIC_DATA_DIR=/var/www/html/csv-reports/countme
+PUBLIC_TOTALS_DB=$PUBLIC_DATA_DIR/totals.db
+PUBLIC_TOTALS_CSV=$PUBLIC_DATA_DIR/totals.csv
+
+# Names of the update commands (if not in $PATH..)
+UPDATE_RAWDB=countme-update-rawdb.sh
+UPDATE_TOTALS=countme-update-totals.sh
+
+# atomic_copy SRC DST: copy SRC to DST.part, then rename that over DST, so
+# DST is replaced in a single step instead of being rewritten in place.
+atomic_copy() {
+    local src="$1" dst="$2"
+    cp -f -- "$src" "$dst.part" || return
+    mv -f -- "$dst.part" "$dst"
+}
+
+# die [MESSAGE]: prints "$PROG: error: $MESSAGE" on stderr and exits
+die() { echo "${0##*/}: error: $*" >&2; exit 2; }
+
+# _run [COMMAND...]: Run a command, honoring $VERBOSE and $DRYRUN
+_run() {
+    if [[ -n "$VERBOSE" || -n "$DRYRUN" ]]; then echo "$@"; fi
+    if [[ -n "$DRYRUN" ]]; then return 0; fi
+    "$@"
+}
+
+# CLI help text
+HELP_USAGE="usage: countme-update.sh [OPTION]..."
+HELP_OPTIONS="
+Options:
+  -h, --help      Show this message and exit
+  -v, --verbose   Show more info about what's happening
+  -n, --dryrun    Don't run anything, just show commands
+  -p, --progress  Show progress meters while running
+"
+
+# Turn on progress by default if stderr is a tty
+if [[ -z "$PROGRESS" && -t 2 ]]; then PROGRESS=1; fi
+
+# Parse CLI options with getopt(1). getopt prints its own message for bad
+# options; stop here rather than carrying on with a half-parsed command line.
+_GETOPT_TMP=$(getopt \
+    --name countme-update \
+    --options hvnp \
+    --longoptions help,verbose,dryrun,progress,checkoutdir: \
+    -- "$@") || { echo "$HELP_USAGE" >&2; exit 2; }
+eval set -- "$_GETOPT_TMP"
+unset _GETOPT_TMP
+while [ $# -gt 0 ]; do
+    arg=$1; shift
+    case $arg in
+        '-h'|'--help') echo "$HELP_USAGE"; echo "$HELP_OPTIONS"; exit 0 ;;
+        '-v'|'--verbose') VERBOSE=1 ;;
+        '-n'|'--dryrun') DRYRUN=1 ;;
+        '-p'|'--progress') PROGRESS=1 ;;
+        # Hidden option for testing / manual use
+        '--checkoutdir') COUNTME_CHECKOUT=$1; shift ;;
+        '--') break ;;
+    esac
+done
+
+# Tweak path if needed
+if [ -d "$COUNTME_CHECKOUT" ]; then
+    cd "$COUNTME_CHECKOUT" || die "can't cd to '$COUNTME_CHECKOUT'"
+    # Use the absolute path: a relative one would stop resolving after the cd
+    COUNTME_CHECKOUT=$PWD
+    PATH="$COUNTME_CHECKOUT:$COUNTME_CHECKOUT/scripts:$PATH"
+fi
+
+# Check for required commands
+command -v "$UPDATE_RAWDB" >/dev/null || die "can't find '$UPDATE_RAWDB'"
+command -v "$UPDATE_TOTALS" >/dev/null || die "can't find '$UPDATE_TOTALS'"
+command -v git >/dev/null || die "can't find 'git'"
+
+# Apply other CLI options. Extra arguments are kept in an array rather than
+# appended to the command names, so every expansion below can stay quoted.
+PROGRESS_ARGS=()
+if [[ -n "$PROGRESS" ]]; then PROGRESS_ARGS=(--progress); fi
+
+# Exit immediately on errors
+set -e
+
+# Run the updates
+_run "$UPDATE_RAWDB" "${PROGRESS_ARGS[@]}" --rawdb "$RAW_DB"
+_run "$UPDATE_TOTALS" "${PROGRESS_ARGS[@]}" --rawdb "$RAW_DB" \
+    --totals-db "$TOTALS_DB" --totals-csv "$TOTALS_CSV"
+
+# Update local git repo
+if [ ! -d "$LOCAL_DATA_DIR/.git" ]; then
+    _run git init "$LOCAL_DATA_DIR"
+    _run git -C "$LOCAL_DATA_DIR" add -N \
+        "$(realpath --relative-to "$LOCAL_DATA_DIR" "$TOTALS_CSV")"
+fi
+
+# "git commit" exits non-zero when there is nothing to commit, which under
+# "set -e" would abort before the copy below; only commit if a tracked file
+# changed. (In dry-run mode the repo may not exist yet: just show the command.)
+if [[ -n "$DRYRUN" ]]; then
+    changes=dryrun
+else
+    changes=$(git -C "$LOCAL_DATA_DIR" status --porcelain --untracked-files=no)
+fi
+if [[ -n "$changes" ]]; then
+    _run git -C "$LOCAL_DATA_DIR" commit -a -m "$(date -u +%F) update"
+fi
+
+# Copy new data into place
+_run atomic_copy "$TOTALS_DB" "$PUBLIC_TOTALS_DB"
+_run atomic_copy "$TOTALS_CSV" "$PUBLIC_TOTALS_CSV"
diff --git a/roles/web-data-analysis/tasks/main.yml b/roles/web-data-analysis/tasks/main.yml
index 7dc4d74..5e64630 100644
--- a/roles/web-data-analysis/tasks/main.yml
+++ b/roles/web-data-analysis/tasks/main.yml
@@ -85,3 +85,68 @@
   - web-data
   - cron
 
+- name: install package deps for mirrors-countme
+  package:
+    # tqdm is optional but it gives nice progress meters for interactive use
+    name: ['python3-pip', 'python3-setuptools', 'python3-tqdm']
+    state: present
+  tags:
+  - packages
+  - web-data
+
+# The user task below sets "group: countme"; the user module does not create
+# that group itself, so make sure it exists first.
+- name: make countme group
+  group:
+    name: countme
+    state: present
+  tags:
+  - web-data
+
+- name: make countme user
+  user:
+    name: countme
+    group: countme
+    shell: /sbin/nologin
+    home: /srv/countme
+    comment: "DNF countme counter"
+  tags:
+  - web-data
+
+- name: checkout mirrors-countme from git
+  git:
+    repo: https://pagure.io/mirrors-countme
+    dest: /srv/countme/mirrors-countme
+    version: prod
+  register: gitcountme
+  tags:
+  - web-data
+
+# Call pip3 explicitly: the deps above are python3-*, and an unversioned
+# "pip" may be missing or belong to a different Python.
+- name: install mirrors-countme from git checkout
+  command: "pip3 install --no-index --no-deps /srv/countme/mirrors-countme"
+  when: "gitcountme is changed"
+  tags:
+  - web-data
+
+- name: make countme web subdir
+  file: path=/var/www/html/csv-reports/countme state=directory mode=0775 owner=countme group=countme
+  tags:
+  - web-data
+
+- name: make countme local data dir
+  file: path=/var/lib/countme state=directory mode=0775 owner=countme group=countme
+  tags:
+  - web-data
+
+- name: install countme script to parse new logs & update totals
+  copy: src=countme-update.sh dest=/usr/local/bin/ mode=0755
+  tags:
+  - web-data
+
+- name: install cron file to run countme-update.sh daily
+  copy: src=countme-update.cron dest=/etc/cron.d/ mode=0644
+  tags:
+  - web-data
+  - cron