Update combined datasets #4645
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Update combined datasets

# Every run joins the same concurrency group so that multiple workflows never
# try to run with the hosted runner at the same time.
concurrency: gce-runner
on:
  # 2023/01/17: There is no longer a run scheduled via GitHub Actions. Instead,
  # the prefect scrapers kick off a build after NYT updates each day:
  # https://github.com/covid-projections/can-scrapers/blob/643819f7a42d0ae227453b1de24a317a54c6cde8/services/prefect/flows/scheduled_nyt_and_parquet_updater.py#L37
  #
  # Previous schedule, kept for reference:
  # schedule:
  #   # Run job everyday at 5:00 am EST
  #   - cron: '0 10 * * *'

  # Manual / API-triggered runs. Both inputs are strings that default to "true".
  workflow_dispatch:
    inputs:
      trigger_api_build:
        description: 'If "true" API snapshot build will be triggered after dataset update.'
        default: "true"
      refresh_datasets:
        description: 'Set to "false" to skip downloading / re-combining the latest datasets.'
        default: "true"

  # Runs triggered through the repository dispatch API (no inputs are defined).
  repository_dispatch:
# Workflow-wide environment shared by every job below.
env:
  # Used by python code that reports errors to sentry.
  SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
  SENTRY_ENVIRONMENT: 'production'

  # Use a webhook to write to slack channel dev-alerts for QA.
  # NOTE(review): the Slack steps in this workflow read
  # secrets.SLACK_WEBHOOK_DEV_ALERTS, which is a different secret name from the
  # one used here - confirm that both secrets exist.
  SLACK_DEV_ALERTS_WEBHOOK: ${{ secrets.SLACK_DEV_ALERTS_WEBHOOK }}

  DATA_AVAILABILITY_SHEET_NAME: "Data Availability"
  GOOGLE_SHEETS_SERVICE_ACCOUNT_DATA: ${{ secrets.GOOGLE_SHEETS_SERVICE_ACCOUNT_DATA }}

  # Use trigger_api_build if specified, else use true. This automatically triggers
  # the API build on runs where trigger_api_build is not defined (any run that
  # was not started through workflow_dispatch).
  # https://github.community/t/how-can-you-use-expressions-as-the-workflow-dispatch-input-default/141454/4
  TRIGGER_API_BUILD: ${{ github.event.inputs.trigger_api_build || 'true' }}

  # Refresh the datasets unless refresh_datasets is explicitly "false". Comparing
  # against 'false' (rather than requiring 'true') keeps the declared default of
  # "true" in effect on runs where the input is not defined, matching how
  # TRIGGER_API_BUILD falls back above.
  REFRESH_DATASETS_ARG: ${{ (github.event.inputs.refresh_datasets != 'false') && '--refresh-datasets' || '--no-refresh-datasets' }}

  # The GCE instance to start / stop before / after running the job.
  GCE_ZONE: "us-west1-b"
  GCE_INSTANCE: "can-actions-runner"
jobs:
  # Starts the GCE instance named by env.GCE_INSTANCE before the update job runs.
  # NOTE(review): presumably this VM hosts the `gce-runner` runner used by the
  # next job - confirm.
  start-runner:
    runs-on: ubuntu-latest
    steps:
      - id: "auth"
        uses: "google-github-actions/auth@v1"
        with:
          credentials_json: "${{ secrets.GCE_ADMIN_SERVICE_ACCOUNT }}"

      - name: "Set up Cloud SDK"
        uses: "google-github-actions/setup-gcloud@v1"

      - name: "Start ${{env.GCE_INSTANCE}} VM."
        run: "gcloud compute instances start --zone ${{env.GCE_ZONE}} ${{env.GCE_INSTANCE}}"

  # Checks out covid-data-model, runs the dataset update, pushes the result and
  # optionally triggers an API snapshot build. Runs on the `gce-runner` runner.
  update-and-promote-datasets:
    needs: "start-runner"
    runs-on: gce-runner
    steps:
      # Export the triggering ref name so the checkout step below can use it.
      - name: Parse covid data model branch name and set env variable
        run: |
          echo "COVID_DATA_MODEL_REF=${GITHUB_REF_NAME}" >> "$GITHUB_ENV"

      # NOTE(review): actions/checkout@v2, actions/setup-python@v2 and
      # actions/cache@v1 are old major versions - confirm they are still
      # supported by the runner before relying on this workflow.
      - name: Checkout covid-data-model
        uses: actions/checkout@v2
        with:
          repository: act-now-coalition/covid-data-model
          path: covid-data-model
          ref: '${{env.COVID_DATA_MODEL_REF}}'
          lfs: true

      # --fail makes curl exit non-zero on an HTTP error instead of saving the
      # error response body as the anomalies CSV.
      - name: Update NYTimes anomalies file
        working-directory: ./covid-data-model
        run: |
          curl --fail https://raw.githubusercontent.com/nytimes/covid-19-data/master/rolling-averages/anomalies.csv --output data/nyt_anomalies.csv

      # The version is quoted so YAML keeps it a string rather than a float.
      - name: Setup Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.7'

      - name: Cache Pip
        uses: actions/cache@v1
        with:
          path: ~/.cache/pip
          # hashFiles() resolves patterns relative to GITHUB_WORKSPACE, and the
          # repository is checked out into ./covid-data-model, so the path must
          # include that directory. A pattern that matches nothing hashes to an
          # empty string, which would make the key identical on every run.
          key: ${{ runner.os }}-pip-${{ hashFiles('covid-data-model/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-pip-
            ${{ runner.os }}-

      - name: Install Dependencies
        working-directory: ./covid-data-model
        run: pip install -r requirements.txt

      - name: prune covid-data-model
        working-directory: ./covid-data-model
        run: git lfs prune

      # REFRESH_DATASETS_ARG is computed in the workflow-level env and is either
      # --refresh-datasets or --no-refresh-datasets.
      - name: Update and Promote dataset.
        working-directory: ./covid-data-model
        run: |
          ./run.py data update ${{env.REFRESH_DATASETS_ARG}}

      - name: Create Update Commit
        working-directory: ./covid-data-model
        run: ./tools/push-data-update.sh

      - name: Maybe Trigger API build
        if: env.TRIGGER_API_BUILD == 'true'
        working-directory: ./covid-data-model
        env:
          GITHUB_TOKEN: ${{ secrets.CAN_ROBOT_PERSONAL_ACCESS_TOKEN }}
        run: |
          ./tools/build-snapshot.sh main

      - name: Slack notification (API build started)
        if: env.TRIGGER_API_BUILD == 'true'
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_DEV_ALERTS }}
        uses: Ilshidur/action-slack@fb92a78a305a399cd6d8ede99d641f2b9224daf3
        with:
          args: 'Started new API build from dataset updater action.'

      # TODO(https://trello.com/c/4dFFtQiH/1239-fix-data-availability-report-or-officially-replace-it-with-toms-dashboard)
      # - name: Update Data Availability Sheet
      #   working-directory: ./covid-data-model
      #   run: |
      #     ./run.py data update-availability-report

      # Sent only when an earlier step in this job has failed.
      - name: Slack notification (failure)
        if: failure()
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_DEV_ALERTS }}
          STATUS: ${{job.status}}
        uses: Ilshidur/action-slack@fb92a78a305a399cd6d8ede99d641f2b9224daf3
        with:
          args: 'update-dataset-snapshot failed'

      # Sent only when every earlier step in this job has succeeded.
      - name: Slack notification (success)
        if: success()
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_DEV_ALERTS }}
          STATUS: ${{job.status}}
          DATA_AVAILABILITY_URL: http://tiny.cc/can-data
        uses: Ilshidur/action-slack@fb92a78a305a399cd6d8ede99d641f2b9224daf3
        with:
          args: 'update-dataset-snapshot succeeded. View Data Availability Report at {{DATA_AVAILABILITY_URL}}'

  # Stops the GCE instance again. always() makes this job run even when one of
  # the jobs it needs has failed or was cancelled.
  stop-runner:
    if: ${{ always() }}
    needs: ["start-runner", "update-and-promote-datasets"]
    runs-on: ubuntu-latest
    steps:
      - id: "auth"
        uses: "google-github-actions/auth@v1"
        with:
          credentials_json: "${{ secrets.GCE_ADMIN_SERVICE_ACCOUNT }}"

      - name: "Set up Cloud SDK"
        uses: "google-github-actions/setup-gcloud@v1"

      - name: "Stop ${{env.GCE_INSTANCE}} VM."
        run: "gcloud compute instances stop --zone ${{env.GCE_ZONE}} ${{env.GCE_INSTANCE}}"