title: "EDS-Pseudonymisation"
description: |
  This project aims to detect identifying entities in documents from AP-HP's Clinical Data Warehouse:

  | Label            | Description                                                   |
  | ---------------- | ------------------------------------------------------------- |
  | `ADRESSE`        | Street address, e.g. `33 boulevard de Picpus`                 |
  | `DATE`           | Any absolute date other than a birthdate                      |
  | `DATE_NAISSANCE` | Birthdate                                                     |
  | `HOPITAL`        | Hospital name, e.g. `Hôpital Rothschild`                      |
  | `IPP`            | Internal AP-HP identifier for patients, displayed as a number |
  | `MAIL`           | Email address                                                 |
  | `NDA`            | Internal AP-HP identifier for visits, displayed as a number   |
  | `NOM`            | Any last name (patients, doctors, third parties)              |
  | `PRENOM`         | Any first name (patients, doctors, etc.)                      |
  | `SECU`           | Social security number                                        |
  | `TEL`            | Any phone number                                              |
  | `VILLE`          | Any city                                                      |
  | `ZIP`            | Any zip code                                                  |

  To run the full pipeline (split the dataset, convert it, train the pipeline and evaluate it), simply run:

  ```shell
  spacy project run all
  ```

  If the pipeline detects that a command has already been run, it skips it unless its inputs have changed.
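
  You can also run a single command or workflow from the lists below, e.g.:

  ```shell
  spacy project run train
  ```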

# Variables can be referenced across the project.yml using ${vars.var_name}
vars:
  name: "pseudonymisation"
  lang: "eds"
  version: "0.2.0"
  data_folder: "data"
  if-dirs-exist: "fail"
  dev-percentage: 0.1667
  test-percentage: 0
  train_xml: "data/train/xml"
  train_txt: "data/train/txt"
  dev_xml: "data/dev/xml"
  dev_txt: "data/dev/txt"
  test_xml: "data/test/xml"
  test_txt: "data/test/txt"
  full_xml: "data/full/xml"
  full_txt: "data/full/txt"
  corpus: "corpus"
  training: "training"
  seed: 0
  fraction: 200
  gpu_id: 0

env:
  registry_token: GITLAB_REGISTRY_TOKEN
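# In spaCy projects, each entry under `env` maps a variable name to an
# environment variable: the token above is read from $GITLAB_REGISTRY_TOKEN
# and can be referenced in commands as ${env.registry_token}.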

# These are the directories that the project needs. The project CLI will make
# sure that they always exist.
directories:
  ["data", "corpus", "configs", "training", "scripts", "packages", "output"]

# Workflows are sequences of commands (see below) executed in order. You can
# run them via "spacy project run [workflow]". If a command's inputs/outputs
# haven't changed, it won't be re-run.
workflows:
  all:
    - split-dataset
    - convert
    - train
    - evaluate
  xp:
    - convert
    - train
    - evaluate
    - infer
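# For instance, "spacy project run xp" chains convert, train, evaluate and
# infer, skipping any step whose inputs have not changed.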

# Project commands, specified in a style similar to CI config files (e.g. Azure
# pipelines). The name is the command name that lets you trigger the command
# via "spacy project run [command] [path]". The help message is optional and
# shown when executing "spacy project run [optional command] [path] --help".
commands:
  - name: "split-dataset"
    help: "Split the full dataset into train, dev and test sets, using the dev and test percentages defined in the variables above"
    script:
      - "python scripts/split_dataset.py --data-folder ${vars.data_folder} --dev-percentage ${vars.dev-percentage} --test-percentage ${vars.test-percentage} --if-dirs-exist ${vars.if-dirs-exist}"
    deps:
      - "${vars.full_xml}"
      - "${vars.full_txt}"
    outputs:
      - "${vars.train_xml}"
      - "${vars.dev_xml}"
      - "${vars.test_xml}"
      - "${vars.train_txt}"
      - "${vars.dev_txt}"
      - "${vars.test_txt}"
- name: "convert"
help: "Convert the data to spaCy's binary format with all the needed tag format from the xml file obtained with inception and the plain text"
script:
- "python scripts/convert.py --lang ${vars.lang} --xml-path ${vars.train_xml} --txt-path ${vars.train_txt} --output-path ${vars.corpus}/train.spacy --split 'train' "
- "python scripts/convert.py --lang ${vars.lang} --xml-path ${vars.dev_xml} --txt-path ${vars.dev_txt} --output-path ${vars.corpus}/dev.spacy --split 'dev' "
- "python scripts/convert.py --lang ${vars.lang} --xml-path ${vars.test_xml} --txt-path ${vars.test_txt} --output-path ${vars.corpus}/test.spacy --split 'test' "
- "python scripts/convert.py --lang ${vars.lang} --xml-path ${vars.full_xml} --txt-path ${vars.full_txt} --output-path ${vars.corpus}/full.spacy --split 'full' "
deps:
- "${vars.train_xml}"
- "${vars.dev_xml}"
- "${vars.test_xml}"
- "${vars.full_xml}"
- "${vars.train_txt}"
- "${vars.dev_txt}"
- "${vars.test_txt}"
- "${vars.full_txt}"
outputs:
- "${vars.corpus}/train.spacy"
- "${vars.corpus}/dev.spacy"
- "${vars.corpus}/test.spacy"
- "${vars.corpus}/full.spacy"
- name: "create-config"
help: "Create a new config with an NER pipeline component"
script:
- "python -m spacy init config --lang ${vars.lang} --pipeline ner configs/config.cfg --force --gpu"
outputs:
- "configs/config.cfg"
- name: "train"
help: "Train the NER model"
script:
- "python -m spacy train configs/config.cfg --output ${vars.training} --paths.train ${vars.corpus}/train.spacy --paths.dev ${vars.corpus}/dev.spacy --nlp.lang ${vars.lang} --gpu-id ${vars.gpu_id}"
deps:
- "configs/config.cfg"
- "${vars.corpus}/train.spacy"
- "${vars.corpus}/dev.spacy"
outputs:
- "${vars.training}/model-best"
- name: "evaluate"
help: "Evaluate the model and export metrics"
script:
- "python scripts/evaluate.py ${vars.training}/model-best ${vars.corpus}/test.spacy --output ${vars.training}/test_metrics.json --docbin ${vars.corpus}/output.spacy --gpu-id ${vars.gpu_id}"
deps:
- "${vars.corpus}/test.spacy"
- "${vars.training}/model-best"
outputs:
- "${vars.corpus}/output.spacy"
- "${vars.training}/test_metrics.json"
- name: "infer"
help: "Infer the model on test documents"
script:
- "python scripts/infer.py --model ${vars.training}/model-best --data ${vars.corpus}/full.spacy --output ${vars.corpus}/output.spacy"
deps:
- "${vars.corpus}/full.spacy"
- "${vars.training}/model-best"
outputs:
- "${vars.corpus}/output.spacy"
- name: "package"
help: "Package the trained model as a pip package"
script:
- "python scripts/package.py ${vars.training}/model-best packages --name ${vars.name} --version ${vars.version} --force --build wheel --code eds_pseudonymisation"
deps:
- "${vars.training}/model-best"
outputs_no_cache:
- "packages/${vars.lang}_${vars.name}-${vars.version}/dist/${vars.lang}_${vars.name}-${vars.version}.whl"