title: "EDS-Pseudonymisation"
description: |
  This project aims to detect identifying entities in documents from AP-HP's Clinical Data Warehouse:

  | Label            | Description                                                   |
  | ---------------- | ------------------------------------------------------------- |
  | `ADRESSE`        | Street address, e.g. `33 boulevard de Picpus`                 |
  | `DATE`           | Any absolute date other than a birthdate                      |
  | `DATE_NAISSANCE` | Birthdate                                                     |
  | `HOPITAL`        | Hospital name, e.g. `Hôpital Rothschild`                      |
  | `IPP`            | Internal AP-HP identifier for patients, displayed as a number |
  | `MAIL`           | Email address                                                 |
  | `NDA`            | Internal AP-HP identifier for visits, displayed as a number   |
  | `NOM`            | Any last name (patients, doctors, third parties)              |
  | `PRENOM`         | Any first name (patients, doctors, etc.)                      |
  | `SECU`           | Social security number                                        |
  | `TEL`            | Any phone number                                              |
  | `VILLE`          | Any city                                                      |
  | `ZIP`            | Any zip code                                                  |

  To run the full pipeline (split the dataset, convert it, train the pipeline and evaluate it), simply run:

  ```shell
  spacy project run all
  ```

  If the pipeline detects that a command has already been run, it skips it unless its inputs have changed.
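
  You can also run a single command or workflow from the lists below, e.g.:

  ```shell
  spacy project run train
  ```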

# Variables can be referenced across the project.yml using ${vars.var_name}
vars:
  name: "pseudonymisation"
  lang: "eds"
  version: "0.2.0"
  data_folder: "data"
  if-dirs-exist: "fail"
  dev-percentage: 0.1667
  test-percentage: 0
  train_xml: "data/train/xml"
  train_txt: "data/train/txt"
  dev_xml: "data/dev/xml"
  dev_txt: "data/dev/txt"
  test_xml: "data/test/xml"
  test_txt: "data/test/txt"
  full_xml: "data/full/xml"
  full_txt: "data/full/txt"
  corpus: "corpus"
  training: "training"
  seed: 0
  fraction: 200
  gpu_id: 0

env:
  registry_token: GITLAB_REGISTRY_TOKEN
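# In spaCy projects, each entry under `env` maps a variable name to an
# environment variable: the token above is read from $GITLAB_REGISTRY_TOKEN
# and can be referenced in commands as ${env.registry_token}.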

# These are the directories that the project needs. The project CLI will make
# sure that they always exist.
directories:
  ["data", "corpus", "configs", "training", "scripts", "packages", "output"]

# Workflows are sequences of commands (see below) executed in order. You can
# run them via "spacy project run [workflow]". If a command's inputs/outputs
# haven't changed, it won't be re-run.
workflows:
  all:
    - split-dataset
    - convert
    - train
    - evaluate
  xp:
    - convert
    - train
    - evaluate
    - infer
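# For instance, "spacy project run xp" chains convert, train, evaluate and
# infer, skipping any step whose inputs have not changed.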

# Project commands, specified in a style similar to CI config files (e.g. Azure
# pipelines). The name is the command name that lets you trigger the command
# via "spacy project run [command] [path]". The help message is optional and
# shown when executing "spacy project run [optional command] [path] --help".
commands:
  - name: "split-dataset"
    help: "Split the full dataset into train, dev and test sets, using the dev and test percentages defined in the variables above"
    script:
      - "python scripts/split_dataset.py --data-folder ${vars.data_folder} --dev-percentage ${vars.dev-percentage} --test-percentage ${vars.test-percentage} --if-dirs-exist ${vars.if-dirs-exist}"
    deps:
      - "${vars.full_xml}"
      - "${vars.full_txt}"
    outputs:
      - "${vars.train_xml}"
      - "${vars.dev_xml}"
      - "${vars.test_xml}"
      - "${vars.train_txt}"
      - "${vars.dev_txt}"
      - "${vars.test_txt}"
- name: "convert"
help: "Convert the data to spaCy's binary format with all the needed tag format from the xml file obtained with inception and the plain text"
script:
- "python scripts/convert.py --lang ${vars.lang} --xml-path ${vars.train_xml} --txt-path ${vars.train_txt} --output-path ${vars.corpus}/train.spacy --split 'train' "
- "python scripts/convert.py --lang ${vars.lang} --xml-path ${vars.dev_xml} --txt-path ${vars.dev_txt} --output-path ${vars.corpus}/dev.spacy --split 'dev' "
- "python scripts/convert.py --lang ${vars.lang} --xml-path ${vars.test_xml} --txt-path ${vars.test_txt} --output-path ${vars.corpus}/test.spacy --split 'test' "
- "python scripts/convert.py --lang ${vars.lang} --xml-path ${vars.full_xml} --txt-path ${vars.full_txt} --output-path ${vars.corpus}/full.spacy --split 'full' "
deps:
- "${vars.train_xml}"
- "${vars.dev_xml}"
- "${vars.test_xml}"
- "${vars.full_xml}"
- "${vars.train_txt}"
- "${vars.dev_txt}"
- "${vars.test_txt}"
- "${vars.full_txt}"
outputs:
- "${vars.corpus}/train.spacy"
- "${vars.corpus}/dev.spacy"
- "${vars.corpus}/test.spacy"
- "${vars.corpus}/full.spacy"
- name: "create-config"
help: "Create a new config with an NER pipeline component"
script:
- "python -m spacy init config --lang ${vars.lang} --pipeline ner configs/config.cfg --force --gpu"
outputs:
- "configs/config.cfg"
- name: "train"
help: "Train the NER model"
script:
- "python -m spacy train configs/config.cfg --output ${vars.training} --paths.train ${vars.corpus}/train.spacy --paths.dev ${vars.corpus}/dev.spacy --nlp.lang ${vars.lang} --gpu-id ${vars.gpu_id}"
deps:
- "configs/config.cfg"
- "${vars.corpus}/train.spacy"
- "${vars.corpus}/dev.spacy"
outputs:
- "${vars.training}/model-best"
- name: "evaluate"
help: "Evaluate the model and export metrics"
script:
- "python scripts/evaluate.py ${vars.training}/model-best ${vars.corpus}/test.spacy --output ${vars.training}/test_metrics.json --docbin ${vars.corpus}/output.spacy --gpu-id ${vars.gpu_id}"
deps:
- "${vars.corpus}/test.spacy"
- "${vars.training}/model-best"
outputs:
- "${vars.corpus}/output.spacy"
- "${vars.training}/test_metrics.json"
- name: "infer"
help: "Infer the model on test documents"
script:
- "python scripts/infer.py --model ${vars.training}/model-best --data ${vars.corpus}/full.spacy --output ${vars.corpus}/output.spacy"
deps:
- "${vars.corpus}/full.spacy"
- "${vars.training}/model-best"
outputs:
- "${vars.corpus}/output.spacy"
- name: "package"
help: "Package the trained model as a pip package"
script:
- "python scripts/package.py ${vars.training}/model-best packages --name ${vars.name} --version ${vars.version} --force --build wheel --code eds_pseudonymisation"
deps:
- "${vars.training}/model-best"
outputs_no_cache:
- "packages/${vars.lang}_${vars.name}-${vars.version}/dist/${vars.lang}_${vars.name}-${vars.version}.whl"