-
Notifications
You must be signed in to change notification settings - Fork 0
/
IPTdwc_crystal_prep.R
73 lines (49 loc) · 1.85 KB
/
IPTdwc_crystal_prep.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# Script to prep DwC datasets processed via Crystal Rpts
library('readr')
# Input filepaths for extension & core CSV files (both are .zip paths that are
# passed straight to read_csv further down):
#   ext_input_file  -- extension records; later filtered by occurrenceID
#   core_input_file -- raw core records; its encoding is guessed before reading
ext_input_file <- "data02output/field_media_mammal_obs.zip"
core_input_file <- "data02output/field_ipt_mammal_obs_raw.zip"
# Corresponding Output filepaths (both are written with write.csv):
#   ext_output_file  -- extension rows whose occurrenceID is present in core
#   core_output_file -- core dataset after linebreaks are replaced by piper()
ext_output_file <- "data02output/ext_check_ids.csv"
core_output_file <- "data02output/field_ipt_mammal_obs.csv"
# Strip linebreaks
# # Function to check & replace linebreaks
#
# Replaces every newline ("\n") and every carriage return ("\r") in each
# column of a data frame with a pipe ("|"). Each character is replaced on its
# own, so a "\r\n" pair becomes "||".
#
# Args:
#   x: a data frame (or tibble).
#
# Returns:
#   `x` with every column converted to character (gsub() coerces its input)
#   and linebreaks replaced. A data frame without columns is returned as-is.
piper <- function(x) {
  cols <- seq_len(NCOL(x))
  # `1:NCOL(x)` would be c(1, 0) for a zero-column input and make the
  # column subsetting below fail, so bail out early instead.
  if (length(cols) == 0L) {
    return(x)
  }
  # lapply() always yields one character vector per column; sapply() would
  # change shape (vector / matrix / list) depending on the number of rows.
  x[cols] <- lapply(x[cols], function(y) gsub("\\n|\\r", "|", y))
  x
}
# Load both tables, then cross-check the extension's occurrenceIDs against core
ext <- read_csv(ext_input_file, guess_max = 100000)
# The core file's encoding is guessed first and handed to the reader
input_encoding <- guess_encoding(core_input_file, n_max = 1000000)
core <- read_csv(
  core_input_file,
  locale = readr::locale(encoding = input_encoding$encoding[1]),
  guess_max = 1000000
)
# Split extension rows by whether their occurrenceID has a match in core
ext_no_ipt <- ext[
  which(is.na(match(ext$occurrenceID, core$occurrenceID))),
]
ext_yes_ipt <- ext[
  which(!is.na(match(ext$occurrenceID, core$occurrenceID))),
]
# Report whether occurrenceID is unique across the core rows
if (length(unique(core$occurrenceID)) != nrow(core)) {
  # Tally rows per occurrenceID and keep the ones seen more than once
  occID_check <- dplyr::count(core, occurrenceID)
  occID_dups <- occID_check[occID_check$n > 1, ]
  print("WARNING: duplicate occurrenceIDs found:")
  print(occID_dups)
  print("See `occID_dups` dataframe for list")
} else {
  print(paste(
    "No duplicate occurrenceIDs found in",
    nrow(core),
    "rows -- Ready for core"
  ))
}
# Clean core dataset: run every column through piper() to replace linebreaks
core_2 <- piper(core)
# Exports
# Extension: write only the records whose id's are in the Core dataset
utils::write.csv(
  x = ext_yes_ipt,
  file = ext_output_file,
  na = "",
  quote = TRUE,
  row.names = FALSE
)
# Core: write the cleaned DwC dataset
utils::write.csv(
  x = core_2,
  file = core_output_file,
  na = "",
  quote = TRUE,
  row.names = FALSE
)