-
Notifications
You must be signed in to change notification settings - Fork 0
/
dash007PMcsvImport.R
124 lines (103 loc) · 4.25 KB
/
dash007PMcsvImport.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
## collections-dashboard-prep
# Prep PM Museum catalogue data
#
# 1) Retrieve latest PM Museum CSV dataset for "All" collections from:
# https://www.PM.museum/collections/objects/data.php
#
# 2) Move zipped CSV to "/data01raw" folder within this project directory.
# 3) Unzip to a folder labelled "PM[YYYYMMDD]" -- e.g., "PM20180121"
# NOTE - Check that PM's CSV-naming convention matches "all-YYYYMMDD.csv"
#
# 4) Run this script
# - NOTE: May need to re-set the working directory to the folder containing
# the PM csv's (see the first setwd() call below)
print(paste(date(), "-- Starting PM data import -- dash007PMcsvImport.R"))

# Work inside the unzipped PM export folder so the CSVs can be read by bare
# file name. `origdir` is not defined in this script; it must already exist
# in the calling environment.
setwd(paste0(origdir, "/data01raw/PM20180121/"))

# Catalogue CSVs are expected to be named "all-YYYYMMDD.csv". The dot before
# "csv" is escaped so that only a literal ".csv" extension matches.
PMList <- list.files(pattern = "^all.*\\.csv$")
if (length(PMList) == 0) {
  # Without this guard rbind() over an empty list yields NULL and the script
  # fails later with an unrelated-looking error.
  stop("No PM catalogue CSVs matching 'all*.csv' found in ", getwd(),
       call. = FALSE)
}

# Stack every matching export into a single data frame.
CatPM01b <- do.call(rbind, lapply(PMList, read.csv, stringsAsFactors = FALSE))
# The first column is renamed to "irn"; it is the key used for the merge below.
colnames(CatPM01b)[1] <- "irn"

# Multimedia lookup table; it is read without a header row, so its columns
# are named explicitly here.
PM_MM <- read.csv(file = "UPMAA_all-data_multimedia.csv",
                  fileEncoding = "UTF-8",
                  header = FALSE,
                  stringsAsFactors = FALSE)
colnames(PM_MM) <- c("irn", "HasMM", "MultimediaGUID")

# Left join: keep every catalogue record, with or without a multimedia match.
CatPM01 <- merge(CatPM01b, PM_MM[1:3], by = "irn", all.x = TRUE)

# Switch to /data01raw/emuCat under `origdir` for the remainder of the script.
setwd(paste0(origdir, "/data01raw/emuCat"))
# Sort the merged records by irn and drop rows that are exact duplicates
# across all columns, then release the unsorted copy.
CatPM02 <- unique(CatPM01[order(CatPM01$irn), ])
rm(CatPM01)

# Number of distinct (non-NA) irn values.
CatIRNcount <- nlevels(as.factor(CatPM02$irn))

# Within each run of consecutive identical irn values (the rows are sorted,
# so equal irn's are adjacent) number the rows 1, 2, 3, ...
CatPM02[["IRNseq"]] <- sequence(rle(as.character(CatPM02$irn))$lengths)

# Earlier variant of the filter, kept for reference:
#CatPM03 <- CatPM02[which(nchar(as.character(CatPM02$DarGlobalUniqueIdentifier)) > 3 & CatPM02$IRNseq == 1),]

# Rows numbered above 1 repeat an irn already seen; set them aside in
# CatCheck. The first row of each irn goes forward as CatPM03, minus the
# helper counter column.
CatCheck <- CatPM02[CatPM02$IRNseq > 1, ]
CatPM03 <- dplyr::select(CatPM02[CatPM02$IRNseq == 1, ], -IRNseq)
# Fail fast if the PM export lacks a column used in the mapping below.
# `CatPM03$missing_col` evaluates to NULL, and data.frame()/paste() drop NULL
# inputs without warning, so a renamed or absent source column would
# otherwise silently produce an output CSV with a missing or truncated field.
local({
  required <- c("irn", "url", "provenience", "culture_area",
                "curatorial_section", "date_made_early", "period", "culture",
                "material", "native_name", "description", "technique",
                "iconography", "accession_credit_line", "creator",
                "object_name", "object_number", "HasMM")
  absent <- setdiff(required, names(CatPM03))
  if (length(absent) > 0) {
    stop("PM data is missing expected column(s): ",
         paste(absent, collapse = ", "), call. = FALSE)
  }
})

# Map the PM columns onto the output column set. Fields with no PM source are
# filled with "" (recycled to every row); multi-part fields are joined with
# " | ".
CatPM04 <- data.frame(
  "Group1_key" = paste0("PM", CatPM03$irn),
  "ecatalogue_key" = "",
  "irn" = CatPM03$irn,
  "DarGlobalUniqueIdentifier" = CatPM03$url,
  "AdmDateInserted" = "",
  "AdmDateModified" = "",
  "DarImageURL" = "",
  "DarIndividualCount" = "1",
  "DarBasisOfRecord" = "Artefact",
  "DarLatitude" = "",
  "DarLongitude" = "",
  # Pad the "|" delimiters in provenience with spaces.
  "DarCountry" = gsub("\\|", " | ", CatPM03$provenience), # check if Where LUT tries to build countries with this; if so, need to not use
  "DarContinent" = "",
  "DarContinentOcean" = CatPM03$culture_area,
  "DarWaterBody" = "",
  "DarCollectionCode" = CatPM03$curatorial_section,
  "DarEarliestAge" = "",
  "DarEarliestEon" = "",
  "DarEarliestEpoch" = "",
  "DarEarliestEra" = "",
  "DarEarliestPeriod" = CatPM03$date_made_early,
  "AttPeriod_tab" = CatPM03$period,
  "DesEthnicGroupSubgroup_tab" = CatPM03$culture,
  "DesMaterials_tab" = CatPM03$material,
  "DarOrder" = "",
  "DarScientificName" = "",
  "ClaRank" = "",
  "ComName_tab" = "",
  "DarRelatedInformation" = paste(CatPM03$native_name,
                                  CatPM03$description,
                                  CatPM03$technique,
                                  CatPM03$iconography,
                                  sep = " | "),
  "CatProject_tab" = paste(CatPM03$accession_credit_line,
                           CatPM03$creator,
                           sep = " | "),
  "DarYearCollected" = "",
  "DarMonthCollected" = "",
  "EcbNameOfObject" = CatPM03$object_name,
  "CatLegalStatus" = "",
  "CatDepartment" = "", # find out what Penn calls it
  "DarCatalogNumber" = CatPM03$object_number,
  "DarCollector" = "",
  "MulHasMultiMedia" = CatPM03$HasMM,
  "DarStateProvince" = "",
  "DarInstitutionCode" = "PM",
  # Spelled out as FALSE: the shorthand F is an ordinary variable that can be
  # reassigned.
  stringsAsFactors = FALSE
)
# # screen duplicate GUIDs ####
# PMcheck <- dplyr::count(CatPM04, DarGlobalUniqueIdentifier)
# PMcheckGUID <- PMcheck[PMcheck$n>1,]
# PMcheckFull <- CatPM04[which(CatPM04$DarGlobalUniqueIdentifier %in% PMcheckGUID$DarGlobalUniqueIdentifier),]
#
# if(NROW(PMcheckGUID)>0) {
# print(paste("Check 'PMcheck' CSVs for these records: ",
# NROW(PMcheckGUID), "duplicate GUIDs in ",
# NROW(PMcheckFull), "PM records"))
# write.csv(PMcheckFull,"PMcheck.csv", row.names = F, na="")
# } else {
# print(paste("No duplicate PM GUIDs; all clear!"))
# }
#
# CatPM05 <- CatPM04[!(CatPM04$DarGlobalUniqueIdentifier %in% PMcheckGUID$DarGlobalUniqueIdentifier),]
# Write the combined PM records out as a single CSV in the current working
# directory, without row names and with NA values written as empty fields.
write.csv(CatPM04, file = "GroupPM.csv", row.names = FALSE, na = "")

# Restore the working directory to `origdir`.
setwd(origdir)