From b77ea1eeeb0e6f2b3c672ecaebb0a1f18b24927d Mon Sep 17 00:00:00 2001 From: Sean Browning Date: Thu, 9 Mar 2023 19:47:13 +0000 Subject: [PATCH 01/10] Updating OWID lookup --- R/lookups.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/lookups.R b/R/lookups.R index 20e8cd6..6de1e45 100644 --- a/R/lookups.R +++ b/R/lookups.R @@ -139,7 +139,9 @@ manual_iso3_lk <- list( #' to be updated as needed. datasource_lk <- list( # OWID cases and deaths - owid_all = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv", + # SB Note: Beginning Mar 8, 2023 OWID has ceased pulling from JHU + # and JHU will cease operations itself on Mar 10, 2023. This will contain legacy data + owid_all = "https://covid.ourworldindata.org/data/owid-covid-data-old.csv", # OWID Testing dataset owid_testing = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/testing/covid-testing-all-observations.csv", # Testing data and metadata from FIND From aea32c55011bc734f52882b1fd15d8e25b72ed0e Mon Sep 17 00:00:00 2001 From: Sean Browning Date: Thu, 9 Mar 2023 21:10:47 +0000 Subject: [PATCH 02/10] Adding HK and Taiwan data from primary sources - Updating get_covid_df to pull HK and Taiwan data from primary source - Seems to align with JHU closely --- R/get_covid_sources.R | 113 ++++++++++++++++++++++++++++++++++++++++-- R/lookups.R | 5 ++ 2 files changed, 114 insertions(+), 4 deletions(-) diff --git a/R/get_covid_sources.R b/R/get_covid_sources.R index a637e60..a71943a 100644 --- a/R/get_covid_sources.R +++ b/R/get_covid_sources.R @@ -4,19 +4,19 @@ #' @description Get and prepare COVID data. #' #' Pull in current case and death counts from WHO source. -#' For disaggregated China, Taiwan, Hong Kong, and Macau data we pull from John Hopkins source. +#' For disaggregated China, Taiwan, Hong Kong, and Macau data we pull from primary sources. 
#' #' #' @return Returns a data frame with n rows and 8 columns, including: #' \itemize{ #' \item{\code{date}}{ date Date of observation} #' \item{\code{iso2code}}{ character ISO 3166-1 alpha-2 country code} -#' \item{\code{country}}{ character WHO/JHU english country name} +#' \item{\code{country}}{ character WHO english country name} #' \item{\code{new_cases}}{ integer Number of new cases reported on date} #' \item{\code{cumulative_cases}}{ integer Number of cumulative cases to date} #' \item{\code{new_deaths}}{ integer Number of new deaths reported on date} #' \item{\code{cumulative_deaths}}{ integer Number of cumulative deaths to date} -#' \item{\code{source}}{ character Data Source (JHU, WHO)} +#' \item{\code{source}}{ character Data Source} #' } #' @import dplyr #' @importFrom data.table fread @@ -100,7 +100,112 @@ get_covid_df <- function() { ) %>% arrange(country, date) - df <- bind_rows(who_data, jhu_data) + hk_data <- get_hk_data() + tw_data <- get_taiwan_data() + df <- bind_rows(who_data, jhu_data, hk_data, tw_data) return(df) } + +get_hk_data <- function() { + hk_data_raw <- fread(datasource_lk$hk_case_deaths, stringsAsFactors = FALSE, encoding = "UTF-8", data.table = FALSE) |> + as_tibble() + + hk_data_raw[["pcr_and_rat"]] <- rowSums( + hk_data_raw[, c("Number of cases tested positive for SARS-CoV-2 virus by nucleic acid tests", "Number of cases tested positive for SARS-CoV-2 virus by rapid antigen tests")], + na.rm = TRUE + ) + + hk_data <- hk_data_raw |> + mutate( + date = as.Date(`As of date`, "%d/%m/%Y"), + iso2code = "HK", + country = "Hong Kong", + source = "HK CHP", + # Number of confirmed cases used to be used + # prior to Omicron wave, but was replaced by + # the two other vars that stratified by PCR or RAT pos + cumulative_cases = case_when( + !is.na(`Number of confirmed cases`) ~ as.numeric(`Number of confirmed cases`), + pcr_and_rat != 0 ~ pcr_and_rat, + TRUE ~ NA_real_ + ) + ) |> + rename(cumulative_deaths = `Number of death cases`) |> + # 
Cumulative case reporting stopped for some reason + # so we need to fill downwards to continue it + arrange(date) |> + tidyr::fill(cumulative_cases, cumulative_deaths) |> + mutate( + # Started tracking new deaths via this variable in Jan 2023 + cumulative_deaths = if_else( + !is.na(`Number of death cases related to COVID-19`), + cumulative_deaths + cumsum(tidyr::replace_na(`Number of death cases related to COVID-19`, 0)), + cumulative_deaths + ), + # Started tracking new cases via this variable in Jan 2023 + cumulative_cases = if_else( + !is.na(`Number of positive nucleic acid test laboratory detections`), + cumulative_cases + cumsum(tidyr::replace_na(`Number of positive nucleic acid test laboratory detections`, 0)), + cumulative_cases + ), + new_cases = cumulative_cases - lag(cumulative_cases, default = 0), + new_deaths = cumulative_deaths - lag(cumulative_deaths, default = 0) + ) |> + select(date, iso2code, country, new_cases, cumulative_cases, new_deaths, cumulative_deaths, source) + + return(hk_data) +} + +get_taiwan_data <- function() { + tw_case_raw <- data.table::fread( + datasource_lk$taiwan_cases, + encoding = "UTF-8", + data.table = FALSE + ) + + tw_death_raw <- data.table::fread( + datasource_lk$taiwan_deaths, + encoding = "UTF-8", + data.table = FALSE + ) + + tw_cases <- tw_case_raw |> + rename( + date = `個案研判日`, + cases = `確定病例數` + ) |> + mutate( + date = as.Date(date, "%Y/%m/%d") + ) |> + group_by(date) |> + summarise( + new_cases = sum(cases, na.rm = T) + ) |> + ungroup() |> + arrange(date) |> + mutate(cumulative_cases = cumsum(new_cases)) + + tw_deaths <- tw_death_raw |> + rename( + date = `發病日`, + deaths = `死亡病例數` + ) |> + mutate(date = as.Date(date, "%Y/%m/%d")) |> + group_by(date) |> + summarise(new_deaths = sum(deaths, na.rm = T)) |> + arrange(date) |> + mutate(cumulative_deaths = cumsum(new_deaths)) + + tw_data <- full_join( + tw_cases, tw_deaths, + by = "date" + ) |> + mutate( + iso2code = "TW", + country = "Taiwan", + source = "Taiwan CDC" + ) 
+ + return(tw_data) +} diff --git a/R/lookups.R b/R/lookups.R index 6de1e45..0944361 100644 --- a/R/lookups.R +++ b/R/lookups.R @@ -142,6 +142,11 @@ datasource_lk <- list( # SB Note: Beginning Mar 8, 2023 OWID has ceased pulling from JHU # and JHU will cease operations itself on Mar 10, 2023. This will contain legacy data owid_all = "https://covid.ourworldindata.org/data/owid-covid-data-old.csv", + # HK Cases and Deaths + hk_case_deaths = "http://www.chp.gov.hk/files/misc/latest_situation_of_reported_cases_covid_19_eng.csv", + # Taiwan Cases and Deaths + taiwan_cases = "https://data.cdc.gov.tw/en/download?resourceid=a65c7cb5-8a3c-4859-a27a-9019f65dd66e&dataurl=https://od.cdc.gov.tw/eic/Day_Confirmation_Age_County_Gender_19CoV.csv", + taiwan_deaths = "https://data.cdc.gov.tw/en/download?resourceid=a12dfeba-0dea-4b3f-b1b0-1bf3524b3ca9&dataurl=https://od.cdc.gov.tw/eic/open_data_death_date_statistics_19CoV_5.csv", # OWID Testing dataset owid_testing = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/testing/covid-testing-all-observations.csv", # Testing data and metadata from FIND From 4baccc9504eaea197fd7bd533e11644ca6b41bac Mon Sep 17 00:00:00 2001 From: Sean Browning <25888380+beansrowning@users.noreply.github.com> Date: Thu, 9 Mar 2023 17:02:03 -0500 Subject: [PATCH 03/10] Fixing weird coercion error with int/double --- R/get_covid_sources.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/get_covid_sources.R b/R/get_covid_sources.R index a71943a..31d7c6a 100644 --- a/R/get_covid_sources.R +++ b/R/get_covid_sources.R @@ -140,14 +140,14 @@ get_hk_data <- function() { # Started tracking new deaths via this variable in Jan 2023 cumulative_deaths = if_else( !is.na(`Number of death cases related to COVID-19`), - cumulative_deaths + cumsum(tidyr::replace_na(`Number of death cases related to COVID-19`, 0)), - cumulative_deaths + as.double(cumulative_deaths + cumsum(tidyr::replace_na(`Number of death cases related to 
COVID-19`, 0))), + as.double(cumulative_deaths) ), # Started tracking new cases via this variable in Jan 2023 cumulative_cases = if_else( !is.na(`Number of positive nucleic acid test laboratory detections`), - cumulative_cases + cumsum(tidyr::replace_na(`Number of positive nucleic acid test laboratory detections`, 0)), - cumulative_cases + as.double(cumulative_cases + cumsum(tidyr::replace_na(`Number of positive nucleic acid test laboratory detections`, 0))), + as.double(cumulative_cases) ), new_cases = cumulative_cases - lag(cumulative_cases, default = 0), new_deaths = cumulative_deaths - lag(cumulative_deaths, default = 0) From d04e86b8326e780d8c68a20326f0a4b6628a8df5 Mon Sep 17 00:00:00 2001 From: Sean Browning <25888380+beansrowning@users.noreply.github.com> Date: Fri, 10 Mar 2023 10:21:28 -0500 Subject: [PATCH 04/10] Fixing unicode issue on read in for testing data --- R/get_testing.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/get_testing.R b/R/get_testing.R index 9e719d0..1ad9e62 100644 --- a/R/get_testing.R +++ b/R/get_testing.R @@ -68,7 +68,7 @@ get_owid_testing_long <- function(find_maxgap = 31, flag_test_increase = 5) { stop("Check testing dataset in get_testing_long() -- multiple values per country-date") } - full_OWID <- data.table::fread(datasource_lk$owid_all, data.table = F, showProgress = F, verbose = F) %>% + full_OWID <- data.table::fread(datasource_lk$owid_all, data.table = F, showProgress = F, verbose = F, encoding = "UTF-8") %>% rename(id = iso_code) %>% mutate(date = as.Date(date)) %>% mutate(id = recode(id, "OWID_KOS" = "XKX")) %>% From 9b53215a9092ea864197a9bc247d04c697f630e7 Mon Sep 17 00:00:00 2001 From: Sean Browning <25888380+beansrowning@users.noreply.github.com> Date: Fri, 10 Mar 2023 12:32:42 -0500 Subject: [PATCH 05/10] Re-working how get_covid_df() works - Splitting all sources into their own getter functions - Adding "source" argument, which defaults to pulling "all" as it has before, but also allow for 
specific data cuts - Updating docs --- DESCRIPTION | 2 +- R/get_covid_sources.R | 53 ++++++++++++++++++++++++++++++++++++------- man/datasource_lk.Rd | 2 +- man/get_covid_df.Rd | 16 +++++++++---- 4 files changed, 59 insertions(+), 14 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index dcc4b8b..22dae07 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -39,7 +39,7 @@ Imports: bit64 Encoding: UTF-8 LazyData: true -RoxygenNote: 7.2.0 +RoxygenNote: 7.2.1 Roxygen: list(markdown = TRUE) Suggests: rmarkdown, diff --git a/R/get_covid_sources.R b/R/get_covid_sources.R index 31d7c6a..44e0ac2 100644 --- a/R/get_covid_sources.R +++ b/R/get_covid_sources.R @@ -4,8 +4,14 @@ #' @description Get and prepare COVID data. #' #' Pull in current case and death counts from WHO source. -#' For disaggregated China, Taiwan, Hong Kong, and Macau data we pull from primary sources. +#' For disaggregated China, Taiwan, Hong Kong, and Macau data we pull from JHU or primary sources. #' +#' @param sources one of "all", "WHO", "WHO+JHU", "WHO+Primary" specifying the data sources to pull from. See details. +#' +#' @details +#' In legacy versions, the default was to pull "all" sources, which included the WHO case/death time-series and JHU data for China Mainland, HK, Macau, and Taiwan. +#' Due to sun-setting and changes in reporting, we now capture HK and Taiwan data from primary sources ("WHO+Primary"). Note that this also includes JHU data on Macau +#' which will be reported thru Mar 10, 2023 when JHU closes their dashboard. 
#' #' @return Returns a data frame with n rows and 8 columns, including: #' \itemize{ @@ -22,7 +28,38 @@ #' @importFrom data.table fread #' @export -get_covid_df <- function() { +get_covid_df <- function(sources = c("all", "WHO", "WHO+JHU", "WHO+Primary")) { + sources <- match.arg(sources) + + out <- get_who_data() + + if (sources == "WHO") { + return(out) + } + + jhu_data <- get_jhu_data() + out <- bind_rows(out, jhu_data) + + if (sources == "WHO+JHU") { + return(out) + } + + hk_data <- get_hk_data() + tw_data <- get_taiwan_data() + + out <- bind_rows(out, hk_data, tw_data) + + # Keep only Macau data from JHU if we want primary sources + WHO + # else, keep all of it + if (sources %in% c("WHO+Primary")) { + out <- out |> + filter(!(source == "JHU" & country %in% c("Hong Kong", "China", "Taiwan"))) + } + + return(out) +} + +get_who_data <- function() { who_data <- fread(datasource_lk$who_all, stringsAsFactors = FALSE, encoding = "UTF-8") %>% rename_all(tolower) %>% rename(iso2code = country_code) %>% @@ -42,7 +79,11 @@ get_covid_df <- function() { source = "WHO" ) %>% select(-who_region) + + return(who_data) +} +get_jhu_data <- function() { jhu_cases <- fread(datasource_lk$jhu_case, stringsAsFactors = FALSE, check.names = FALSE) %>% rename_all(tolower) %>% filter(`country/region` %in% c("Taiwan*", "China")) %>% @@ -99,12 +140,8 @@ get_covid_df <- function() { source = "JHU" ) %>% arrange(country, date) - - hk_data <- get_hk_data() - tw_data <- get_taiwan_data() - df <- bind_rows(who_data, jhu_data, hk_data, tw_data) - - return(df) + + return(jhu_data) } get_hk_data <- function() { diff --git a/man/datasource_lk.Rd b/man/datasource_lk.Rd index d984f01..5d123ce 100644 --- a/man/datasource_lk.Rd +++ b/man/datasource_lk.Rd @@ -6,7 +6,7 @@ \title{A list of all data sources used in the package to be updated as needed.} \format{ -An object of class \code{list} of length 16. +An object of class \code{list} of length 19. 
} \usage{ datasource_lk diff --git a/man/get_covid_df.Rd b/man/get_covid_df.Rd index 262932b..d8ae54d 100644 --- a/man/get_covid_df.Rd +++ b/man/get_covid_df.Rd @@ -4,24 +4,32 @@ \alias{get_covid_df} \title{get_covid_df} \usage{ -get_covid_df() +get_covid_df(sources = c("all", "WHO", "WHO+JHU", "WHO+Primary")) +} +\arguments{ +\item{sources}{one of "all", "WHO", "WHO+JHU", "WHO+Primary" specifying the data sources to pull from. See details.} } \value{ Returns a data frame with n rows and 8 columns, including: \itemize{ \item{\code{date}}{ date Date of observation} \item{\code{iso2code}}{ character ISO 3166-1 alpha-2 country code} -\item{\code{country}}{ character WHO/JHU english country name} +\item{\code{country}}{ character WHO english country name} \item{\code{new_cases}}{ integer Number of new cases reported on date} \item{\code{cumulative_cases}}{ integer Number of cumulative cases to date} \item{\code{new_deaths}}{ integer Number of new deaths reported on date} \item{\code{cumulative_deaths}}{ integer Number of cumulative deaths to date} -\item{\code{source}}{ character Data Source (JHU, WHO)} +\item{\code{source}}{ character Data Source} } } \description{ Get and prepare COVID data. Pull in current case and death counts from WHO source. -For disaggregated China, Taiwan, Hong Kong, and Macau data we pull from John Hopkins source. +For disaggregated China, Taiwan, Hong Kong, and Macau data we pull from JHU or primary sources. +} +\details{ +In legacy versions, the default was to pull "all" sources, which included the WHO case/death time-series and JHU data for China Mainland, HK, Macau, and Taiwan. +Due to sun-setting and changes in reporting, we now capture HK and Taiwan data from primary sources ("WHO+Primary"). Note that this also includes JHU data on Macau +which will be reported thru Mar 10, 2023 when JHU closes their dashboard. 
} From d011af042ca5c53c444ccd4d6bae5ba8639a1f99 Mon Sep 17 00:00:00 2001 From: Sean Browning <25888380+beansrowning@users.noreply.github.com> Date: Fri, 10 Mar 2023 13:14:17 -0500 Subject: [PATCH 06/10] Updating get_combined_table to allow for new data sources - Using switch() with the new get_covid_df() function streamlines things a bit --- R/get_combined_table.R | 62 +++++++++++++++++++++++++-------------- man/get_combined_table.Rd | 30 +++++++++++-------- 2 files changed, 58 insertions(+), 34 deletions(-) diff --git a/R/get_combined_table.R b/R/get_combined_table.R index fe6fa90..7f035f5 100644 --- a/R/get_combined_table.R +++ b/R/get_combined_table.R @@ -1,36 +1,61 @@ #' @title A function to retrieve a dataframe (df) with combined Case/Death/Vaccine data by country #' -#' @param type (character) Specifies whether df should include disaggregated China data ("Both" separates China, Taiwan, Hong Kong, and Macau data) or combined China data ("WHO" combines China, Taiwan, Hong Kong, and Macau data as China) +#' @param type (character) Specifies what data streams to include for case/death data. See details for further information +#' whether df should include disaggregated China data ("Both" separates China, Taiwan, Hong Kong, and Macau data) or combined China data ("WHO" combines China, Taiwan, Hong Kong, and Macau data as China) #' @param geometry (logical, default: FALSE) Specifies whether df should include the geometry column #' #' @returns Returns an object of class \code{data.frame} with n rows and 56(57, if \code{geometry = TRUE}) columns #' #' @seealso [get_covid_df()], [get_vax()], and [calc_add_risk()] for full column data documentation #' +#' @details +#' The `type` argument used to take two values: "WHO" and "Both", referring to whether to take WHO data as-is, or to supplement WHO data with disaggregated China data from JHU. +#' In early Jan 2023, China CDC ceased providing daily COVID-19 updates, so the Mainland China data provided by JHU also stopped. 
On Mar 10, 2023 JHU closed their dashboard entirely, +#' so new sources had to be located for HK, Macau, and Taiwan data. +#' +#' For legacy analyses, the old behavior for "Both" is now available as "legacy" +#' +#' The new "Both" type pulls data from HK CHP, Taiwan CDC, and JHU (for Macau data thru Mar 10) in addition to the China data in WHO (which also includes Taiwan, HK, and Macau data). +#' Because data from HK and Taiwan are duplicated in this way, you should not use data from the "Both" option to compute regional or global trends. #' @examples #' \dontrun{ #' # Get the df that combines China with Taiwan, Hong Kong, and Macau data #' df_who <- get_combined_table("WHO") #' print(df_who) -#' # Get the df that uses both disagreggated China, Taiwan, Hong Kong, and Macau data (WHO + JHU= "Both") +#' # Get the df that combines WHO China data (aggregated) with disaggregated entries for HK, Taiwan, and Macau (from JHU thru Mar 10, 2023) #' df_both <- get_combined_table("Both") #' print(df_both) -#' -#' # get_combined_table() is identical to the following sequence: -#' onetable %>% -#' select(-geometry) %>% # In the case that geometry = FALSE -#' right_join(get_covid_df(), by = "iso2code") %>% -#' filter(source == "WHO") %>% # In the case of type = "WHO" -#' # filter(!(country == "China" & source == "WHO")) %>% # In the case of type = "Both" -#' calc_add_risk() %>% -#' left_join(get_vax(), by = c("id", "date")) +#' # Get the df that uses both disaggregated China, Taiwan, Hong Kong, and Macau data (WHO + JHU = "legacy") +#' # (JHU sunset on Mar 10, 2023 and China mainland data ceased earlier in the year) +#' df_both <- get_combined_table("legacy") +#' print(df_both) #' } +#' @md #' @export -get_combined_table <- function(type = c("WHO", "Both"), geometry = FALSE) { +get_combined_table <- function(type = c("WHO", "Both", "legacy"), geometry = FALSE) { type <- match.arg(type) + + case_death_df <- switch( + type, + WHO = get_covid_df("WHO"), + Both = 
get_covid_df("WHO+Primary"), + legacy = get_covid_df("WHO+JHU") + ) + + if (type == "legacy") { + # How "Both" used to work before data stopped flowing in: + # - WHO data for everything except for China where we use JHU to replace + # china mainland data, HK, Macau, and Taiwan. + # As of 3/10/2023, these data won't be updated by JHU, and China mainland data + # haven't been updating since early Jan 2023 in JHU. + # But I'll leave in for historical analyses. + case_death_df <- filter( + case_death_df, + !(country == "China" & source == "WHO") + ) + } - case_death_df <- get_covid_df() vax_df <- get_vax() meta_df <- onetable @@ -45,17 +70,10 @@ get_combined_table <- function(type = c("WHO", "Both"), geometry = FALSE) { out <- meta_df %>% right_join(case_death_df, by = "iso2code") - # If we want Taiwan / HK / Macau, remove china estimates - # and keep JHU - if (type == "Both") { - out <- filter(out, !(country == "China" & source == "WHO")) - } else { - # If we only want WHO data, remove the JHU rows - out <- filter(out, source == "WHO") - } - out <- out %>% calc_add_risk() %>% + # BUG: I'm not sure we want this as a left_join + # but I don't want to break everything by switching it to full left_join(vax_df, by = c("id", "date")) %>% calc_vax_carryforward() diff --git a/man/get_combined_table.Rd b/man/get_combined_table.Rd index ac67bdd..729fb54 100644 --- a/man/get_combined_table.Rd +++ b/man/get_combined_table.Rd @@ -4,10 +4,11 @@ \alias{get_combined_table} \title{A function to retrieve a dataframe (df) with combined Case/Death/Vaccine data by country} \usage{ -get_combined_table(type = c("WHO", "Both"), geometry = FALSE) +get_combined_table(type = c("WHO", "Both", "legacy"), geometry = FALSE) } \arguments{ -\item{type}{(character) Specifies whether df should include disaggregated China data ("Both" separates China, Taiwan, Hong Kong, and Macau data) or combined China data ("WHO" combines China, Taiwan, Hong Kong, and Macau data as China)} +\item{type}{(character) 
Specifies what data streams to include for case/death data. See details for further information +whether df should include disaggregated China data ("Both" separates China, Taiwan, Hong Kong, and Macau data) or combined China data ("WHO" combines China, Taiwan, Hong Kong, and Macau data as China)} \item{geometry}{(logical, default: FALSE) Specifies whether df should include the geometry column} } @@ -17,23 +18,28 @@ Returns an object of class \code{data.frame} with n rows and 56(57, if \code{geo \description{ A function to retrieve a dataframe (df) with combined Case/Death/Vaccine data by country } +\details{ +The \code{type} argument used to take two values: "WHO" and "Both", referring to whether to take WHO data as-is, or to supplement WHO data with disaggregated China data from JHU. +In early Jan 2023, China CDC ceased providing daily COVID-19 updates, so the Mainland China data provided by JHU also stopped. On Mar 10, 2023 JHU closed their dashboard entirely, +so new sources had to be located for HK, Macau, and Taiwan data. + +For legacy analyses, the old behavior for "Both" is now available as "legacy" + +The new "Both" type pulls data from HK CHP, Taiwan CDC, and JHU (for Macau data thru Mar 10) in addition to the China data in WHO (which also includes Taiwan, HK, and Macau data). +Because data from HK and Taiwan are duplicated in this way, you should not use data from the "Both" option to compute regional or global trends. 
+} \examples{ \dontrun{ # Get the df that combines China with Taiwan, Hong Kong, and Macau data df_who <- get_combined_table("WHO") print(df_who) -# Get the df that uses both disagreggated China, Taiwan, Hong Kong, and Macau data (WHO + JHU= "Both") +# Get the df that combines WHO China data (aggregated) with disaggregated entries for HK, Taiwan, and Macau (from JHU thru Mar 10, 2023) df_both <- get_combined_table("Both") print(df_both) - -# get_combined_table() is identical to the following sequence: -onetable \%>\% - select(-geometry) \%>\% # In the case that geometry = FALSE - right_join(get_covid_df(), by = "iso2code") \%>\% - filter(source == "WHO") \%>\% # In the case of type = "WHO" - # filter(!(country == "China" & source == "WHO")) \%>\% # In the case of type = "Both" - calc_add_risk() \%>\% - left_join(get_vax(), by = c("id", "date")) +# Get the df that uses both disaggregated China, Taiwan, Hong Kong, and Macau data (WHO + JHU = "legacy") +# (JHU sunset on Mar 10, 2023 and China mainland data ceased earlier in the year) +df_both <- get_combined_table("legacy") +print(df_both) } } \seealso{ From 80ebb8df83dbe96c576f220f9c4f148b5f041710 Mon Sep 17 00:00:00 2001 From: Sean Browning Date: Fri, 10 Mar 2023 18:47:17 +0000 Subject: [PATCH 07/10] Using this more friendly link to OWID historical data --- R/lookups.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/lookups.R b/R/lookups.R index 0944361..12b8a6d 100644 --- a/R/lookups.R +++ b/R/lookups.R @@ -141,7 +141,7 @@ datasource_lk <- list( # OWID cases and deaths # SB Note: Beginning Mar 8, 2023 OWID has ceased pulling from JHU # and JHU will cease operations itself on Mar 10, 2023. 
This will contain legacy data - owid_all = "https://covid.ourworldindata.org/data/owid-covid-data-old.csv", + owid_all = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data-old.csv", # HK Cases and Deaths hk_case_deaths = "http://www.chp.gov.hk/files/misc/latest_situation_of_reported_cases_covid_19_eng.csv", # Taiwan Cases and Deaths From 3b80098f988af0e4c3af44f308541d437cf06e19 Mon Sep 17 00:00:00 2001 From: Sean Browning Date: Fri, 10 Mar 2023 20:04:43 +0000 Subject: [PATCH 08/10] Updating tests --- tests/testthat/test-combined_table.R | 32 ++++++++++++++++-- tests/testthat/test-data_accessors.R | 49 ++++++++++++++++++++++++++-- 2 files changed, 77 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-combined_table.R b/tests/testthat/test-combined_table.R index ddcf4ce..c5433e1 100644 --- a/tests/testthat/test-combined_table.R +++ b/tests/testthat/test-combined_table.R @@ -21,8 +21,8 @@ test_that("Combined table returns WHO data appropriately", { expect_identical(resulting, comp_shell) }) -test_that("Combined table returns JHU+WHO data appropriately", { - df <- get_combined_table(type = "Both") +test_that("Combined table returns JHU+WHO in a legacy way", { + df <- get_combined_table(type = "legacy") dims <- dim(df) # Should have at least 1 row and 55 cols @@ -49,6 +49,34 @@ test_that("Combined table returns JHU+WHO data appropriately", { expect_identical(resulting_countries, expected_countries) }) +test_that("Combined table returns JHU+HK+Taiwan data appropriately", { + df <- get_combined_table(type = "Both") + dims <- dim(df) + + # Should have at least 1 row and 55 cols + expect_gt(dims[1], 0) + expect_equal(dims[2], 56) + + # Should contain both JHU and WHO data + sources <- unique(df$source) + sources <- sources[order(sources)] + + expect_equal(sources, c("HK CHP", "JHU", "Taiwan CDC", "WHO")) + + # ...thus, should have HK, Macau, Taiwan, and China + expected_countries <- c(onetable_addn_countries$iso2code, "CN") 
+ expected_countries <- expected_countries[order(expected_countries)] + + resulting <- df %>% + filter(iso2code %in% expected_countries) + + resulting_countries <- unique(resulting$iso2code) + resulting_countries <- resulting_countries[order(resulting_countries)] + + expect_gt(nrow(resulting), 0) + expect_identical(resulting_countries, expected_countries) +}) + test_that("Combined table returns geometry if requested (not recommended)", { df <- get_combined_table(geometry = TRUE) diff --git a/tests/testthat/test-data_accessors.R b/tests/testthat/test-data_accessors.R index 2d0e50e..fdc4571 100644 --- a/tests/testthat/test-data_accessors.R +++ b/tests/testthat/test-data_accessors.R @@ -31,14 +31,59 @@ test_that("COVID Vaccination latest dates are accessible", { expect_equal(ncol(df), 6) }) -test_that("COVID Case Data is accessible", { - df <- get_covid_df() +test_that("All COVID Case/Death Data is accessible", { + df <- get_covid_df("all") # Should have at least 1 row and 8 cols expect_gt(nrow(df), 0) expect_equal(ncol(df), 8) }) +test_that("WHO data returns correctly", { + df <- get_covid_df("WHO") + + # Should have at least 1 row and 8 cols + expect_gt(nrow(df), 0) + expect_equal(ncol(df), 8) + + # Should contain only WHO data + sources <- unique(df$source) + sources <- sources[order(sources)] + expect_equal(sources, "WHO") +}) + +test_that("WHO+JHU data returns correctly", { + df <- get_covid_df("WHO+JHU") + + # Should have at least 1 row and 8 cols + expect_gt(nrow(df), 0) + expect_equal(ncol(df), 8) + + # Should contain both JHU and WHO data + sources <- unique(df$source) + sources <- sources[order(sources)] + expect_equal(sources, c("WHO", "JHU")) +}) + +test_that("WHO+Primary data returns correctly", { + df <- get_covid_df("WHO+Primary") + + # Should have at least 1 row and 8 cols + expect_gt(nrow(df), 0) + expect_equal(ncol(df), 8) + + # Should contain all sources + sources <- unique(df$source) + sources <- sources[order(sources)] + expect_equal(sources, 
c("WHO", "JHU", "Taiwan CDC", "HK CHP")) + + # China data should be from WHO + china_source <- filter(df, iso2code == "CN") |> + distinct(source) + + expect_equal(china_source, "WHO") +}) + test_that("OWID+FIND Time Series data is available", { df <- get_testing_long() From 508b2b730be6efdccebaedc28c9d7838ad3a7204 Mon Sep 17 00:00:00 2001 From: Sean Browning Date: Fri, 10 Mar 2023 22:36:45 +0000 Subject: [PATCH 09/10] Fixing tests --- R/get_covid_sources.R | 20 +++++++++++--------- tests/testthat/test-data_accessors.R | 7 ++++--- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/R/get_covid_sources.R b/R/get_covid_sources.R index 44e0ac2..8c00ac8 100644 --- a/R/get_covid_sources.R +++ b/R/get_covid_sources.R @@ -7,8 +7,8 @@ #' For disaggregated China, Taiwan, Hong Kong, and Macau data we pull from JHU or primary sources. #' #' @param sources one of "all", "WHO", "WHO+JHU", "WHO+Primary" specifying the data sources to pull from. See details. -#' -#' @details +#' +#' @details #' In legacy versions, the default was to pull "all" sources, which included the WHO case/death time-series and JHU data for China Mainland, HK, Macau, and Taiwan. #' Due to sun-setting and changes in reporting, we now capture HK and Taiwan data from primary sources ("WHO+Primary"). Note that this also includes JHU data on Macau #' which will be reported thru Mar 10, 2023 when JHU closes their dashboard. 
@@ -79,7 +79,7 @@ get_who_data <- function() { source = "WHO" ) %>% select(-who_region) - + return(who_data) } @@ -140,12 +140,12 @@ get_jhu_data <- function() { source = "JHU" ) %>% arrange(country, date) - + return(jhu_data) } get_hk_data <- function() { - hk_data_raw <- fread(datasource_lk$hk_case_deaths, stringsAsFactors = FALSE, encoding = "UTF-8", data.table = FALSE) |> + hk_data_raw <- fread(datasource_lk$hk_case_deaths, stringsAsFactors = FALSE, encoding = "UTF-8", data.table = FALSE, check.names = FALSE) |> as_tibble() hk_data_raw[["pcr_and_rat"]] <- rowSums( @@ -198,13 +198,15 @@ get_taiwan_data <- function() { tw_case_raw <- data.table::fread( datasource_lk$taiwan_cases, encoding = "UTF-8", - data.table = FALSE + data.table = FALSE, + check.names = FALSE ) tw_death_raw <- data.table::fread( datasource_lk$taiwan_deaths, encoding = "UTF-8", - data.table = FALSE + data.table = FALSE, + check.names = FALSE ) tw_cases <- tw_case_raw |> @@ -217,7 +219,7 @@ get_taiwan_data <- function() { ) |> group_by(date) |> summarise( - new_cases = sum(cases, na.rm = T) + new_cases = sum(cases, na.rm = TRUE) ) |> ungroup() |> arrange(date) |> @@ -230,7 +232,7 @@ get_taiwan_data <- function() { ) |> mutate(date = as.Date(date, "%Y/%m/%d")) |> group_by(date) |> - summarise(new_deaths = sum(deaths, na.rm = T)) |> + summarise(new_deaths = sum(deaths, na.rm = TRUE)) |> arrange(date) |> mutate(cumulative_deaths = cumsum(new_deaths)) diff --git a/tests/testthat/test-data_accessors.R b/tests/testthat/test-data_accessors.R index fdc4571..b9a89c7 100644 --- a/tests/testthat/test-data_accessors.R +++ b/tests/testthat/test-data_accessors.R @@ -62,7 +62,7 @@ test_that("WHO+JHU data returns correctly", { # Should contain both JHU and WHO data sources <- unique(df$source) sources <- sources[order(sources)] - expect_equal(sources, c("WHO", "JHU")) + expect_equal(sources, c("JHU", "WHO")) }) test_that("WHO+Primary data returns correctly", { @@ -75,11 +75,12 @@ test_that("WHO+Primary data 
returns correctly", { # Should contain all sources sources <- unique(df$source) sources <- sources[order(sources)] - expect_equal(sources, c("WHO", "JHU", "Taiwan CDC", "HK CHP")) + expect_equal(sources, c("HK CHP", "JHU", "Taiwan CDC", "WHO")) # China data should be from WHO china_source <- filter(df, iso2code == "CN") |> - distinct(source) + distinct(source) |> + pull() expect_equal(china_source, "WHO") }) From 8185dc08cf885d11494904be2d1ef004f4a92101 Mon Sep 17 00:00:00 2001 From: Sean Browning Date: Fri, 10 Mar 2023 23:15:20 +0000 Subject: [PATCH 10/10] Version bump --- DESCRIPTION | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 22dae07..f8693b3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,10 +1,10 @@ Package: SaviR Type: Package Title: ITF Situational Awareness and Visualization -Version: 0.2.0 +Version: 0.3.0 Authors@R: c( person("Sean", "Browning", email = "sbrowning@cdc.gov", role = c("aut", "cre")), - person("Kimberly", "Wong", email = "nvj5@cdc.gov", role = "aut"), + person("Kimberly", "Lockwood", email = "nvj5@cdc.gov", role = "aut"), person("Nartlada", "Chantharojwong", email = "nartlada@gmail.com", role = "aut"), person("James", "Fuller", role = "aut"), person("Dante", "Bugli", role = "ctb"),