Skip to contents

Introduction

This article describes creating derived analysis ready dataset using the PHARMERVERSE workflow. It uses some pre-generated standardized dataset and a metadata-specs as input. Please refer to the following articles to see how those derived dataset and metadata-specs were created, respectively.

In ADNIMERGE2 R data package, the following selected derived analysis ready dataset will be created for illustration purpose.

  • Subject-Level Analysis Dataset - ADSL
  • Analysis Dataset of Adverse Events - ADAE
  • Analysis Dataset of Questionnaire - ADQS
  • Analysis Dataset of Clinical Classification - ADRS

NOTE

  • These derived analysis ready dataset may not be fully complied with the CDISC-ADaM standardization.

Load Required R Packages

# ADNI study R data package
library(ADNIMERGE2)

The following r chuck loads some of study specific wrapper functions that are modified from metacore and metatools R packages and stored in the ADNIMERGE2 R package as system file.

# Load utils function from package system file
utils_file_path <- system.file(
  "analysis-dataset-utils.R",
  package = "ADNIMERGE2",
  mustWork = TRUE
)
source(utils_file_path)

Building Derived Analysis Ready Dataset

Subject-Level Analysis Dataset - ADSL

ADSL dataset contains one record per subject. In the ADNI study, ADSL dataset includes only the first record/baseline characteristics of subject when they were enrolled in the study as newly-enrollee or screened to the study for the first time.

ADSL <- build_from_derived(
  metacore = METACORES,
  dataset_name = "ADSL",
  ds_list = list(
    "DM" = DM,
    "GF" = GF %>%
      group_by(USUBJID) %>%
      filter(row_number() == 1) %>%
      ungroup()
  ),
  predecessor_only = TRUE,
  keep = TRUE
) %>%
  assert_uniq(USUBJID) %>%
  # Create AGE Group - creating categorical group from numeric
  create_cat_var(
    data = .,
    metacore = METACORES,
    ref_var = AGE,
    grp_var = AGEGR1,
    num_grp_var = AGEGR1N
  ) %>%
  # Adjust for multiple race groups
  mutate(RACE = case_when(
    str_detect(RACE, "\\|") ~ "More than one race",
    TRUE ~ as.character(RACE)
  )) %>%
  # Re-code RACE
  create_var_from_codelist(
    data = .,
    metacore = METACORES,
    input_var = RACE,
    out_var = RACEN,
    decode_to_code = TRUE
  ) %>%
  # Add baseline DX
  derive_vars_merged(
    dataset = .,
    dataset_add = RS,
    filter_add = RSTESTCD == "DX" & RSBLFL == "Y",
    new_vars = exprs(DX = RSORRES),
    order = exprs(VISITNUM),
    mode = "last",
    by_vars = exprs(STUDYID, USUBJID),
    check_type = "error",
    relationship = "one-to-one"
  ) %>%
  # Convert to factor with Control Terms (CT)
  convert_var_to_fct_wrapper(
    .data = .,
    metacore = METACORES %>%
      select_dataset(dataset = "ADSL"),
    var = c("ORIGPROT", "DX", "RACE")
  ) %>%
  # Add baseline amyloid status
  call_derivation(
    dataset = .,
    dataset_add = NV,
    derivation = derive_vars_merged,
    variable_params = list(
      params(
        filter_add = NVCAT == "AMYLOIDPET" & NVSCAT == "FBB-6MM" &
          NVTESTCD == "AMYSTAT" & NVBLFL == "Y",
        new_vars = exprs(AMYSTAT = NVSTRESC)
      )
    ),
    by_vars = exprs(USUBJID),
    check_type = "error",
    relationship = "one-to-one"
  ) %>%
  # Add baseline education level and marital status
  call_derivation(
    dataset = .,
    dataset_add = SC,
    derivation = derive_vars_merged,
    variable_params = list(
      params(
        filter_add = SCTESTCD == "PTEDUCAT" & SCBLFL == "Y",
        new_vars = exprs(EDUC = SCSTRESN)
      ),
      params(
        filter_add = SCTESTCD == "PTMARRY" & SCBLFL == "Y",
        new_vars = exprs(MARISTAT = SCORRES)
      )
    ),
    order = exprs(VISITNUM),
    mode = "first",
    by_vars = exprs(STUDYID, USUBJID),
    check_type = "error",
    relationship = "one-to-one"
  ) %>%
  # Add baseline BMI derived from weight and height
  call_derivation(
    dataset = .,
    dataset_add = VS,
    derivation = derive_vars_merged,
    variable_params = list(
      params(
        filter_add = VSTESTCD == "WEIGHT" & VSBLFL == "Y",
        new_vars = exprs(WEIGHT = VSSTRESN)
      ),
      params(
        filter_add = VSTESTCD == "HEIGHT" & VSBLFL == "Y",
        new_vars = exprs(HEIGHT = VSSTRESN)
      )
    ),
    order = exprs(VISITNUM),
    mode = "first",
    by_vars = exprs(USUBJID),
    check_type = "error",
    relationship = "one-to-one"
  ) %>%
  mutate(
    BMI = round(compute_bmi(height = HEIGHT, weight = WEIGHT), 2),
    BMIBLU = ifelse(!is.na(BMI), "kg/m^2", NA_character_),
    AGEU = ifelse(!is.na(AGEU), "Years", NA_character_)
  )
# Add baseline questionnaire/assessment score
params_list <- lapply(unique(QS$QSTESTCD), function(param) {
  params_cond <- admiral::params(
    filter_add = QSTESTCD == !!param & QSBLFL == "Y",
    new_vars = exprs(!!!exprs(param = QSORRES))
  )
  params_cond$new_vars <- substitute(
    exprs(param := as.numeric(QSORRES)),
    list(param = as.name(param))
  )
  return(params_cond)
})

ADSL <- ADSL %>%
  call_derivation(
    dataset = .,
    dataset_add = QS,
    derivation = derive_vars_merged,
    variable_params = params_list,
    mode = "first",
    by_vars = exprs(USUBJID),
    check_type = "error",
    relationship = "one-to-one"
  )
ADSL <- ADSL %>%
  # Create enrollment flag
  derive_var_merged_ef_msrc(
    dataset = .,
    by_vars = exprs(STUDYID, USUBJID),
    flag_events = list(
      flag_event(dataset_name = "DM", condition = !is.na(RFSTDTC))
    ),
    source_datasets = list(DM = DM),
    new_var = ENRLFL,
    true_value = "Y",
    false_value = NA_character_
  ) %>%
  # Imputation for death date and enrollment end date
  # Required partial date/time format/character format
  rename("EOSDTHC" = EOSDT) %>%
  select(-DTHDT) %>%
  mutate(
    ENRLDT = as.Date(ENRLDT),
    EOSDTHC = as.character(EOSDTHC),
    DTHDTC = as.character(DTHDTC)
  ) %>%
  call_derivation(
    dataset = .,
    derivation = derive_vars_dtm,
    variable_params = list(
      params(dtc = DTHDTC, new_vars_prefix = "DTH"),
      params(dtc = EOSDTHC, new_vars_prefix = "EOS")
    ),
    highest_imputation = "M",
    date_imputation = "mid",
    flag_imputation = "auto",
    min_dates = exprs(ENRLDT)
  ) %>%
  derive_vars_dtm_to_dt(
    dataset = .,
    source_vars = exprs(DTHDTM, EOSDTM)
  ) %>%
  # Remove any columns that are not specified in the meta-specs
  drop_unspec_vars(
    dataset = .,
    metacore = METACORES,
    dataset_name = "ADSL"
  ) %>%
  # Add variable labels
  metatools::set_variable_labels(
    data = .,
    metacore = METACORES,
    dataset_name = "ADSL"
  )
ADSL <- ADSL %>%
  # Check all variables specified are present and no more
  check_variables(
    data = .,
    metacore = METACORES,
    dataset_name = "ADSL"
  ) %>%
  # Checks all variables with CT only contain values within the CT
  check_ct_data(
    data = .,
    metacore = METACORES %>%
      select_dataset("ADSL"),
    na_acceptable = TRUE,
    omit_vars = c("ENRLFL", "DTHFL")
  ) %>%
  # Orders the columns according to the spec
  order_cols(
    data = .,
    metacore = METACORES,
    dataset_name = "ADSL"
  ) %>%
  # Sort the dataset based on key columns
  sort_by_key(
    data = .,
    metacore = METACORES,
    dataset_name = "ADSL"
  ) %>%
  # Check uniqueness of record by key columns
  check_unique_keys(
    data = .,
    metacore = METACORES,
    datase = "ADSL"
  )
ADSL <- ADSL %>%
  xportr_metadata(
    .df = .,
    metadata = METACORES %>%
      select_dataset("ADSL"),
    domain = "ADSL",
    verbose = "stop"
  ) %>%
  # Check variable type with specs match
  # xportr_type() %>%
  # Assign variable value length from the meta specs
  xportr_length() %>%
  # Assign variable label from meta specs and
  # checks variable label length for max of 40 characters
  xportr_label() %>%
  # Checks variable format
  xportr_format()

Analysis Dataset of Adverse Events - ADAE

ADAE dataset contains one record per adverse events per subject with the following characteristics.

adae_metacore <- METACORES %>%
  select_dataset(.data = ., dataset = "ADAE", simplify = FALSE)
# Modified user-defined function
ADAE <- build_from_derived(
  metacore = adae_metacore,
  dataset_name = "ADAE",
  ds_list = list("ADSL" = ADSL, "AE" = AE),
  predecessor_only = TRUE,
  keep = FALSE
) %>%
  # Only subject that had at least one adverse events experience
  filter(USUBJID %in% c(AE$USUBJID)) %>%
  verify(nrow(.) == nrow(AE)) %>%
  # Add adverse events onset and ended date
  derive_vars_merged(
    dataset = .,
    dataset_add = AE %>%
      select(USUBJID, AETERM, AESEQ, AESTDTC, AEENDTC),
    by_vars = exprs(USUBJID, AETERM, AESEQ),
    new_vars = NULL,
    check_type = "error",
    relationship = "one-to-one"
  ) %>%
  # AESTDTC and AEENDTC required to be a character variable
  mutate(
    ENRLDT = as.Date(ENRLDT),
    AESTDTC = as.character(AESTDTC),
    AEENDTC = as.character(AEENDTC)
  ) %>%
  # Add imputed date death date from ADSL
  derive_vars_merged(
    dataset = .,
    dataset_add = ADSL %>%
      select(USUBJID, STUDYID, DTHDT),
    by_vars = exprs(USUBJID, STUDYID),
    new_vars = NULL,
    check_type = "error",
    relationship = "many-to-one"
  ) %>%
  # Derive analysis start and end date
  call_derivation(
    dataset = .,
    derivation = derive_vars_dtm,
    variable_params = list(
      params(
        dtc = AESTDTC, new_vars_prefix = "AST", highest_imputation = "M",
        date_imputation = "first", time_imputation = "first"
      ),
      params(
        dtc = AEENDTC, new_vars_prefix = "AEN", highest_imputation = "M",
        date_imputation = "last", time_imputation = "last",
        max_dates = exprs(DTHDT)
      )
    ),
    flag_imputation = "auto",
    min_dates = exprs(ENRLDT)
  ) %>%
  # Convert into date format
  derive_vars_dtm_to_dt(
    dataset = .,
    source_vars = exprs(ASTDTM, AENDTM)
  ) %>%
  # Derive analysis start/end relative day and
  derive_vars_dy(
    reference_date = ENRLDT,
    source_vars = exprs(ASTDT, AENDT)
  ) %>%
  # Derive analysis duration (value and unit)
  derive_vars_duration(
    new_var = ADURN,
    new_var_unit = ADURU,
    start_date = ASTDT,
    end_date = AENDT,
    in_unit = "days",
    out_unit = "days",
    add_one = TRUE,
    trunc_out = FALSE
  ) %>%
  drop_unspec_vars(
    dataset = .,
    metacore = adae_metacore
  ) %>%
  metatools::set_variable_labels(
    data = .,
    metacore = adae_metacore
  )
ADAE <- ADAE %>%
  check_variables(
    data = .,
    metacore = adae_metacore
  ) %>%
  # check_ct_data(
  #   data = .,
  #   metacore = adae_metacore,
  #   na_acceptable = TRUE
  # ) %>%
  order_cols(
    data = .,
    metacore = adae_metacore
  ) %>%
  sort_by_key(
    data = .,
    metacore = adae_metacore
  ) %>%
  check_unique_keys(
    data = .,
    metacore = adae_metacore
  ) %>%
  xportr_metadata(
    .df = .,
    metadata = adae_metacore,
    verbose = "stop"
  ) %>%
  # xportr_type() %>%
  xportr_length() %>%
  xportr_label() %>%
  xportr_format()

Analysis Dataset of Questionnaires - ADQS

ADQS dataset contains one record per questionnaire parameter per visit per subject with the following characteristics.

adqs_metacore <- METACORES %>%
  select_dataset(dataset = "ADQS", simplify = FALSE)
# Assumed all questionnaire parameters are continuous (numeric)
ADQS <- single_build_from_derived(
  metacore = adqs_metacore,
  dataset_name = "ADQS",
  ds_list = list("QS" = QS),
  predecessor_only = TRUE,
  keep = TRUE
) %>%
  # Add variables from ADSL dataset
  derive_vars_merged(
    dataset = .,
    dataset_add = single_build_from_derived(
      metacore = adqs_metacore,
      dataset_name = "ADQS",
      ds_list = list("ADSL" = ADSL),
      predecessor_only = TRUE,
      keep = TRUE
    ),
    by_vars = exprs(USUBJID),
    new_vars = NULL,
    check_type = "error",
    relationship = "many-to-one"
  ) %>%
  verify(nrow(.) == nrow(QS)) %>%
  mutate(ADT = as.Date(ADT)) %>%
  # Re-code PARAMN
  # Assumed all PARAMCD are coded in the meta-specs
  create_var_from_codelist(
    data = .,
    metacore = adqs_metacore,
    input_var = PARAMCD,
    out_var = PARAMN,
    decode_to_code = TRUE
  ) %>%
  # Add analysis timing variable
  derive_vars_dy(
    reference_date = ENRLDT,
    source_vars = exprs(ADT)
  ) %>%
  mutate(
    AVISIT = case_when(
      ADT <= ENRLDT | !is.na(ABLFL) ~ "BASELINE",
      TRUE ~ VISIT
    ),
    AVISITN = case_when(
      ADT <= ENRLDT | !is.na(ABLFL) ~ 0,
      TRUE ~ ADY
    )
  ) %>%
  # # Flag baseline records if the flag is not presented in QS derived dataset
  # restrict_derivation(
  #   derivation = derive_var_extreme_flag,
  #   args = params(
  #     by_vars = exprs(STUDYID, USUBJID, PARAMCD),
  #     order = exprs(ADT),
  #     new_var = ABLFL,
  #     mode = "last"
  #   ),
  #   filter = !is.na(AVAL) & ADT <= ENRLDT
  # ) %>%
  # Derive baseline and change from baseline variables
  # Only applicable for continuous parameters
  derive_var_base(
    by_vars = exprs(STUDYID, USUBJID, PARAMCD),
    source_var = AVAL,
    new_var = BASE
  ) %>%
  # Derive change for post-baseline records
  restrict_derivation(
    derivation = derive_var_chg,
    filter = AVISITN > 0
  ) %>%
  # Derive percentage change for post-baseline records
  restrict_derivation(
    derivation = derive_var_pchg,
    filter = AVISITN > 0
  ) %>%
  # Derive sequence number
  derive_var_obs_number(
    by_vars = exprs(STUDYID, USUBJID, PARAMCD),
    order = exprs(PARAMCD, ADT),
    new_var = ASEQ,
    check_type = "none" # "error"
  ) %>%
  drop_unspec_vars(
    dataset = .,
    metacore = adqs_metacore
  ) %>%
  metatools::set_variable_labels(
    data = .,
    metacore = adqs_metacore
  )
ADQS <- ADQS %>%
  check_variables(
    data = .,
    metacore = adqs_metacore
  ) %>%
  check_ct_data(
    data = .,
    metacore = adqs_metacore,
    na_acceptable = TRUE,
    omit_vars = c("ENRLFL", "ABLFL", "PARAMCD")
  ) %>%
  order_cols(
    data = .,
    metacore = adqs_metacore
  ) %>%
  sort_by_key(
    data = .,
    metacore = adqs_metacore
  ) %>%
  # check_unique_keys(
  #   data = .,
  #   metacore = adqs_metacore
  # ) %>%
  xportr_metadata(
    .df = .,
    metadata = adqs_metacore,
    verbose = "stop"
  ) %>%
  # xportr_type() %>%
  xportr_length() %>%
  xportr_label()

Analysis Dataset of ADAS-Cog Behaviour - ADADAS

Suppose the ADAS Cognitive Behaviors item-13 total score (ADASTT13) was the primary outcome in the analysis, then ADAS-Cog Behavior Analysis Dataset (ADADAS) can be created either directly from ADQS or with a dataset-specific metadata-specs (i.e. adding here). The following chuck illustrate how the ADADAS can be created directly from pre-generated ADQS. More information is available in the Longitudinal Clinical Cognitive Outcome Summaries article.

ADADAS <- ADQS %>%
  filter(PARAMCD == "ADASTT13")

Response Analysis Dataset (ADRS)

ADRS dataset contains subject’s clinical diagnostics summary per visits including death records.

adrs_metacore <- METACORES %>%
  select_dataset(dataset = "ADRS", simplify = FALSE)
# Functions to convert character values into numeric
create_rs_aval <- function(x) {
  x <- dplyr::case_when(
    x %in% "CN" ~ 1,
    x %in% "MCI" ~ 2,
    x %in% "DEM" ~ 3,
    x %in% "DEATH" ~ 4
  )
  return(x)
}
# Merge ADSL to RS
adsl_vars <- exprs(STUDYID, USUBJID, ENRLFL, ENRLDT, DTHFL)

ADRS_PREP <- RS %>%
  filter(RSTESTCD %in% "DX") %>%
  # Adjusting BLFL for subjects that have more than one flag
  group_by(STUDYID, USUBJID, RSTESTCD) %>%
  mutate(NUM_BLFL = sum(!is.na(RSBLFL))) %>%
  ungroup() %>%
  mutate(RSBLFL = case_when(
    NUM_BLFL > 1 & VISITNUM != 1 ~ NA_character_,
    TRUE ~ RSBLFL
  )) %>%
  select(-NUM_BLFL) %>%
  mutate(
    PARAMCD = RSTESTCD,
    PARAM = RSTEST,
    AVALC = RSSTRESC,
    AVAL = create_rs_aval(RSSTRESC),
    ABLFL = RSBLFL,
    ADT = as.Date(RSDTC),
    ADY = RSDY,
    COLPROT = RSGRPID
  ) %>%
  derive_vars_merged(
    dataset = .,
    dataset_add = ADSL,
    new_vars = adsl_vars,
    by_vars = exprs(STUDYID, USUBJID)
  ) %>%
  mutate(
    AVISIT = case_when(
      ADT <= ENRLDT & !is.na(ABLFL) ~ "BASELINE",
      TRUE ~ VISIT
    ),
    AVISITN = case_when(
      ADT <= ENRLDT & !is.na(ABLFL) ~ 0,
      TRUE ~ ADY
    )
  ) %>%
  derive_var_base(
    dataset = .,
    by_vars = exprs(STUDYID, USUBJID, PARAMCD),
    source_var = AVALC
  ) %>%
  select(
    STUDYID, USUBJID, RSSEQ, COLPROT, PARAMCD, PARAM, AVISIT, AVISITN,
    AVALC, AVAL, ABLFL, BASE, ADT, ADY, VISITNUM, VISIT, EPOCH, ENRLFL, DTHFL
  )
# Flag subjects that have baseline diagnostics summary and
## populate the flag per subject's records
ADRS_PREP <- ADRS_PREP %>%
  call_derivation(
    dataset = .,
    dataset_add = .,
    derivation = derive_vars_merged,
    variable_params = list(
      params(
        filter_add = ABLFL == "Y",
        new_vars = exprs(BLBFL = ABLFL)
      )
    ),
    by_vars = exprs(STUDYID, USUBJID),
    check_type = "error",
    relationship = "many-to-one"
  )

# Derive death parameter ----
DAETH_PARAM <- ADSL %>%
  select(!!!adsl_vars, DTHDT) %>%
  # Since month imputations was performed,
  # floor the date to the last date of a year
  mutate(
    DTHDT = ceiling_date(DTHDT, unit = "year") - days(1),
    DHDY = as.numeric(DTHDT - ENRLDT),
    DHDY = ifelse(DHDY == 0, 1, DHDY)
  ) %>%
  derive_var_merged_ef_msrc(
    dataset = .,
    by_vars = exprs(STUDYID, USUBJID),
    flag_events = list(
      flag_event(dataset_name = "ADRS_PREP", condition = BLBFL == "Y")
    ),
    source_datasets = list(ADRS_PREP = ADRS_PREP),
    new_var = BLBFL,
    true_value = "Y",
  )

# Add DEATH parameter ----
ADRS_PREP <- ADRS_PREP %>%
  # Add DEATH parameter
  derive_extreme_records(
    dataset = .,
    dataset_add = DAETH_PARAM,
    dataset_ref = DAETH_PARAM,
    by_vars = exprs(STUDYID, USUBJID),
    filter_add = !is.na(DTHDT),
    check_type = "error",
    exist_flag = AVALC,
    true_value = "DEATH",
    false_value = NA_character_,
    set_values_to = exprs(
      PARAMCD = "DEATH",
      PARAM = "Death",
      AVAL = ifelse(!is.na(DTHDT), 4, NA_real_),
      ADT = DTHDT,
      ADY = DHDY,
      AVISITN = DHDY
    )
  ) %>%
  select(-DTHDT, -DHDY) %>%
  # Remove missing DEATH paramters
  filter(!(PARAMCD %in% "DEATH" & is.na(DTHFL))) %>%
  mutate(PARAMN = case_when(
    PARAMCD == "DX" ~ 1,
    PARAMCD == "DEATH" ~ 2
  ))

# Flag subjects that have at least one followup diagnostics summary after baseline visit
#  Or had death records
ADRS_PREP <- ADRS_PREP %>%
  derive_var_merged_exist_flag(
    dataset = .,
    dataset_add = .,
    by_vars = exprs(STUDYID, USUBJID),
    filter_add = !ABLFL %in% "Y" & !is.na(AVALC) & (!EPOCH %in% "Screening" | AVISITN < 0),
    condition = BLBFL %in% "Y" | is.na(BLBFL),
    new_var = FOLLOWPFL,
    false_value = "N",
    missing_value = "M"
  )

# Identify records that will be used for the analysis: -----
# Enrolled subjects (ENRLFL: "Y"),
# Have a baseline diagnostics summary (BLBFL: "Y"), and
# have at least one followup diagnostics summary/death records (FOLLOWPFL: "Y")
ADRS_PREP <- ADRS_PREP %>%
  mutate(
    ANL01FL = case_when(
      ENRLFL == "Y" & BLBFL == "Y" & FOLLOWPFL == "Y" & !EPOCH %in% "Screening" ~ "Y",
      ENRLFL == "Y" & BLBFL == "Y" & FOLLOWPFL == "Y" & EPOCH %in% "Screening" & ABLFL %in% "Y" ~ "Y"
    )
  ) %>%
  select(-ENRLFL, -ENRLDT, -DTHFL)
ADRS <- ADRS_PREP %>%
  # Merge ADSL ADSL dataset
  derive_vars_merged(
    dataset = .,
    dataset_add = single_build_from_derived(
      metacore = adrs_metacore,
      dataset_name = "ADRS",
      ds_list = list("ADSL" = ADSL),
      predecessor_only = TRUE,
      keep = TRUE
    ),
    by_vars = exprs(STUDYID, USUBJID),
    check_type = "error",
    relationship = "many-to-one"
  ) %>%
  drop_unspec_vars(
    dataset = .,
    metacore = adrs_metacore
  ) %>%
  metatools::set_variable_labels(
    data = .,
    metacore = adrs_metacore
  )
ADRS <- ADRS %>%
  check_variables(
    data = .,
    metacore = adrs_metacore
  ) %>%
  check_ct_data(
    data = .,
    metacore = adrs_metacore,
    na_acceptable = TRUE,
    omit_vars = c("ENRLFL", "ABLFL", "PARAMCD")
  ) %>%
  order_cols(
    data = .,
    metacore = adrs_metacore
  ) %>%
  sort_by_key(
    data = .,
    metacore = adrs_metacore
  ) %>%
  xportr_metadata(
    .df = .,
    metadata = adrs_metacore,
    verbose = "stop"
  ) %>%
  # xportr_type() %>%
  xportr_length() %>%
  xportr_label()