
ADNIMERGE2-Analysis-Data
Last Updated: July 25, 2025
Source:vignettes/ADNIMERGE2-Analysis-Data.Rmd
ADNIMERGE2-Analysis-Data.Rmd
Introduction
This article describes creating derived analysis ready dataset using the PHARMERVERSE workflow. It uses some pre-generated standardized dataset and a metadata-specs as input. Please refer to the following articles to see how those derived dataset and metadata-specs were created, respectively.
In ADNIMERGE2
R data package, the following selected
derived analysis ready dataset will be created for illustration
purpose.
- Subject-Level Analysis Dataset - ADSL
- Analysis Dataset of Adverse Events - ADAE
- Analysis Dataset of Questionnaire - ADQS
- Analysis Dataset of Clinical Classification - ADRS
NOTE
- These derived analysis ready dataset may not be fully complied with the CDISC-ADaM standardization.
Load Required R Packages
library(tidyverse)
library(assertr)
library(metacore)
library(metatools)
library(admiral)
library(admiraldev)
library(xportr)
# ADNI study R data package
library(ADNIMERGE2)
The following r chuck loads some of study specific wrapper functions
that are modified from metacore
and metatools
R packages and stored in the ADNIMERGE2
R package as system
file.
# Load utils function from package system file
utils_file_path <- system.file(
"analysis-dataset-utils.R",
package = "ADNIMERGE2",
mustWork = TRUE
)
source(utils_file_path)
Building Derived Analysis Ready Dataset
Subject-Level Analysis Dataset - ADSL
ADSL
dataset
contains one record per subject. In the ADNI study, ADSL
dataset includes
only the first record/baseline characteristics of subject when they were
enrolled in the study as newly-enrollee or screened to the study for the
first time.
ADSL <- build_from_derived(
metacore = METACORES,
dataset_name = "ADSL",
ds_list = list(
"DM" = DM,
"GF" = GF %>%
group_by(USUBJID) %>%
filter(row_number() == 1) %>%
ungroup()
),
predecessor_only = TRUE,
keep = TRUE
) %>%
assert_uniq(USUBJID) %>%
# Create AGE Group - creating categorical group from numeric
create_cat_var(
data = .,
metacore = METACORES,
ref_var = AGE,
grp_var = AGEGR1,
num_grp_var = AGEGR1N
) %>%
# Adjust for multiple race groups
mutate(RACE = case_when(
str_detect(RACE, "\\|") ~ "More than one race",
TRUE ~ as.character(RACE)
)) %>%
# Re-code RACE
create_var_from_codelist(
data = .,
metacore = METACORES,
input_var = RACE,
out_var = RACEN,
decode_to_code = TRUE
) %>%
# Add baseline DX
derive_vars_merged(
dataset = .,
dataset_add = RS,
filter_add = RSTESTCD == "DX" & RSBLFL == "Y",
new_vars = exprs(DX = RSORRES),
order = exprs(VISITNUM),
mode = "last",
by_vars = exprs(STUDYID, USUBJID),
check_type = "error",
relationship = "one-to-one"
) %>%
# Convert to factor with Control Terms (CT)
convert_var_to_fct_wrapper(
.data = .,
metacore = METACORES %>%
select_dataset(dataset = "ADSL"),
var = c("ORIGPROT", "DX", "RACE")
) %>%
# Add baseline amyloid status
call_derivation(
dataset = .,
dataset_add = NV,
derivation = derive_vars_merged,
variable_params = list(
params(
filter_add = NVCAT == "AMYLOIDPET" & NVSCAT == "FBB-6MM" &
NVTESTCD == "AMYSTAT" & NVBLFL == "Y",
new_vars = exprs(AMYSTAT = NVSTRESC)
)
),
by_vars = exprs(USUBJID),
check_type = "error",
relationship = "one-to-one"
) %>%
# Add baseline education level and marital status
call_derivation(
dataset = .,
dataset_add = SC,
derivation = derive_vars_merged,
variable_params = list(
params(
filter_add = SCTESTCD == "PTEDUCAT" & SCBLFL == "Y",
new_vars = exprs(EDUC = SCSTRESN)
),
params(
filter_add = SCTESTCD == "PTMARRY" & SCBLFL == "Y",
new_vars = exprs(MARISTAT = SCORRES)
)
),
order = exprs(VISITNUM),
mode = "first",
by_vars = exprs(STUDYID, USUBJID),
check_type = "error",
relationship = "one-to-one"
) %>%
# Add baseline BMI derived from weight and height
call_derivation(
dataset = .,
dataset_add = VS,
derivation = derive_vars_merged,
variable_params = list(
params(
filter_add = VSTESTCD == "WEIGHT" & VSBLFL == "Y",
new_vars = exprs(WEIGHT = VSSTRESN)
),
params(
filter_add = VSTESTCD == "HEIGHT" & VSBLFL == "Y",
new_vars = exprs(HEIGHT = VSSTRESN)
)
),
order = exprs(VISITNUM),
mode = "first",
by_vars = exprs(USUBJID),
check_type = "error",
relationship = "one-to-one"
) %>%
mutate(
BMI = round(compute_bmi(height = HEIGHT, weight = WEIGHT), 2),
BMIBLU = ifelse(!is.na(BMI), "kg/m^2", NA_character_),
AGEU = ifelse(!is.na(AGEU), "Years", NA_character_)
)
# Add baseline questionnaire/assessment score
params_list <- lapply(unique(QS$QSTESTCD), function(param) {
params_cond <- admiral::params(
filter_add = QSTESTCD == !!param & QSBLFL == "Y",
new_vars = exprs(!!!exprs(param = QSORRES))
)
params_cond$new_vars <- substitute(
exprs(param := as.numeric(QSORRES)),
list(param = as.name(param))
)
return(params_cond)
})
ADSL <- ADSL %>%
call_derivation(
dataset = .,
dataset_add = QS,
derivation = derive_vars_merged,
variable_params = params_list,
mode = "first",
by_vars = exprs(USUBJID),
check_type = "error",
relationship = "one-to-one"
)
ADSL <- ADSL %>%
# Create enrollment flag
derive_var_merged_ef_msrc(
dataset = .,
by_vars = exprs(STUDYID, USUBJID),
flag_events = list(
flag_event(dataset_name = "DM", condition = !is.na(RFSTDTC))
),
source_datasets = list(DM = DM),
new_var = ENRLFL,
true_value = "Y",
false_value = NA_character_
) %>%
# Imputation for death date and enrollment end date
# Required partial date/time format/character format
rename("EOSDTHC" = EOSDT) %>%
select(-DTHDT) %>%
mutate(
ENRLDT = as.Date(ENRLDT),
EOSDTHC = as.character(EOSDTHC),
DTHDTC = as.character(DTHDTC)
) %>%
call_derivation(
dataset = .,
derivation = derive_vars_dtm,
variable_params = list(
params(dtc = DTHDTC, new_vars_prefix = "DTH"),
params(dtc = EOSDTHC, new_vars_prefix = "EOS")
),
highest_imputation = "M",
date_imputation = "mid",
flag_imputation = "auto",
min_dates = exprs(ENRLDT)
) %>%
derive_vars_dtm_to_dt(
dataset = .,
source_vars = exprs(DTHDTM, EOSDTM)
) %>%
# Remove any columns that are not specified in the meta-specs
drop_unspec_vars(
dataset = .,
metacore = METACORES,
dataset_name = "ADSL"
) %>%
# Add variable labels
metatools::set_variable_labels(
data = .,
metacore = METACORES,
dataset_name = "ADSL"
)
ADSL <- ADSL %>%
# Check all variables specified are present and no more
check_variables(
data = .,
metacore = METACORES,
dataset_name = "ADSL"
) %>%
# Checks all variables with CT only contain values within the CT
check_ct_data(
data = .,
metacore = METACORES %>%
select_dataset("ADSL"),
na_acceptable = TRUE,
omit_vars = c("ENRLFL", "DTHFL")
) %>%
# Orders the columns according to the spec
order_cols(
data = .,
metacore = METACORES,
dataset_name = "ADSL"
) %>%
# Sort the dataset based on key columns
sort_by_key(
data = .,
metacore = METACORES,
dataset_name = "ADSL"
) %>%
# Check uniqueness of record by key columns
check_unique_keys(
data = .,
metacore = METACORES,
datase = "ADSL"
)
ADSL <- ADSL %>%
xportr_metadata(
.df = .,
metadata = METACORES %>%
select_dataset("ADSL"),
domain = "ADSL",
verbose = "stop"
) %>%
# Check variable type with specs match
# xportr_type() %>%
# Assign variable value length from the meta specs
xportr_length() %>%
# Assign variable label from meta specs and
# checks variable label length for max of 40 characters
xportr_label() %>%
# Checks variable format
xportr_format()
Analysis Dataset of Adverse Events - ADAE
ADAE
dataset
contains one record per adverse events per subject with the following
characteristics.
adae_metacore <- METACORES %>%
select_dataset(.data = ., dataset = "ADAE", simplify = FALSE)
# Modified user-defined function
ADAE <- build_from_derived(
metacore = adae_metacore,
dataset_name = "ADAE",
ds_list = list("ADSL" = ADSL, "AE" = AE),
predecessor_only = TRUE,
keep = FALSE
) %>%
# Only subject that had at least one adverse events experience
filter(USUBJID %in% c(AE$USUBJID)) %>%
verify(nrow(.) == nrow(AE)) %>%
# Add adverse events onset and ended date
derive_vars_merged(
dataset = .,
dataset_add = AE %>%
select(USUBJID, AETERM, AESEQ, AESTDTC, AEENDTC),
by_vars = exprs(USUBJID, AETERM, AESEQ),
new_vars = NULL,
check_type = "error",
relationship = "one-to-one"
) %>%
# AESTDTC and AEENDTC required to be a character variable
mutate(
ENRLDT = as.Date(ENRLDT),
AESTDTC = as.character(AESTDTC),
AEENDTC = as.character(AEENDTC)
) %>%
# Add imputed date death date from ADSL
derive_vars_merged(
dataset = .,
dataset_add = ADSL %>%
select(USUBJID, STUDYID, DTHDT),
by_vars = exprs(USUBJID, STUDYID),
new_vars = NULL,
check_type = "error",
relationship = "many-to-one"
) %>%
# Derive analysis start and end date
call_derivation(
dataset = .,
derivation = derive_vars_dtm,
variable_params = list(
params(
dtc = AESTDTC, new_vars_prefix = "AST", highest_imputation = "M",
date_imputation = "first", time_imputation = "first"
),
params(
dtc = AEENDTC, new_vars_prefix = "AEN", highest_imputation = "M",
date_imputation = "last", time_imputation = "last",
max_dates = exprs(DTHDT)
)
),
flag_imputation = "auto",
min_dates = exprs(ENRLDT)
) %>%
# Convert into date format
derive_vars_dtm_to_dt(
dataset = .,
source_vars = exprs(ASTDTM, AENDTM)
) %>%
# Derive analysis start/end relative day and
derive_vars_dy(
reference_date = ENRLDT,
source_vars = exprs(ASTDT, AENDT)
) %>%
# Derive analysis duration (value and unit)
derive_vars_duration(
new_var = ADURN,
new_var_unit = ADURU,
start_date = ASTDT,
end_date = AENDT,
in_unit = "days",
out_unit = "days",
add_one = TRUE,
trunc_out = FALSE
) %>%
drop_unspec_vars(
dataset = .,
metacore = adae_metacore
) %>%
metatools::set_variable_labels(
data = .,
metacore = adae_metacore
)
ADAE <- ADAE %>%
check_variables(
data = .,
metacore = adae_metacore
) %>%
# check_ct_data(
# data = .,
# metacore = adae_metacore,
# na_acceptable = TRUE
# ) %>%
order_cols(
data = .,
metacore = adae_metacore
) %>%
sort_by_key(
data = .,
metacore = adae_metacore
) %>%
check_unique_keys(
data = .,
metacore = adae_metacore
) %>%
xportr_metadata(
.df = .,
metadata = adae_metacore,
verbose = "stop"
) %>%
# xportr_type() %>%
xportr_length() %>%
xportr_label() %>%
xportr_format()
Analysis Dataset of Questionnaires - ADQS
ADQS
dataset
contains one record per questionnaire parameter per visit per subject
with the following characteristics.
adqs_metacore <- METACORES %>%
select_dataset(dataset = "ADQS", simplify = FALSE)
# Assumed all questionnaire parameters are continuous (numeric)
ADQS <- single_build_from_derived(
metacore = adqs_metacore,
dataset_name = "ADQS",
ds_list = list("QS" = QS),
predecessor_only = TRUE,
keep = TRUE
) %>%
# Add variables from ADSL dataset
derive_vars_merged(
dataset = .,
dataset_add = single_build_from_derived(
metacore = adqs_metacore,
dataset_name = "ADQS",
ds_list = list("ADSL" = ADSL),
predecessor_only = TRUE,
keep = TRUE
),
by_vars = exprs(USUBJID),
new_vars = NULL,
check_type = "error",
relationship = "many-to-one"
) %>%
verify(nrow(.) == nrow(QS)) %>%
mutate(ADT = as.Date(ADT)) %>%
# Re-code PARAMN
# Assumed all PARAMCD are coded in the meta-specs
create_var_from_codelist(
data = .,
metacore = adqs_metacore,
input_var = PARAMCD,
out_var = PARAMN,
decode_to_code = TRUE
) %>%
# Add analysis timing variable
derive_vars_dy(
reference_date = ENRLDT,
source_vars = exprs(ADT)
) %>%
mutate(
AVISIT = case_when(
ADT <= ENRLDT | !is.na(ABLFL) ~ "BASELINE",
TRUE ~ VISIT
),
AVISITN = case_when(
ADT <= ENRLDT | !is.na(ABLFL) ~ 0,
TRUE ~ ADY
)
) %>%
# # Flag baseline records if the flag is not presented in QS derived dataset
# restrict_derivation(
# derivation = derive_var_extreme_flag,
# args = params(
# by_vars = exprs(STUDYID, USUBJID, PARAMCD),
# order = exprs(ADT),
# new_var = ABLFL,
# mode = "last"
# ),
# filter = !is.na(AVAL) & ADT <= ENRLDT
# ) %>%
# Derive baseline and change from baseline variables
# Only applicable for continuous parameters
derive_var_base(
by_vars = exprs(STUDYID, USUBJID, PARAMCD),
source_var = AVAL,
new_var = BASE
) %>%
# Derive change for post-baseline records
restrict_derivation(
derivation = derive_var_chg,
filter = AVISITN > 0
) %>%
# Derive percentage change for post-baseline records
restrict_derivation(
derivation = derive_var_pchg,
filter = AVISITN > 0
) %>%
# Derive sequence number
derive_var_obs_number(
by_vars = exprs(STUDYID, USUBJID, PARAMCD),
order = exprs(PARAMCD, ADT),
new_var = ASEQ,
check_type = "none" # "error"
) %>%
drop_unspec_vars(
dataset = .,
metacore = adqs_metacore
) %>%
metatools::set_variable_labels(
data = .,
metacore = adqs_metacore
)
ADQS <- ADQS %>%
check_variables(
data = .,
metacore = adqs_metacore
) %>%
check_ct_data(
data = .,
metacore = adqs_metacore,
na_acceptable = TRUE,
omit_vars = c("ENRLFL", "ABLFL", "PARAMCD")
) %>%
order_cols(
data = .,
metacore = adqs_metacore
) %>%
sort_by_key(
data = .,
metacore = adqs_metacore
) %>%
# check_unique_keys(
# data = .,
# metacore = adqs_metacore
# ) %>%
xportr_metadata(
.df = .,
metadata = adqs_metacore,
verbose = "stop"
) %>%
# xportr_type() %>%
xportr_length() %>%
xportr_label()
Analysis Dataset of ADAS-Cog Behaviour - ADADAS
Suppose the ADAS Cognitive Behaviors item-13 total score
(ADASTT13
) was the primary outcome in the analysis, then
ADAS-Cog Behavior Analysis Dataset (ADADAS
) can be created
either directly from ADQS
or with a
dataset-specific metadata-specs (i.e. adding here). The following chuck
illustrate how the ADADAS
can be created directly from
pre-generated ADQS
.
More information is available in the Longitudinal Clinical
Cognitive Outcome Summaries article.
Response Analysis Dataset (ADRS)
ADRS
dataset
contains subject’s clinical diagnostics summary per visits including
death records.
adrs_metacore <- METACORES %>%
select_dataset(dataset = "ADRS", simplify = FALSE)
# Functions to convert character values into numeric
create_rs_aval <- function(x) {
x <- dplyr::case_when(
x %in% "CN" ~ 1,
x %in% "MCI" ~ 2,
x %in% "DEM" ~ 3,
x %in% "DEATH" ~ 4
)
return(x)
}
# Merge ADSL to RS
adsl_vars <- exprs(STUDYID, USUBJID, ENRLFL, ENRLDT, DTHFL)
ADRS_PREP <- RS %>%
filter(RSTESTCD %in% "DX") %>%
# Adjusting BLFL for subjects that have more than one flag
group_by(STUDYID, USUBJID, RSTESTCD) %>%
mutate(NUM_BLFL = sum(!is.na(RSBLFL))) %>%
ungroup() %>%
mutate(RSBLFL = case_when(
NUM_BLFL > 1 & VISITNUM != 1 ~ NA_character_,
TRUE ~ RSBLFL
)) %>%
select(-NUM_BLFL) %>%
mutate(
PARAMCD = RSTESTCD,
PARAM = RSTEST,
AVALC = RSSTRESC,
AVAL = create_rs_aval(RSSTRESC),
ABLFL = RSBLFL,
ADT = as.Date(RSDTC),
ADY = RSDY,
COLPROT = RSGRPID
) %>%
derive_vars_merged(
dataset = .,
dataset_add = ADSL,
new_vars = adsl_vars,
by_vars = exprs(STUDYID, USUBJID)
) %>%
mutate(
AVISIT = case_when(
ADT <= ENRLDT & !is.na(ABLFL) ~ "BASELINE",
TRUE ~ VISIT
),
AVISITN = case_when(
ADT <= ENRLDT & !is.na(ABLFL) ~ 0,
TRUE ~ ADY
)
) %>%
derive_var_base(
dataset = .,
by_vars = exprs(STUDYID, USUBJID, PARAMCD),
source_var = AVALC
) %>%
select(
STUDYID, USUBJID, RSSEQ, COLPROT, PARAMCD, PARAM, AVISIT, AVISITN,
AVALC, AVAL, ABLFL, BASE, ADT, ADY, VISITNUM, VISIT, EPOCH, ENRLFL, DTHFL
)
# Flag subjects that have baseline diagnostics summary and
## populate the flag per subject's records
ADRS_PREP <- ADRS_PREP %>%
call_derivation(
dataset = .,
dataset_add = .,
derivation = derive_vars_merged,
variable_params = list(
params(
filter_add = ABLFL == "Y",
new_vars = exprs(BLBFL = ABLFL)
)
),
by_vars = exprs(STUDYID, USUBJID),
check_type = "error",
relationship = "many-to-one"
)
# Derive death parameter ----
DAETH_PARAM <- ADSL %>%
select(!!!adsl_vars, DTHDT) %>%
# Since month imputations was performed,
# floor the date to the last date of a year
mutate(
DTHDT = ceiling_date(DTHDT, unit = "year") - days(1),
DHDY = as.numeric(DTHDT - ENRLDT),
DHDY = ifelse(DHDY == 0, 1, DHDY)
) %>%
derive_var_merged_ef_msrc(
dataset = .,
by_vars = exprs(STUDYID, USUBJID),
flag_events = list(
flag_event(dataset_name = "ADRS_PREP", condition = BLBFL == "Y")
),
source_datasets = list(ADRS_PREP = ADRS_PREP),
new_var = BLBFL,
true_value = "Y",
)
# Add DEATH parameter ----
ADRS_PREP <- ADRS_PREP %>%
# Add DEATH parameter
derive_extreme_records(
dataset = .,
dataset_add = DAETH_PARAM,
dataset_ref = DAETH_PARAM,
by_vars = exprs(STUDYID, USUBJID),
filter_add = !is.na(DTHDT),
check_type = "error",
exist_flag = AVALC,
true_value = "DEATH",
false_value = NA_character_,
set_values_to = exprs(
PARAMCD = "DEATH",
PARAM = "Death",
AVAL = ifelse(!is.na(DTHDT), 4, NA_real_),
ADT = DTHDT,
ADY = DHDY,
AVISITN = DHDY
)
) %>%
select(-DTHDT, -DHDY) %>%
# Remove missing DEATH paramters
filter(!(PARAMCD %in% "DEATH" & is.na(DTHFL))) %>%
mutate(PARAMN = case_when(
PARAMCD == "DX" ~ 1,
PARAMCD == "DEATH" ~ 2
))
# Flag subjects that have at least one followup diagnostics summary after baseline visit
# Or had death records
ADRS_PREP <- ADRS_PREP %>%
derive_var_merged_exist_flag(
dataset = .,
dataset_add = .,
by_vars = exprs(STUDYID, USUBJID),
filter_add = !ABLFL %in% "Y" & !is.na(AVALC) & (!EPOCH %in% "Screening" | AVISITN < 0),
condition = BLBFL %in% "Y" | is.na(BLBFL),
new_var = FOLLOWPFL,
false_value = "N",
missing_value = "M"
)
# Identify records that will be used for the analysis: -----
# Enrolled subjects (ENRLFL: "Y"),
# Have a baseline diagnostics summary (BLBFL: "Y"), and
# have at least one followup diagnostics summary/death records (FOLLOWPFL: "Y")
ADRS_PREP <- ADRS_PREP %>%
mutate(
ANL01FL = case_when(
ENRLFL == "Y" & BLBFL == "Y" & FOLLOWPFL == "Y" & !EPOCH %in% "Screening" ~ "Y",
ENRLFL == "Y" & BLBFL == "Y" & FOLLOWPFL == "Y" & EPOCH %in% "Screening" & ABLFL %in% "Y" ~ "Y"
)
) %>%
select(-ENRLFL, -ENRLDT, -DTHFL)
ADRS <- ADRS_PREP %>%
# Merge ADSL ADSL dataset
derive_vars_merged(
dataset = .,
dataset_add = single_build_from_derived(
metacore = adrs_metacore,
dataset_name = "ADRS",
ds_list = list("ADSL" = ADSL),
predecessor_only = TRUE,
keep = TRUE
),
by_vars = exprs(STUDYID, USUBJID),
check_type = "error",
relationship = "many-to-one"
) %>%
drop_unspec_vars(
dataset = .,
metacore = adrs_metacore
) %>%
metatools::set_variable_labels(
data = .,
metacore = adrs_metacore
)
ADRS <- ADRS %>%
check_variables(
data = .,
metacore = adrs_metacore
) %>%
check_ct_data(
data = .,
metacore = adrs_metacore,
na_acceptable = TRUE,
omit_vars = c("ENRLFL", "ABLFL", "PARAMCD")
) %>%
order_cols(
data = .,
metacore = adrs_metacore
) %>%
sort_by_key(
data = .,
metacore = adrs_metacore
) %>%
xportr_metadata(
.df = .,
metadata = adrs_metacore,
verbose = "stop"
) %>%
# xportr_type() %>%
xportr_length() %>%
xportr_label()