# Study design ----
# The simulated grid has one row per participant x study day x signal.
n_participants <- 100
n_days <- 14
n_signals <- 5

# Reproducibility ----
# Fix the RNG state so that every run of the script draws the same values.
set.seed(1707)
#' Simulate an EMA dataset, optionally with missing alcohol-use reports
#'
#' Builds one row per participant x study day x signal and fills in:
#' * `mood` at every signal (within a day each value is 0.7 times the previous
#'   signal's mood plus noise),
#' * `alc.use` (0/1) at signal 1 only, with lower mood raising the
#'   probability of drinking,
#' * `alc.intox` at the last signal of the day only (0 if no drinking that
#'   day, otherwise 1 plus an exponential draw),
#' * `prev_day_intox`, the previous day's `alc.intox` for the same
#'   participant, repeated on every row of the day (`NA` on study day 1).
#'
#' The function reads the globals `n_participants`, `n_days` and `n_signals`
#' and uses dplyr verbs and `%>%`, so dplyr must be attached. It consumes
#' random numbers; call `set.seed()` beforehand for reproducible output.
#'
#' @param missing If `TRUE`, signal-1 `alc.use` values are set to `NA` with a
#'   probability that starts at 0.25, rises with the previous day's
#'   intoxication (+0.2 per unit), falls with current mood (-0.10 per unit),
#'   rises by 0.25 when the participant drank, and is clamped to [0, 0.95].
#'   If `FALSE`, no values are removed.
#' @return The simulated data with columns `PID`, `study.day`, `EMA.signal`,
#'   `mood_effect`, `alc_use_effect`, `mood`, `alc.use`, `alc.intox` and
#'   `prev_day_intox`, sorted by participant, day and signal.
simulate_ema_data <- function(missing = TRUE) {
  # Full participant x day x signal grid, sorted so that each participant's
  # rows are contiguous and chronological. The row arithmetic in the
  # generation loop and the lag further down both rely on this ordering.
  ema_data <- expand.grid(
    PID = seq_len(n_participants),
    study.day = seq_len(n_days),
    EMA.signal = seq_len(n_signals)
  ) %>%
    arrange(PID, study.day, EMA.signal)

  # Person-level random effects
  person_effects <- data.frame(
    PID = seq_len(n_participants),
    mood_effect = rnorm(n_participants, 0, 0.5),
    alc_use_effect = rnorm(n_participants, 0, 0.5)
  )
  ema_data <- ema_data %>%
    left_join(person_effects, by = "PID")

  # Generate into pre-allocated vectors rather than writing into the data
  # frame one cell at a time.
  n_rows <- nrow(ema_data)
  mood <- rep(NA_real_, n_rows)
  alc_use <- rep(NA_integer_, n_rows)
  alc_intox <- rep(NA_real_, n_rows)

  for (pid in seq_len(n_participants)) {
    for (day in seq_len(n_days)) {
      # Rows of this participant-day, in signal order (valid because the grid
      # is complete and sorted by PID, study.day, EMA.signal).
      day_rows <- ((pid - 1) * n_days + (day - 1)) * n_signals +
        seq_len(n_signals)
      first_row <- day_rows[1]

      # Signal 1: mood is the day's baseline (random draw + person effect)
      # plus a small amount of noise.
      base_mood <- rnorm(1, 0, 1) + person_effects$mood_effect[pid]
      mood[first_row] <- base_mood + rnorm(1, 0, 0.3)

      # Alcohol use is only generated at signal 1 and depends on mood
      # (lower mood -> higher probability of drinking).
      logit_p <- -1 + (-0.34 * mood[first_row]) +
        person_effects$alc_use_effect[pid]
      prob <- exp(logit_p) / (1 + exp(logit_p))
      drank <- rbinom(1, 1, prob)
      alc_use[first_row] <- drank

      # Later signals: mood carries over from the previous signal.
      for (signal in seq_len(n_signals)[-1]) {
        mood[day_rows[signal]] <- mood[day_rows[signal - 1]] * 0.7 +
          rnorm(1, 0, 0.5)
      }

      # Intoxication is only generated at the last signal of the day: a
      # positive level if they drank that day, otherwise 0.
      alc_intox[day_rows[n_signals]] <- if (isTRUE(drank == 1)) {
        rexp(1, rate = 1 / 2) + 1
      } else {
        0
      }
    }
  }

  ema_data$mood <- mood
  ema_data$alc.use <- alc_use
  ema_data$alc.intox <- alc_intox

  # Previous day's intoxication, for use in the missingness pattern.
  # alc.intox is only recorded on the last signal of a day, so lagging the raw
  # column by n_signals rows would pair each signal-1 row with the previous
  # day's signal-1 row, which is always NA. Spread the end-of-day value over
  # the whole day first, then shift by one day's worth of rows.
  ema_data <- ema_data %>%
    group_by(PID, study.day) %>%
    mutate(day_intox = alc.intox[EMA.signal == n_signals]) %>%
    group_by(PID) %>%
    mutate(prev_day_intox = lag(day_intox, n = n_signals)) %>%
    ungroup() %>%
    select(-day_intox)

  # Apply missingness if specified
  if (missing) {
    # Only signal-1 rows can go missing: that is where alcohol use is asked.
    asked <- which(ema_data$EMA.signal == 1)
    prev_intox <- ema_data$prev_day_intox[asked]
    curr_mood <- ema_data$mood[asked]
    curr_alc_use <- ema_data$alc.use[asked]

    # Unavailable predictors (e.g. no previous day on study day 1) simply
    # contribute nothing to the probability.
    prev_intox <- replace(prev_intox, is.na(prev_intox), 0)
    curr_mood <- replace(curr_mood, is.na(curr_mood), 0)
    drank <- !is.na(curr_alc_use) & curr_alc_use == 1

    # Baseline 0.25; higher after a more intoxicated previous day, lower with
    # better current mood, and much higher when they actually drank.
    miss_prob <- 0.25 + 0.2 * prev_intox - 0.10 * curr_mood + 0.25 * drank

    # Keep the probability within [0, 0.95] so no report is certain to vanish.
    miss_prob <- pmax(0, pmin(0.95, miss_prob))

    # One uniform draw per signal-1 row, in row order.
    is_dropped <- runif(length(asked)) < miss_prob
    ema_data$alc.use[asked[is_dropped]] <- NA
  }

  ema_data
}
# Simulate one dataset with missingness and one without.
# NOTE(review): the second call continues the RNG stream left by the first, so
# the two datasets are separate simulations rather than the same data with and
# without missing values -- confirm that this is intended.
ema_data_missing <- simulate_ema_data(missing = TRUE)
ema_data_complete <- simulate_ema_data(missing = FALSE)

# Share of signal-1 records (the only rows where alcohol use is asked) whose
# alcohol-use value is missing, expressed as a percentage.
is_first_signal <- ema_data_missing$EMA.signal == 1
alc_use_reports <- ema_data_missing$alc.use[is_first_signal]
missing_percentage <- mean(is.na(alc_use_reports)) * 100

# Report the missingness percentage
cat("Percentage of missing data:", round(missing_percentage, 1), "%\n")
# Console output from a previous run:
# Percentage of missing data: 30.9 %