## ###################################################################################
## Author: Oleg Sofrygin
## Maintainer: Oleg Sofrygin (Oleg.Sofrygin@kp.org)
## Copyright 2016 Oleg Sofrygin
## This R program is free software: you can redistribute it and/or modify it under the terms of
## the GNU General Public License as published by the Free Software Foundation, either version 3
## of the License, or (at your option) any later version.
## This R program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
## without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
## See the GNU General Public License for more details.
## See for a copy of the GNU General Public License.
## ###################################################################################
# DO NOT EDIT THIS LINE:
## @knitr long_to_wide_macro
###################################################################################
### convert_long_to_wide.R
# This macro does several things:
# 1) Takes data in long format (dat) and returns a list of two dataframes: dat_wide, regim_wide
# *) dat_wide: wide format dataframe, structured as: (ID_var, W, L(t), C(t), A(t), newY(t))
# *) regim_wide: wide format regimens dataframe, structured as: (ID_var, regimens)
# 2) Removes all observations with follow-up over tint_max-1
# *) only keeps observations which satisfy: dat[,t_var] < tint_max
# 3) dat_wide gets new outcome, which is defined by shifting oldY outcome backward in time by 1
# *) new outcome gets named according to newY argument
# *) newY(t)=oldY(t+1) for all t=0,...,t_max
# 4) Once the outcome is observed, newY(t)=1, it is carried forward until the end of FUP
# NOT IMPLEMENTED YET:
# 5) Once censoring is observed, C(t)=1, all variables to the right are set to NA (except for outcome)
# 6) Converts all censoring variables (C) to factors with levels "censored" / "uncensored"
# *) input data should be coded as C=1 for censoring events
###################################################################################
### arguments:
#
# dat - original data frame in long format (repeated measures per observation)
# ID_var - unique patient identifier
# t_var - follow-up time variable
# tint_max - max number of time intervals to be kept in output data
# W - vector of baseline covariate names
# L - vector of time-varying covariate names
# C - vector of censoring names
# A - treatment name
# oldY - name of the outcome variable in input data
# newY - name for the redefined outcome in new data (can be the same as oldY)
# regimens - vector of variable names defining regimens in input data
###################################################################################
f_Long_to_Wide <- function(dat, ID_var, t_var, tint_max, W, L, C, A, oldY, newY, regimens) {
#--------------------------------------------------------------------------------
# TIME ORDERING STRUCTURE:
#--------------------------------------------------------------------------------
finTD_vars <- c(L, C, A, newY) # TIME ORDERING OF TIME-DEP VARS
keep_vars <- c(ID_var,t_var,W,finTD_vars) # LIST OF VARS THAT WILL MAKE IT INTO FINAL DATASET
#--------------------------------------------------------------------------------
# shift Y with Y_tm1(t) = Y(t+1)
#--------------------------------------------------------------------------------
Y_tm1 <- by(dat[,c(ID_var,t_var,Y)], dat[,ID_var], function(sub_dat) {
sub_dat[nrow(sub_dat)-1,Y] <- sub_dat[nrow(sub_dat),Y]
return(t(sub_dat[-nrow(sub_dat),]))
})
Y_tm1_mtx <- matrix(unlist(Y_tm1), ncol = 3, byrow = TRUE)
Y_tm1_df <- data.frame(Y_tm1_mtx[,1],
as.integer(Y_tm1_mtx[,2]),
as.integer(Y_tm1_mtx[,3]))
names(Y_tm1_df) <- c(ID_var, t_var, newY)
#--------------------------------------------------------------------------------
# remove extra covars, remove last row for all obs and keep new def of Y:
#--------------------------------------------------------------------------------
# add new Y def removing last row for each patient
dat_newY <- merge(dat, Y_tm1_df, by=c(ID_var,t_var), all=F, sort=F)
#--------------------------------------------------------------------------------
# Set all censoring, treatment and Y nodes to NA after censoring event
#--------------------------------------------------------------------------------
dat_newY[dat_newY[, C[1]]%in%1, c(C[2:3],A,newY)] <- NA
# print(head(dat_newY[dat_newY[, C[1]]%in%1, c(C,A,newY)],100))
dat_newY[dat_newY[, C[2]]%in%1, c(C[3],A,newY)] <- NA
# print(head(dat_newY[dat_newY[, C[2]]%in%1, c(C,A,newY)],100))
dat_newY[dat_newY[, C[3]]%in%1, c(A,newY)] <- NA
# print(head(dat_newY[dat_newY[, C[3]]%in%1, c(C,A,newY)],100))
#--------------------------------------------------------------------------------
# remove obs that appear at or after tmax (keeps t=[0,...,tmax-1])
# keep final var names
# convert to wide fmt
#--------------------------------------------------------------------------------
dat_tmax <- dat_newY[(dat_newY[,t_var] < tint_max),names(dat_newY)%in%keep_vars]
dat_wide <- reshape(data=dat_tmax, v.names=finTD_vars, timevar=t_var, idvar=ID_var, direction='wide', sep="_")
# save regimen flags separately (also convert to wide fmt)
dat_tmax_regim <- dat_newY[(dat_newY[,t_var]0) {
if (ind_1