## ###################################################################################
## Author: Oleg Sofrygin
## Maintainer: Oleg Sofrygin (Oleg.Sofrygin@kp.org)

## Copyright 2016 Oleg Sofrygin

## This R program is free software: you can redistribute it and/or modify it under the terms of 
## the GNU General Public License as published by the Free Software Foundation, either version 3
## of the License, or (at your option) any later version.

## This R program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; 
## without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
## See the GNU General Public License for  more details.
 
## See <http://www.gnu.org/licenses/> for a copy of the GNU General Public License.  
## ###################################################################################


# DO NOT EDIT THIS LINE:
## @knitr long_to_wide_macro

###################################################################################
### convert_long_to_wide.R
# This macro does several things:

# 1) Takes data in long format (dat) and returns a list of two dataframes: dat_wide, regim_wide
#   *)  dat_wide: wide format dataframe, structured as: (ID_var, W, L(t), C(t), A(t), newY(t))
#   *)  regim_wide: wide format regimens dataframe, structured as: (ID_var, regimens)
# 2) Removes all observations with follow-up over tint_max-1
#   *)  only keeps observations which satisfy: dat[,t_var] < tint_max
# 3) dat_wide gets new outcome, which is defined by shifting oldY outcome backward in time by 1
#   *) new outcome gets named according to newY argument
#   *) newY(t)=oldY(t+1) for all t=0,...,t_max
# 4) Once the outcome is observed, newY(t)=1, it is carried forward until the end of FUP
# NOT IMPLEMENTED YET:
# 5) Once censoring is observed, C(t)=1, all variables to the right are set to NA (except for outcome)
# 6) Converts all censoring variables (C) to factors with levels "censored" / "uncensored"
#   *) input data should be coded as C=1 for censoring events
###################################################################################
### arguments:
#
# dat - original data frame in long format (repeated measures per observation)
# ID_var - unique patient identifier
# t_var - follow-up time variable
# tint_max - max number of time intervals to be kept in output data
# W - vector of baseline covariate names
# L - vector of time-varying covariate names
# C - vector of censoring names
# A - treatment name
# oldY - name of the outcome variable in input data
# newY - name for the redefined outcome in new data (can be the same as oldY)
# regimens - vector of variable names defining regimens in input data
###################################################################################

f_Long_to_Wide <- function(dat, ID_var, t_var, tint_max, W, L, C, A, oldY, newY, regimens) {
    #--------------------------------------------------------------------------------
    # TIME ORDERING STRUCTURE:
    #--------------------------------------------------------------------------------
    finTD_vars <- c(L, C, A, newY)  # TIME ORDERING OF TIME-DEP VARS
    keep_vars <- c(ID_var,t_var,W,finTD_vars) # LIST OF VARS THAT WILL MAKE IT INTO FINAL DATASET
    #--------------------------------------------------------------------------------
    # shift Y with Y_tm1(t) = Y(t+1)
    #--------------------------------------------------------------------------------
    Y_tm1 <- by(dat[,c(ID_var,t_var,Y)], dat[,ID_var], function(sub_dat) {
                      sub_dat[nrow(sub_dat)-1,Y] <- sub_dat[nrow(sub_dat),Y]
                      return(t(sub_dat[-nrow(sub_dat),]))
                      })
    Y_tm1_mtx <- matrix(unlist(Y_tm1), ncol = 3, byrow = TRUE)
    Y_tm1_df <- data.frame(Y_tm1_mtx[,1], 
                          as.integer(Y_tm1_mtx[,2]), 
                          as.integer(Y_tm1_mtx[,3]))
    names(Y_tm1_df) <- c(ID_var, t_var, newY)
    #--------------------------------------------------------------------------------
    # remove extra covars, remove last row for all obs and keep new def of Y:
    #--------------------------------------------------------------------------------
    # add new Y def removing last row for each patient
    dat_newY <- merge(dat, Y_tm1_df, by=c(ID_var,t_var), all=F, sort=F)
    #--------------------------------------------------------------------------------
    # Set all censoring, treatment and Y nodes to NA after censoring event
    #--------------------------------------------------------------------------------
    dat_newY[dat_newY[, C[1]]%in%1, c(C[2:3],A,newY)] <- NA
    # print(head(dat_newY[dat_newY[, C[1]]%in%1, c(C,A,newY)],100))
    dat_newY[dat_newY[, C[2]]%in%1, c(C[3],A,newY)] <- NA
    # print(head(dat_newY[dat_newY[, C[2]]%in%1, c(C,A,newY)],100))
    dat_newY[dat_newY[, C[3]]%in%1, c(A,newY)] <- NA
    # print(head(dat_newY[dat_newY[, C[3]]%in%1, c(C,A,newY)],100))
    #--------------------------------------------------------------------------------
    # remove obs that appear at or after tmax (keeps t=[0,...,tmax-1])
    # keep final var names
    # convert to wide fmt
    #--------------------------------------------------------------------------------
    dat_tmax <- dat_newY[(dat_newY[,t_var] < tint_max),names(dat_newY)%in%keep_vars]
    dat_wide <- reshape(data=dat_tmax, v.names=finTD_vars, timevar=t_var, idvar=ID_var, direction='wide', sep="_")
    # save regimen flags separately (also convert to wide fmt)
    dat_tmax_regim <- dat_newY[(dat_newY[,t_var]<tint_max),names(dat_newY)%in%c(ID_var,t_var,regimens)]
    regim_wide <- reshape(data=dat_tmax_regim, v.names=regimens, timevar=t_var, idvar=ID_var, direction='wide', sep="_")
    #--------------------------------------------------------------------------------
    # Some attributes on wide format data
    #--------------------------------------------------------------------------------
    times <- attributes(dat_wide)$reshapeWide$times  # vector of times
    resTD_vars <- attributes(dat_wide)$reshapeWide$v.names # time-dep vars before rename
    varst_mtx <- attributes(dat_wide)$reshapeWide$varying  # matrix of time by timevar names after rename (row=vars, col=time)
    #--------------------------------------------------------------------------------
    # Carry forward Y if Y[t_i]=1 for some t_i
    #--------------------------------------------------------------------------------
    Ynodes <- varst_mtx[which(resTD_vars%in%newY),]
    Y_vals <- dat_wide[,Ynodes]
    t_max <- ncol(Y_vals)
    Y1fill <- apply(Y_vals, 1, function(Yrow) {
                              ind_1 <- which(Yrow%in%1)
                              if (length(ind_1)>0) {
                                if (ind_1<t_max) Yrow[(ind_1+1):t_max] <- rep(1, t_max-ind_1)
                              }
                              return(t(Yrow))
              })
    Y1fill <- t(Y1fill)
    colnames(Y1fill) <- colnames(Y_vals)
    dat_wide[,Ynodes] <- Y1fill
    #--------------------------------------------------------------------------------
    # A) Recode censoring vars with correct levels ('censored' / 'uncensored')
    #--------------------------------------------------------------------------------
    # library(ltmle)
    # print("...using BinaryToCensoring() to recode censoring vars ...")
    Cnodes <- unlist(apply(varst_mtx[which(resTD_vars%in%C),],2,list))
    for (Cvar in Cnodes) {
      dat_wide[,Cvar] <- BinaryToCensoring(is.censored=dat_wide[,Cvar])
    }
    #--------------------------------------------------------------------------------
    # B) Recode all censoring nodes C as integers with 1=uncensored; 0=censored
    # this is the format used inside ltmle()
    #--------------------------------------------------------------------------------
    # Cnodes <- unlist(apply(varst_mtx[which(resTD_vars%in%C),],2,list))
    # for (Cvar in Cnodes) {
    #   dat_wide[,Cvar] <- 1-dat_wide[,Cvar]
    # }
    return(list(dat_wide=dat_wide, regim_wide=regim_wide))
}

# Helper function to generate censoring as factors
BinaryToCensoring <- function(is.censored, is.uncensored) {
  if (! xor(missing(is.censored), missing(is.uncensored))) stop("exactly one of is.censored and is.uncensored must be passed")
  calling.name <- names(sys.call(0))[2]
  if (length(calling.name) == 0 || ! calling.name %in% c("is.censored", "is.uncensored")) stop("the argument to BinaryToCensoring must be completely named - see ?BinaryToCensoring")
  if (missing(is.uncensored)) {
    is.uncensored <- ! is.censored
  }
  if (! all(is.uncensored %in% c(0, 1, NA))) stop("the argument to BinaryToCensoring should be binary (0, 1, or NA) or logical")
  y <- character(length(is.uncensored))
  y[is.uncensored == 0] <- "censored"
  y[is.uncensored == 1] <- "uncensored"
  y[is.na(is.uncensored)] <- NA
  return(factor(y))
}