# CARS Procedure Vignette (Simulation Setting 1)

#### 2019-01-19

This vignette illustrates the basic usage of CARS. For the sake of simplicity, we will consider the Simulation Setting 1 in the referenced paper.

# global parameters
m <- 5000; # number of tests
n_x <- 50; # number of samples for each location in x
n_y <- 60; # number of samples for each location in y
Nrep <- 50; # number of replications
alpha <- 0.05; # nominal level fdr

# next we generate the data matrices for x and y
# for illustrative purposes, we choose k=500
k <- 500;
mu_x_1 <- 5/sqrt(30); # signal strength for first 500 locations of x
mu_y_1 <- 3/sqrt(30); # signal strength for first 500 locations of y
mu_x_2 <- 4/sqrt(30); # signal strength for location 501 to 1000 of x
mu_y_2 <- 4/sqrt(30); # signal strength for location 501 to 1000 of y
sd_x <- 1; # standard deviation for all locations of x
sd_y <- sqrt(2); # standard deviation for all locations of y
theta <- rep(0,m); # the vector for true states of nature
theta[1:k] <- 1; # signal locations: first 500
# initialize the vector of false discovery proportions
FDP <- rep(0,Nrep);
# initialize the vector of missed discovery proportions
MDP <- rep(0,Nrep);

Next, we iterate the CARS procedure and calculate the corresponding FDR and MDR level:

library(CARS)
library(np)
## Nonparametric Kernel Methods for Mixed Datatypes (version 0.60-8)
## [vignette("np",package="np") an overview]
## [vignette("entropy_np",package="np") an overview of entropy-based methods]
for (i in 1:Nrep){
# initialize matrices for x and y
x <- matrix(rnorm(m*n_x,0,sd_x),m,n_x);
y <- matrix(rnorm(m*n_y,0,sd_y),m,n_y);

# fill in data matrices according to Setting 1 (k=500)
x[c(1:k),] <- x[c(1:k),] + mu_x_1;
y[c(1:k),] <- y[c(1:k),] + mu_y_1;
x[c((k+1):(2*k)),] <- x[c((k+1):(2*k)),] + mu_x_2;
y[c((k+1):(2*k)),] <- y[c((k+1):(2*k)),] + mu_y_2;

# CARS procedure at level alpha
decision <- CARS(x,y,alpha,option='regular')$de; # false discovery proportion FDP[i] <- sum((1-theta)*decision)/max(sum(decision),1); # missed discovery proportion MDP[i] <- sum(theta*(1-decision))/max(sum(theta),1); } #finally we obtain the FDR and MDR levels by averaging the proportions over Nrep=50 replications FDR <- mean(FDP); MDR <- mean(MDP); FDR; ##  0.0523021 MDR; ##  0.77688 Notice the FDR is well controlled under nominal level 0.05. For the sparse case, we inrease the dimension m and reduce the number of signals k (other parameters are same as above): # global parameters m <- 10000; # number of tests n_x <- 50; # number of samples for each location in x n_y <- 60; # number of samples for each location in y Nrep <- 50; # number of replications alpha <- 0.05; # nominal level fdr k <- 10; mu_x_1 <- 5/sqrt(30); # signal strength for first 500 locations of x mu_y_1 <- 3/sqrt(30); # signal strength for first 500 locations of y mu_x_2 <- 4/sqrt(30); # signal strength for location 501 to 1000 of x mu_y_2 <- 4/sqrt(30); # signal strength for location 501 to 1000 of y sd_x <- 1; # standard deviation for all locations of x sd_y <- sqrt(2); # standard deviation for all locations of y theta <- rep(0,m); # the vector for true states of nature theta[1:k] <- 1; # signal locations: first 500 # initialize the vector of false discovery proportions FDP <- rep(0,Nrep); # initialize the vector of missed discovery proportions MDP <- rep(0,Nrep); Next, we iterate the CARS procedure and calculate the corresponding FDR and MDR level using the sparse option: library(CARS) library(np) for (i in 1:Nrep){ # initialize matrices for x and y x <- matrix(rnorm(m*n_x,0,sd_x),m,n_x); y <- matrix(rnorm(m*n_y,0,sd_y),m,n_y); # fill in data matrices according to Setting 1 (k=500) x[c(1:k),] <- x[c(1:k),] + mu_x_1; y[c(1:k),] <- y[c(1:k),] + mu_y_1; x[c((k+1):(2*k)),] <- x[c((k+1):(2*k)),] + mu_x_2; y[c((k+1):(2*k)),] <- y[c((k+1):(2*k)),] + mu_y_2; # CARS procedure at level alpha decision <- CARS(x,y,alpha,option='sparse')$de;

# false discovery proportion
FDP[i] <- sum((1-theta)*decision)/max(sum(decision),1);

# missed discovery proportion
MDP[i] <- sum(theta*(1-decision))/max(sum(theta),1);
}

#finally we obtain the FDR and MDR levels by averaging the proportions over Nrep=50 replications

FDR <- mean(FDP);
MDR <- mean(MDP);
FDR;
##  0
MDR;
##  0.982