Data Transformations

This Shiny App performs five different transformation techniques and explore the strength and weakness of each method compare to the truth model and each other. We include five transformation techniques to transform the data: Log transformation, Reciprocal transformation, Squared transformation, Square-root transformation, Box-Cox power transformation(this is a way to transform non-normal dependent variables into a normal shape). The truth model underlying the data is Y=Bx+e, where the independent variables X is sampled from normal distribution, B is 0.2 and error term e is drew from a skewed normal distribution with user adjusted parameters: sample size and alpha(control skewness).

Developed by:

Marc Weitz

Arthur Schwebke

Adam Finnemann

Shuo Sun

This tab shows the theoretical sampling distribution as well as the empirical sample (top-right). The sample size and skewness of the sample can be controlled by the 'alpha' and 'sample size' parameter. Scatter+histogram and qq plots allows for visual inspection of five transformations' effect.

Choose sample_size

Choose alpha

Theoretical and empirical distribution

Log transformation

Log transformation QQ-plot

Reciprocal transformation

Reciprocal transformation QQ-plot

Squared transformation

Squared transformation QQ-plot

Square-root transformation

Square-root transformation QQ-plot

Box transformation

Box transformation QQ-plot

The plot shows the MSE of a 10 fold cross-validation for each model. Each model is fitted on differently transformed data (log, reciprocal, squared, square-root, identity and boxcox

library(shiny) library(tidyverse) library(sn) library(ggpubr) library(shinydashboard) library(MASS) library(WVPlots) # setwd("~/TquanT/tquant2019-transformation") source("QQplot.R") source("scatter_with_marginal_density.R") source("data_generation.R") source("utils.R") source("plots.R") ui <- dashboardPage( dashboardHeader( title = "Data Transformations", titleWidth = "100%"), dashboardSidebar( sidebarMenu( menuItem("Introduction page", tabName = "theory", icon = icon("info-circle")), menuItem("Visualization", tabName = "turn", badgeLabel = "Try me!", badgeColor = "blue", icon = icon("pencil-square-o")), menuItem("Simulation tests", tabName = "sim", badgeLabel = "Watch me!", badgeColor = "blue", icon = icon("pencil-square-o")) ) ), dashboardBody( tabItems( tabItem(tabName = "theory", includeScript("../../../Matomo-tquant.js"), fluidRow( column(7, h3(img(src="https://gupsych.github.io/tquant/files/tquant_logo.png", width = "100", height = "100", align = "center")), withMathJax(), div(style = "font-size:125%", " This Shiny App performs five different transformation techniques and explore the strength and weakness of each method compare to the truth model and each other. We include five transformation techniques to transform the data: Log transformation, Reciprocal transformation, Squared transformation, Square-root transformation, Box-Cox power transformation(this is a way to transform non-normal dependent variables into a normal shape). The truth model underlying the data is Y=Bx+e, where the independent variables X is sampled from normal distribution, B is 0.2 and error term e is drew from a skewed normal distribution with user adjusted parameters: sample size and alpha(control skewness). ") ), column(5, box(title=h4("Developed by:"), solidHeader = TRUE, status = "info", width = '12', h4(img(src="https://i2.wp.com/tquant.eu/wp-content/uploads/2016/08/tuebingen_logo.png?resize=56%2C100&ssl=1", width = "50", height = "55"), " Marc Weitz"), h4(img(src="https://i2.wp.com/tquant.eu/wp-content/uploads/2016/08/amsterdam_logo.png?resize=100%2C100&ssl=1", width = "50"), " Arthur Schwebke"), h4(img(src="https://i2.wp.com/tquant.eu/wp-content/uploads/2016/08/amsterdam_logo.png?resize=100%2C100&ssl=1", width = "50"), " Adam Finnemann"), h4(img(src="https://i2.wp.com/tquant.eu/wp-content/uploads/2016/08/leuven_logo.png?resize=150%2C100&ssl=1", width="50",height="45"), " Shuo Sun") ) ) ) ), tabItem(tabName = "turn", fluidRow( column(6,div(style = "height:450px;background-color: lightblue;", "This tab shows the theoretical sampling distribution as well as the empirical sample (top-right). The sample size and skewness of the sample can be controlled by the 'alpha' and 'sample size' parameter. Scatter+histogram and qq plots allows for visual inspection of five transformations' effect.", # checkboxGroupInput("trans", # h3("Choose transformation"), # choices = list("Log transformation" = 1, # "Recirpocal transformation" = 2, # "Square-root transformation" = 3, # "sqaured transformation" = 4, # "Box-cox transformation" = 5), # selected = c(1,2,3,4,5)), sliderInput("sample_size", h3("Choose sample_size"), min = 1, max = 5000, value = 500), sliderInput("alpha", h3("Choose alpha"), min = 0, max = 150, value = 20) )), column(6,div(h3("Theoretical and empirical distribution"), plotOutput("sample_distri")) ) ), ### end first row ### fluidRow( #condition = "input.trans %in% 1", condition = "input.trans %in% '1'", column(6, h3("Log transformation"), plotOutput("log_plot") ), column(6, h3("Log transformation QQ-plot"), plotOutput("log_qq")) ), ### end second row ### fluidRow( column(6, h3("Reciprocal transformation"), plotOutput("reci_plot") ), column(6, h3("Reciprocal transformation QQ-plot"), plotOutput("reci_qq")) ), ### end third row ### fluidRow( column(6, h3("Squared transformation"), plotOutput("square_plot") ), column(6, h3("Squared transformation QQ-plot"), plotOutput("square_qq")) ), ### end fourth row ### fluidRow( column(6, h3("Square-root transformation"), plotOutput("sqrt_plot") ), column(6, h3("Square-root transformation QQ-plot"), plotOutput("sqrt_qq")) ), ### end fith row ### fluidRow( column(6, h3("Box transformation"), plotOutput("boxcox_plot") ), column(6, h3("Box transformation QQ-plot"), plotOutput("boxcox_qq")) ) ), tabItem(tabName = "sim", column(3, "The plot shows the MSE of a 10 fold cross-validation for each model. Each model is fitted on differently transformed data (log, reciprocal, squared, square-root, identity and boxcox" ), column(9, plotOutput("cv")) ) ) ) ) server <- function(input, output, session) { sample <- reactive({ print(input$trans) sample <- rsn(n = input$sample_size, xi= 0, omega = 20, alpha = input$alpha, tau=0, dp=NULL) df <- generate.data(sample) df <- cbind(df, transform_data(df$Y)) }) output$sample_distri <- renderPlot({ scatter_with_marginal_density(sample(),"X","Y") }) output$log_qq <- renderPlot({ QQplot(sample()$logx) }) output$log_plot <- renderPlot({ scatter_with_marginal_density(sample(),"X","logx") }) output$reci_qq <- renderPlot({ QQplot(sample()$recip) }) output$reci_plot <- renderPlot({ scatter_with_marginal_density(sample(),"X","recip") }) output$sqrt_qq <- renderPlot({ QQplot(sample()$sqrtx) }) output$sqrt_plot <- renderPlot({ scatter_with_marginal_density(sample(),"X","sqrtx") }) output$square_qq <- renderPlot({ QQplot(sample()$x2) }) output$square_plot <- renderPlot({ scatter_with_marginal_density(sample(),"X","x2") }) output$boxcox_qq <- renderPlot({ QQplot(sample()$box) }) output$boxcox_plot <- renderPlot({ scatter_with_marginal_density(sample(),"X","box") }) output$cv <-renderPlot({ #sample <- rsn(n = 50, xi= 0, omega = 20, alpha = 30, tau=0, dp=NULL) #df <- generate.data(sample) #df <- cbind(df, transform_data(df$Y)) df <- sample() result <- data.frame(log = kfold(df$X, df$Y, trans = function(x) log(x), exp), reciprocal=kfold(df$X, df$Y, function(x) 1/x, function(x) 1/x), sqrt=kfold(df$X, df$Y, sqrt, function(x) x^2), #square=kfold(df$X, df$Y, function(x) x^2, sqrt), identity=kfold(df$X, df$Y, identity, identity)) #boxcox=kfold(df$X, df$Y, function(x) boxcox_transform(x, lambda=calc_lambda(df$Y)), function(x) boxcox_inverse(x, lambda=calc_lambda(df$Y)))) #boxcox=kfold(df$X, df$Y, function(x) boxcox_transform(x, lambda=calc_lambda(df$Y)), boxcox.inv(x, calc_lambda(df$Y)))) agg.result <- data.frame(mean=colMeans(result), se=apply(result,2,function(x) sd(x)/sqrt(length(x)))) kfold_boxplot(agg.result$mean, agg.result$se, rownames(agg.result)) print(agg.result$mean) }) } shinyApp(ui, server)

#n<-40 #location<-0 #omega<-1 #alpha<-6 #sample<-rsn(n = n, xi = location, omega = omega ,alpha = alpha) generate.data<-function(sample){ X<-rnorm(length(sample),100,15) beta<-0.2 Y<-beta*X+sample outcome<-data.frame(X,Y) return(outcome) }

kfold_boxplot<-function(means, ses, transformations) { xi <- 1:length(transformations) plot(xi, means, type="n", axes=F, ylab="Mean Squared Error", xlab="Applied Transformation", ylim=c(0,max(means)+1.1*max(ses)), xlim=c(0.75, length(xi)+.25)) points(means ~ xi, pch=16, col="blue") arrows(xi, means - ses, xi, means + ses, code = 3, col = "blue", angle = 90, length = .1, lwd = 2) axis(1, labels = transformations, at = xi) axis(2, las=1) graphics::box() }

#n = sample size #xi = location #omega = scale #aplha = asymmetry #n<-20 #location<-0 #omega<-1 #alpha<-0 #trans.sample<-rsn(n = n, xi = location, omega = omega ,alpha = alpha) QQplot<-function(sample){ test<-shapiro.test(sample) main<-paste(c("W=",round(test$statistic, 2),", p=",round(test$p.value,2)), collapse="") qqnorm(sample, main=main, bty="l") qqline(sample) } #QQplot(trans.sample)

scatter_with_marginal_density<-function(data,X,Y){ #scatter_plot<-ggscatterhist( # data, x = X, y = Y, # size = 1.5, alpha = 0.4, # margin.params = list(size = 0.2,fill = "gray") #) beta<-lm(data[,Y] ~ data[,X])$coefficients[2] scatter_plot2<-ScatterHist(data,X, Y, smoothmethod="lm",title=paste0("beta=",round(beta,3))) return(scatter_plot2) }

transform_data <- function(x, method) { df <- data.frame( logx = log(x), recip = 1/x, sqrtx = sqrt(x), x2 = x**2, box = boxcox_util(x)) return(df) } boxcox_util <- function(x) { estimates <- boxcox(x ~ 1, plotit=FALSE) lambda <- estimates$x[which.max(estimates$y)] if(lambda != 0) { return((x^lambda-1)/lambda) } else { return(log(x)) } } add_bins <- function(data, n_bins, seed) { n <- nrow(data) bins <- 1:n_bins k <- ceiling(n / n_bins) set.seed(seed) data$bin <- sample(rep(bins, k)[1:n]) return(data) } split_data <- function(data, bin) { list(train = data[data$bin != bin, ], test = data[data$bin == bin, ]) } kfold<-function(x,y, trans, invtrans, k=10){ data <- data.frame(x=x,y=trans(y)) bindata<-add_bins(data, k, 1) result <- numeric(k) #c_result <- numeric(k) for(i in 1:k){ train_test<-split_data(bindata, i) train <- train_test$train test <- train_test$test lm.fit <- lm(y~x, data=train) y_hat <- invtrans(predict(lm.fit, newdata=test)) mse <- function(x1,x2) { mean((x1 - x2)^2) } result[i] <- mse(test$y, y_hat) #c_result[i] <- invtrans(coef(lm.fit)[[2]]) } return(result) #return(c(result,c_result)) } calc_lambda <- function(x) { estimates <- boxcox(x ~ 1, plotit=FALSE) lambda <- estimates$x[which.max(estimates$y)] return(lambda) } boxcox_transform <- function(x, lambda) { if(lambda != 0) { return((x^lambda-1)/lambda) } else { return(log(x)) } } # boxcox_inverse <- function(x, lambda) { # if(lambda != 0) { # return(log((x * lambda + 1), base=lambda)) # } else { # return(exp(x)) # } # }