library(shiny)
library(tidyverse)
library(sn)
library(ggpubr)
library(shinydashboard)
library(MASS)
library(WVPlots)
# setwd("~/TquanT/tquant2019-transformation")
source("QQplot.R")
source("scatter_with_marginal_density.R")
source("data_generation.R")
source("utils.R")
source("plots.R")
ui <- dashboardPage(
dashboardHeader(
title = "Data Transformations",
titleWidth = "100%"),
dashboardSidebar(
sidebarMenu(
menuItem("Introduction page", tabName = "theory",
icon = icon("info-circle")),
menuItem("Visualization", tabName = "turn", badgeLabel = "Try me!",
badgeColor = "blue", icon = icon("pencil-square-o")),
menuItem("Simulation tests", tabName = "sim", badgeLabel = "Watch me!",
badgeColor = "blue", icon = icon("pencil-square-o"))
)
),
dashboardBody(
tabItems(
tabItem(tabName = "theory",
includeScript("../../../Matomo-tquant.js"),
fluidRow(
column(7,
h3(img(src="https://gupsych.github.io/tquant/files/tquant_logo.png", width = "100", height = "100", align = "center")),
withMathJax(),
div(style = "font-size:125%",
"
This Shiny App performs five different transformation techniques and explore the strength and weakness of each method compare to the truth model and each other.
We include five transformation techniques to transform the data: Log transformation, Reciprocal transformation, Squared transformation, Square-root transformation, Box-Cox power transformation(this is a way to transform non-normal dependent variables into a normal shape).
The truth model underlying the data is Y=Bx+e, where the independent variables X is sampled from normal distribution, B is 0.2 and error term e is drew from a skewed normal distribution with user adjusted parameters: sample size and alpha(control skewness).
")
),
column(5,
box(title=h4("Developed by:"),
solidHeader = TRUE,
status = "info",
width = '12',
h4(img(src="https://i2.wp.com/tquant.eu/wp-content/uploads/2016/08/tuebingen_logo.png?resize=56%2C100&ssl=1", width = "50", height = "55"),
" Marc Weitz"),
h4(img(src="https://i2.wp.com/tquant.eu/wp-content/uploads/2016/08/amsterdam_logo.png?resize=100%2C100&ssl=1", width = "50"),
" Arthur Schwebke"),
h4(img(src="https://i2.wp.com/tquant.eu/wp-content/uploads/2016/08/amsterdam_logo.png?resize=100%2C100&ssl=1", width = "50"),
" Adam Finnemann"),
h4(img(src="https://i2.wp.com/tquant.eu/wp-content/uploads/2016/08/leuven_logo.png?resize=150%2C100&ssl=1", width="50",height="45"),
" Shuo Sun")
)
)
)
),
tabItem(tabName = "turn",
fluidRow(
column(6,div(style = "height:450px;background-color: lightblue;",
"This tab shows the theoretical sampling distribution as well as the empirical sample (top-right).
The sample size and skewness of the sample can be controlled by the 'alpha' and 'sample size' parameter.
Scatter+histogram and qq plots allows for visual inspection of five transformations' effect.",
# checkboxGroupInput("trans",
# h3("Choose transformation"),
# choices = list("Log transformation" = 1,
# "Recirpocal transformation" = 2,
# "Square-root transformation" = 3,
# "sqaured transformation" = 4,
# "Box-cox transformation" = 5),
# selected = c(1,2,3,4,5)),
sliderInput("sample_size",
h3("Choose sample_size"),
min = 1, max = 5000, value = 500),
sliderInput("alpha",
h3("Choose alpha"),
min = 0, max = 150, value = 20)
)),
column(6,div(h3("Theoretical and empirical distribution"),
plotOutput("sample_distri"))
)
),
### end first row ###
fluidRow(
#condition = "input.trans %in% 1",
condition = "input.trans %in% '1'",
column(6,
h3("Log transformation"),
plotOutput("log_plot")
),
column(6,
h3("Log transformation QQ-plot"),
plotOutput("log_qq"))
),
### end second row ###
fluidRow(
column(6,
h3("Reciprocal transformation"),
plotOutput("reci_plot")
),
column(6,
h3("Reciprocal transformation QQ-plot"),
plotOutput("reci_qq"))
),
### end third row ###
fluidRow(
column(6,
h3("Squared transformation"),
plotOutput("square_plot")
),
column(6,
h3("Squared transformation QQ-plot"),
plotOutput("square_qq"))
),
### end fourth row ###
fluidRow(
column(6,
h3("Square-root transformation"),
plotOutput("sqrt_plot")
),
column(6,
h3("Square-root transformation QQ-plot"),
plotOutput("sqrt_qq"))
),
### end fith row ###
fluidRow(
column(6,
h3("Box transformation"),
plotOutput("boxcox_plot")
),
column(6,
h3("Box transformation QQ-plot"),
plotOutput("boxcox_qq"))
)
),
tabItem(tabName = "sim",
column(3,
"The plot shows the MSE of a 10 fold cross-validation for each model. Each model is fitted on differently transformed data (log, reciprocal, squared, square-root, identity and boxcox"
),
column(9,
plotOutput("cv"))
)
)
)
)
server <- function(input, output, session) {
sample <- reactive({
print(input$trans)
sample <- rsn(n = input$sample_size, xi= 0, omega = 20, alpha = input$alpha, tau=0, dp=NULL)
df <- generate.data(sample)
df <- cbind(df, transform_data(df$Y))
})
output$sample_distri <- renderPlot({
scatter_with_marginal_density(sample(),"X","Y")
})
output$log_qq <- renderPlot({
QQplot(sample()$logx)
})
output$log_plot <- renderPlot({
scatter_with_marginal_density(sample(),"X","logx")
})
output$reci_qq <- renderPlot({
QQplot(sample()$recip)
})
output$reci_plot <- renderPlot({
scatter_with_marginal_density(sample(),"X","recip")
})
output$sqrt_qq <- renderPlot({
QQplot(sample()$sqrtx)
})
output$sqrt_plot <- renderPlot({
scatter_with_marginal_density(sample(),"X","sqrtx")
})
output$square_qq <- renderPlot({
QQplot(sample()$x2)
})
output$square_plot <- renderPlot({
scatter_with_marginal_density(sample(),"X","x2")
})
output$boxcox_qq <- renderPlot({
QQplot(sample()$box)
})
output$boxcox_plot <- renderPlot({
scatter_with_marginal_density(sample(),"X","box")
})
output$cv <-renderPlot({
#sample <- rsn(n = 50, xi= 0, omega = 20, alpha = 30, tau=0, dp=NULL)
#df <- generate.data(sample)
#df <- cbind(df, transform_data(df$Y))
df <- sample()
result <- data.frame(log = kfold(df$X, df$Y, trans = function(x) log(x), exp),
reciprocal=kfold(df$X, df$Y, function(x) 1/x, function(x) 1/x),
sqrt=kfold(df$X, df$Y, sqrt, function(x) x^2),
#square=kfold(df$X, df$Y, function(x) x^2, sqrt),
identity=kfold(df$X, df$Y, identity, identity))
#boxcox=kfold(df$X, df$Y, function(x) boxcox_transform(x, lambda=calc_lambda(df$Y)), function(x) boxcox_inverse(x, lambda=calc_lambda(df$Y))))
#boxcox=kfold(df$X, df$Y, function(x) boxcox_transform(x, lambda=calc_lambda(df$Y)), boxcox.inv(x, calc_lambda(df$Y))))
agg.result <- data.frame(mean=colMeans(result),
se=apply(result,2,function(x) sd(x)/sqrt(length(x))))
kfold_boxplot(agg.result$mean, agg.result$se, rownames(agg.result))
print(agg.result$mean)
})
}
shinyApp(ui, server)
#n<-40
#location<-0
#omega<-1
#alpha<-6
#sample<-rsn(n = n, xi = location, omega = omega ,alpha = alpha)
generate.data<-function(sample){
X<-rnorm(length(sample),100,15)
beta<-0.2
Y<-beta*X+sample
outcome<-data.frame(X,Y)
return(outcome)
}
kfold_boxplot<-function(means, ses, transformations) {
xi <- 1:length(transformations)
plot(xi, means, type="n", axes=F,
ylab="Mean Squared Error", xlab="Applied Transformation",
ylim=c(0,max(means)+1.1*max(ses)), xlim=c(0.75, length(xi)+.25))
points(means ~ xi, pch=16, col="blue")
arrows(xi, means - ses, xi, means + ses,
code = 3, col = "blue", angle = 90, length = .1,
lwd = 2)
axis(1, labels = transformations, at = xi)
axis(2, las=1)
graphics::box()
}
#n = sample size
#xi = location
#omega = scale
#aplha = asymmetry
#n<-20
#location<-0
#omega<-1
#alpha<-0
#trans.sample<-rsn(n = n, xi = location, omega = omega ,alpha = alpha)
QQplot<-function(sample){
test<-shapiro.test(sample)
main<-paste(c("W=",round(test$statistic, 2),", p=",round(test$p.value,2)), collapse="")
qqnorm(sample, main=main, bty="l")
qqline(sample)
}
#QQplot(trans.sample)
scatter_with_marginal_density<-function(data,X,Y){
#scatter_plot<-ggscatterhist(
# data, x = X, y = Y,
# size = 1.5, alpha = 0.4,
# margin.params = list(size = 0.2,fill = "gray")
#)
beta<-lm(data[,Y] ~ data[,X])$coefficients[2]
scatter_plot2<-ScatterHist(data,X, Y,
smoothmethod="lm",title=paste0("beta=",round(beta,3)))
return(scatter_plot2)
}
transform_data <- function(x, method) {
df <- data.frame(
logx = log(x),
recip = 1/x,
sqrtx = sqrt(x),
x2 = x**2,
box = boxcox_util(x))
return(df)
}
boxcox_util <- function(x) {
estimates <- boxcox(x ~ 1, plotit=FALSE)
lambda <- estimates$x[which.max(estimates$y)]
if(lambda != 0) {
return((x^lambda-1)/lambda)
} else {
return(log(x))
}
}
add_bins <- function(data, n_bins, seed) {
n <- nrow(data)
bins <- 1:n_bins
k <- ceiling(n / n_bins)
set.seed(seed)
data$bin <- sample(rep(bins, k)[1:n])
return(data)
}
split_data <- function(data, bin) {
list(train = data[data$bin != bin, ],
test = data[data$bin == bin, ])
}
kfold<-function(x,y, trans, invtrans, k=10){
data <- data.frame(x=x,y=trans(y))
bindata<-add_bins(data, k, 1)
result <- numeric(k)
#c_result <- numeric(k)
for(i in 1:k){
train_test<-split_data(bindata, i)
train <- train_test$train
test <- train_test$test
lm.fit <- lm(y~x, data=train)
y_hat <- invtrans(predict(lm.fit, newdata=test))
mse <- function(x1,x2) {
mean((x1 - x2)^2)
}
result[i] <- mse(test$y, y_hat)
#c_result[i] <- invtrans(coef(lm.fit)[[2]])
}
return(result)
#return(c(result,c_result))
}
calc_lambda <- function(x) {
estimates <- boxcox(x ~ 1, plotit=FALSE)
lambda <- estimates$x[which.max(estimates$y)]
return(lambda)
}
boxcox_transform <- function(x, lambda) {
if(lambda != 0) {
return((x^lambda-1)/lambda)
} else {
return(log(x))
}
}
# boxcox_inverse <- function(x, lambda) {
# if(lambda != 0) {
# return(log((x * lambda + 1), base=lambda))
# } else {
# return(exp(x))
# }
# }