visual structure of a data.frame: locations of NAs and much more

Asked 18/12, 2014 at 11:23 Answered 30/8, 2017 at 4:21

Solved r dataframe ggplot2 na missing-data

I want to represent the structure of a data frame (or matrix, or data.table whatever) on a single plot with color-coding. I guess that could be very useful for many people handling various types of data, to visualize it in a single glance.

Perhaps someone has already developed a package to do it, but I couldn't find one (just this). So here is a rough mockup of my "vision", a kind of heatmap showing, in color codes:

the NA locations,
the class of variables (factors (how many levels?), numeric (with color gradient, zeros, outliers...), strings)
dimensions
etc.....

enter image description here

So far I have only written a function to plot the NA locations. It goes like this:

# Plot the positions of the missing values of a table.
#
# data:  a matrix, or any object as.matrix() can convert (e.g. a data.frame).
# alpha: transparency of the points that mark the NA cells.
#
# Prints the percentage of NA cells and returns a ggplot object holding one
# red point per NA cell (x = column index, y = row index, first row on top).
ggSTR <- function(data, alpha = 0.5) {
  # library() fails loudly when ggplot2 is missing (require() only warns).
  library(ggplot2)
  DF <- if (is.matrix(data)) data else as.matrix(data)
  missing_cells <- is.na(DF)
  n_cells <- length(missing_cells)

  # Keep only the NA cells instead of parking every other cell at x = 0 and
  # relying on the x limits to discard them (which raised warnings).
  na_pos <- which(missing_cells, arr.ind = TRUE)
  to.plot <- data.frame(
    x = unname(na_pos[, "col"]),
    y = unname(na_pos[, "row"])
  )

  # Point size shrinks as the table grows; the lower bound of 2 cells avoids
  # dividing by log(1) = 0 on a one-cell table.
  size <- 20 / log(max(n_cells, 2))

  # Explicit y limits keep the axis spanning the whole table even though only
  # the NA cells are passed to the plot.
  g <- ggplot(data = to.plot) + aes(x, y) +
    geom_point(size = size, color = "red", alpha = alpha) +
    scale_y_reverse(limits = c(nrow(DF), 1)) +
    xlim(1, ncol(DF)) +
    ggtitle("location of NAs in the data frame")

  pc <- round(sum(missing_cells) / n_cells * 100, 2) # % NA
  print(paste("percentage of NA data: ", pc))

  g
}

It takes any data.frame in input and returns this image:

enter image description here

It's too big a challenge for me to achieve the first image.

Lection answered 18/12, 2014 at 11:23 Comment(2)

the rdataviewer package might provide a useful starting point – Elyseelysee 18/12, 2014 at 22:57

A related SO post has popped up and might be of interest: Inspecting and visualizing gaps/blanks and structure in large dataframes – Crista 28/3, 2015 at 20:33

Eventually I came up with a script that plots most of the specifications. I am posting it here in case it interests someone, although the syntax is far from "elegant"!

Note that the main function 'colstr' has 3 arguments: an input (data frame, matrix, or even a single vector), a maximum number of rows to plot, and an option to export the plot as a PNG file into the working directory.

the output gives, for instance: enter image description here

# PACKAGES 
require(ggplot2)
require(RColorBrewer)
require(reshape2)

# Report whether an object (data.frame, matrix, vector) holds no data.
# The input is coerced with data.frame(); it counts as empty when the
# resulting frame has zero rows or zero columns.
is.empty <- function(input) {
  shape <- dim(data.frame(input))
  any(shape == 0)
}

# Min/max normalization (R -> [0;1]); all values must be numerical.
#
# data: a vector, or a matrix / data.frame that is rescaled column by column.
# ...:  forwarded to min() and max() (typically na.rm = TRUE).
#
# Returns a vector for vector input, otherwise a matrix with the dimensions
# of as.matrix(data). A constant vector/column cannot be rescaled (zero
# range) and is filled entirely with 0.5.
minmax <- function(data, ...) {
  rescale <- function(x) {
    lo <- min(x, ...)
    hi <- max(x, ...)
    # Exact equality is intended: it is the guard against dividing by zero.
    # isTRUE() also covers the NA comparison obtained without na.rm.
    if (isTRUE(lo == hi)) {
      x[] <- 0.5
      return(x)
    }
    (x - lo) / (hi - lo)
  }

  # Dimensionless input: rescale directly (apply() would fail on it).
  if (is.null(dim(data))) {
    return(rescale(data))
  }

  # Fill a matrix column by column so that the result keeps two dimensions
  # even for a single row, where apply() would collapse to a vector.
  res <- as.matrix(data)
  for (j in seq_len(ncol(res))) {
    res[, j] <- rescale(res[, j])
  }
  res
}

# MAIN function: draw the structure of a table as a colour-coded tile plot.
#
# input:    a data.frame, a matrix or a single vector.
# size.max: maximum number of rows drawn; longer tables are truncated.
# export:   if TRUE, the plot is also written to "colstr_output.png" in the
#           working directory.
#
# Cell codes (also the legend order): 1-9 = class of the min/max-normalized
# numeric value, 10 = NA, 11 = factor, 12 = character, 13 = all-zero column.
# NOTE(review): columns of any other type (logical, Date, ...) have no code
# of their own and are simply passed through as.numeric().
#
# Returns the ggplot object.
colstr <- function(input, size.max = 500, export = FALSE) {
  # Captured up front so the title describes the object the caller passed.
  input_label <- paste(deparse(substitute(input)), collapse = " ")
  input_dims <- c(NROW(input), NCOL(input))

  data <- as.data.frame(input)
  if (is.empty(data)) {
    stop("input has no rows or no columns to plot", call. = FALSE)
  }
  if (NCOL(data) == 1) {
    data <- cbind(data, data)
    message("warning: input data is a vector")
  }

  # Column indices per kind; vapply() keeps the result logical for any input.
  cols_where <- function(pred) unname(which(vapply(data, pred, logical(1))))
  num_cols <- cols_where(is.numeric)
  chr_cols <- cols_where(is.character)
  fct_cols <- cols_where(is.factor)
  all_na_cols <- cols_where(function(x) all(is.na(x)))
  zero_cols <- cols_where(function(x) is.numeric(x) && isTRUE(all(x == 0)))

  # miror data.frame will contain a colour code for every cell
  miror <- data
  miror[chr_cols] <- 12
  miror[fct_cols] <- 11

  # min/max normalization, then coerce into 9 classes. Columns made only of
  # NA have no range to cut; they are left alone and coded as NA below.
  scalable <- setdiff(num_cols, all_na_cols)
  if (!is.empty(scalable)) {
    miror[scalable] <- minmax(miror[scalable], na.rm = TRUE)
    miror[scalable] <- lapply(
      miror[scalable],
      function(x) as.numeric(cut(x, breaks = 9, labels = 1:9))
    )
  }
  miror <- data.frame(lapply(miror, as.numeric))
  miror[is.na(data)] <- 10
  miror[zero_cols] <- 13

  # colour palette and legend labels, in code order 1..13
  mypalette <- c(brewer.pal(n = 9, name = "Blues"),
                 "red", "green", "purple", "grey")
  legend_labels <- c(paste0((1:9) * 10, "%"),
                     "NA", "factor (lvls)", "character", "zero")

  # subset if too large
  truncated <- nrow(miror) > size.max
  if (truncated) {
    miror <- head(miror, size.max)
  }
  n_shown <- nrow(miror)

  # Points at x = 0 flag tables whose row names are not simply 1..n; they
  # are fully transparent otherwise but still reserve the x = 0 position.
  has_row_names <- !all(row.names(data) == seq_len(nrow(data)))

  # Drop dimnames so melt() yields integer row/column indices.
  cells <- as.matrix(miror)
  dimnames(cells) <- NULL

  plot_title <- paste("graphical structure of", input_label,
                      paste(input_dims, collapse = "X"),
                      if (truncated) "(truncated)" else "")

  g <- ggplot(data = melt(cells)) +
    geom_tile(aes(x = Var2, y = Var1, fill = factor(value, levels = 1:13))) +
    scale_fill_manual("legend", values = mypalette, labels = legend_labels,
                      drop = FALSE) +
    ggtitle(plot_title) +
    xlab("columns of the dataframe") + ylab("rows of the dataframe") +
    geom_point(data = data.frame(x = 0, y = seq_len(n_shown)), aes(x, y),
               alpha = as.numeric(has_row_names)) +
    scale_y_reverse(limits = c(n_shown, 0))

  if (!is.empty(fct_cols)) {
    # Number of levels printed at a random height inside each factor column;
    # the bounds are clamped so tables with fewer than 4 rows do not hand
    # runif() a min above its max.
    y_lo <- min(2, n_shown)
    y_hi <- max(y_lo, n_shown - 2)
    level_counts <- vapply(data[fct_cols], nlevels, integer(1))
    lvl_labels <- data.frame(
      x     = fct_cols,
      y     = round(runif(length(fct_cols), y_lo, y_hi)),
      label = paste0("(", level_counts, ")")
    )
    g <- g + geom_text(data = lvl_labels, aes(x = x, y = y, label = label))
  }

  if (export) {
    png("colstr_output.png")
    # Close the device even when drawing fails.
    tryCatch(print(g), finally = dev.off())
  }
  g
}

Lection answered 2/3, 2015 at 14:37 Comment(0)

I know there is a package that shows missing values easily, but my google-fu is not very good at the moment. I did find, however, a function called tableplot, which will give you a grand overview of your data frame. I don't know whether or not it will show you missing data.

Here's the link:

http://www.ancienteco.com/2012/05/quickly-visualize-your-whole-dataset.html

Oleneolenka answered 18/12, 2014 at 14:45 Comment(2)

just tested it, really nice work here; yet it might be a little more "downstream" in the exploratory analysis process, since it sorts the data and changes the structure to display histograms already. +1 – Lection 18/12, 2014 at 22:39

library(DescTools) now has a function for this: PlotMiss – Marissamarist 28/10, 2015 at 8:57

You can try out visdat package(https://github.com/ropensci/visdat), which shows the NA values and data types in the plot

# Install visdat only when it is not available yet (avoids reinstalling on
# every run), then plot the NA values and data types of the airquality data.
if (!requireNamespace("visdat", quietly = TRUE)) {
  install.packages("visdat")
}
library(visdat)
vis_dat(airquality)

Thymol answered 30/8, 2017 at 4:21 Comment(1)

thanks for the answer, nice package and the output is quite neat, is it recent? – Lection 31/8, 2017 at 7:50

Have you encountered the CSV fingerprint service? It creates a similar image, although not with all the details you have outlined above, and it's not based on R. There is an R version of a similar idea at R-ohjelmointi.org, but the text is in Finnish. The main function is csvSormenjalki(). Maybe that could be adapted further to fulfill your whole vision?

Isadora answered 18/12, 2014 at 11:39 Comment(1)

indeed, it's a nice tool that i didn't know of, thanks. I guess the challenge is to adapt it to R, as you say – Lection 18/12, 2014 at 11:55

Eventually I came up with a script that plots most of the specifications. I am posting it here in case it interests someone, although the syntax is far from "elegant"!

Note that the main function 'colstr' has 3 arguments: an input (data frame, matrix, or even a single vector), a maximum number of rows to plot, and an option to export the plot as a PNG file into the working directory.

the output gives, for instance: enter image description here

# PACKAGES 
require(ggplot2)
require(RColorBrewer)
require(reshape2)

# Report whether an object (data.frame, matrix, vector) holds no data.
# The input is coerced with data.frame(); it counts as empty when the
# resulting frame has zero rows or zero columns.
is.empty <- function(input) {
  shape <- dim(data.frame(input))
  any(shape == 0)
}

# Min/max normalization (R -> [0;1]); all values must be numerical.
#
# data: a vector, or a matrix / data.frame that is rescaled column by column.
# ...:  forwarded to min() and max() (typically na.rm = TRUE).
#
# Returns a vector for vector input, otherwise a matrix with the dimensions
# of as.matrix(data). A constant vector/column cannot be rescaled (zero
# range) and is filled entirely with 0.5.
minmax <- function(data, ...) {
  rescale <- function(x) {
    lo <- min(x, ...)
    hi <- max(x, ...)
    # Exact equality is intended: it is the guard against dividing by zero.
    # isTRUE() also covers the NA comparison obtained without na.rm.
    if (isTRUE(lo == hi)) {
      x[] <- 0.5
      return(x)
    }
    (x - lo) / (hi - lo)
  }

  # Dimensionless input: rescale directly (apply() would fail on it).
  if (is.null(dim(data))) {
    return(rescale(data))
  }

  # Fill a matrix column by column so that the result keeps two dimensions
  # even for a single row, where apply() would collapse to a vector.
  res <- as.matrix(data)
  for (j in seq_len(ncol(res))) {
    res[, j] <- rescale(res[, j])
  }
  res
}

# MAIN function: draw the structure of a table as a colour-coded tile plot.
#
# input:    a data.frame, a matrix or a single vector.
# size.max: maximum number of rows drawn; longer tables are truncated.
# export:   if TRUE, the plot is also written to "colstr_output.png" in the
#           working directory.
#
# Cell codes (also the legend order): 1-9 = class of the min/max-normalized
# numeric value, 10 = NA, 11 = factor, 12 = character, 13 = all-zero column.
# NOTE(review): columns of any other type (logical, Date, ...) have no code
# of their own and are simply passed through as.numeric().
#
# Returns the ggplot object.
colstr <- function(input, size.max = 500, export = FALSE) {
  # Captured up front so the title describes the object the caller passed.
  input_label <- paste(deparse(substitute(input)), collapse = " ")
  input_dims <- c(NROW(input), NCOL(input))

  data <- as.data.frame(input)
  if (is.empty(data)) {
    stop("input has no rows or no columns to plot", call. = FALSE)
  }
  if (NCOL(data) == 1) {
    data <- cbind(data, data)
    message("warning: input data is a vector")
  }

  # Column indices per kind; vapply() keeps the result logical for any input.
  cols_where <- function(pred) unname(which(vapply(data, pred, logical(1))))
  num_cols <- cols_where(is.numeric)
  chr_cols <- cols_where(is.character)
  fct_cols <- cols_where(is.factor)
  all_na_cols <- cols_where(function(x) all(is.na(x)))
  zero_cols <- cols_where(function(x) is.numeric(x) && isTRUE(all(x == 0)))

  # miror data.frame will contain a colour code for every cell
  miror <- data
  miror[chr_cols] <- 12
  miror[fct_cols] <- 11

  # min/max normalization, then coerce into 9 classes. Columns made only of
  # NA have no range to cut; they are left alone and coded as NA below.
  scalable <- setdiff(num_cols, all_na_cols)
  if (!is.empty(scalable)) {
    miror[scalable] <- minmax(miror[scalable], na.rm = TRUE)
    miror[scalable] <- lapply(
      miror[scalable],
      function(x) as.numeric(cut(x, breaks = 9, labels = 1:9))
    )
  }
  miror <- data.frame(lapply(miror, as.numeric))
  miror[is.na(data)] <- 10
  miror[zero_cols] <- 13

  # colour palette and legend labels, in code order 1..13
  mypalette <- c(brewer.pal(n = 9, name = "Blues"),
                 "red", "green", "purple", "grey")
  legend_labels <- c(paste0((1:9) * 10, "%"),
                     "NA", "factor (lvls)", "character", "zero")

  # subset if too large
  truncated <- nrow(miror) > size.max
  if (truncated) {
    miror <- head(miror, size.max)
  }
  n_shown <- nrow(miror)

  # Points at x = 0 flag tables whose row names are not simply 1..n; they
  # are fully transparent otherwise but still reserve the x = 0 position.
  has_row_names <- !all(row.names(data) == seq_len(nrow(data)))

  # Drop dimnames so melt() yields integer row/column indices.
  cells <- as.matrix(miror)
  dimnames(cells) <- NULL

  plot_title <- paste("graphical structure of", input_label,
                      paste(input_dims, collapse = "X"),
                      if (truncated) "(truncated)" else "")

  g <- ggplot(data = melt(cells)) +
    geom_tile(aes(x = Var2, y = Var1, fill = factor(value, levels = 1:13))) +
    scale_fill_manual("legend", values = mypalette, labels = legend_labels,
                      drop = FALSE) +
    ggtitle(plot_title) +
    xlab("columns of the dataframe") + ylab("rows of the dataframe") +
    geom_point(data = data.frame(x = 0, y = seq_len(n_shown)), aes(x, y),
               alpha = as.numeric(has_row_names)) +
    scale_y_reverse(limits = c(n_shown, 0))

  if (!is.empty(fct_cols)) {
    # Number of levels printed at a random height inside each factor column;
    # the bounds are clamped so tables with fewer than 4 rows do not hand
    # runif() a min above its max.
    y_lo <- min(2, n_shown)
    y_hi <- max(y_lo, n_shown - 2)
    level_counts <- vapply(data[fct_cols], nlevels, integer(1))
    lvl_labels <- data.frame(
      x     = fct_cols,
      y     = round(runif(length(fct_cols), y_lo, y_hi)),
      label = paste0("(", level_counts, ")")
    )
    g <- g + geom_text(data = lvl_labels, aes(x = x, y = y, label = label))
  }

  if (export) {
    png("colstr_output.png")
    # Close the device even when drawing fails.
    tryCatch(print(g), finally = dev.off())
  }
  g
}

Lection answered 2/3, 2015 at 14:37 Comment(0)

Recommended topics

Hot tags