PCA Scaling with ggbiplot
Asked Answered
S

1

8

I'm trying to plot a principal component analysis using prcomp and ggbiplot. I'm getting data values outside of the unit circle, and haven't been able to rescale the data prior to calling prcomp in such a way that I can constrain the data to the unit circle.

data(wine)
require(ggbiplot)
wine.pca=prcomp(wine[,1:3],scale.=TRUE)
ggbiplot(wine.pca,obs.scale = 1, 
         var.scale=1,groups=wine.class,ellipse=TRUE,circle=TRUE)

I tried scaling by subtracting mean and dividing by standard deviation before calling prcomp:

wine2=wine[,1:3]
mean=apply(wine2,2,mean)
sd=apply(wine2,2,mean)
for(i in 1:ncol(wine2)){
  wine2[,i]=(wine2[,i]-mean[i])/sd[i]
}
wine2.pca=prcomp(wine2,scale.=TRUE)
ggbiplot(wine2.pca,obs.scale=1, 
         var.scale=1,groups=wine.class,ellipse=TRUE,circle=TRUE)

ggbiplot package installed as follows:

require(devtools)
install_github('ggbiplot','vqv')

Output of either code chunk:

enter image description here

Per @Brian Hanson's comment below, I'm adding an additional image reflecting the output I'm trying to get.

enter image description here

Sibylle answered 4/8, 2013 at 2:57 Comment(6)
Well, the reason the two approaches give the same answer is that prcomp centers by default (see ?prcomp). So your 2nd try is just extra work that you didn't need to do. Now, on the other issue: why do you think the values should be inside a unit circle? And what values are you talking about - the scores? Can you give a citation explaining why they should be in a unit circle?Blizzard
I'll edit the post and add a comparable published plot for my expected result. You're correct on the second chunk not doing anything. I'm satisfied with the variables (arrows) being scaled correctly, it is the observations that I'm trying to constrain inside a unit circle. I'm at a bit of a loss of why the observations could be outside of a unit circle if the data is scaled in addition to being centered.Sibylle
When you scale the variables, they are individually scaled to have variance 1 and mean 0. I don't think there is any theoretical reason why the scores that result should be inside a unit circle. I just looked around at many examples in the help pages with scaling and none of them produce scores that would fall inside a unit circle. To get a fuller understanding of what pca does, I suggest getting away from biplots: they break one of the key principles of good graphics - don't have two scales on the same plot.Blizzard
This page talks about what that correlation circle tells you (nothing about the scores, it's about the loadings): xlstat.com/en/learning-center/tutorials/… Notice that the loadings and the circle are both the same color by the way.Blizzard
Does the first graph mean Alcohol, Ash and Malic Acid contributed the greatest to differentiating those wines to three categories?Heredity
No, those are the only variables used in this example. Look at where the wine.pca variable is defined. The biplot will show the contributions of each variable to the first and second principal components. Interpret it more as what the principal components represent than much the variables separate data. That question is better represented in a loadings plot.Sibylle
S
11

I edited the code for the plot function and was able to get the functionality I wanted.

ggbiplot2=function(pcobj, choices = 1:2, scale = 1, pc.biplot = TRUE, 
         obs.scale = 1 - scale, var.scale = scale, 
         groups = NULL, ellipse = FALSE, ellipse.prob = 0.68, 
         labels = NULL, labels.size = 3, alpha = 1, 
         var.axes = TRUE, 
         circle = FALSE, circle.prob = 0.69, 
         varname.size = 3, varname.adjust = 1.5, 
         varname.abbrev = FALSE, ...)
{
  library(ggplot2)
  library(plyr)
  library(scales)
  library(grid)

  stopifnot(length(choices) == 2)

  # Recover the SVD
  if(inherits(pcobj, 'prcomp')){
    nobs.factor <- sqrt(nrow(pcobj$x) - 1)
    d <- pcobj$sdev
    u <- sweep(pcobj$x, 2, 1 / (d * nobs.factor), FUN = '*')
    v <- pcobj$rotation
  } else if(inherits(pcobj, 'princomp')) {
    nobs.factor <- sqrt(pcobj$n.obs)
    d <- pcobj$sdev
    u <- sweep(pcobj$scores, 2, 1 / (d * nobs.factor), FUN = '*')
    v <- pcobj$loadings
  } else if(inherits(pcobj, 'PCA')) {
    nobs.factor <- sqrt(nrow(pcobj$call$X))
    d <- unlist(sqrt(pcobj$eig)[1])
    u <- sweep(pcobj$ind$coord, 2, 1 / (d * nobs.factor), FUN = '*')
    v <- sweep(pcobj$var$coord,2,sqrt(pcobj$eig[1:ncol(pcobj$var$coord),1]),FUN="/")
  } else {
    stop('Expected a object of class prcomp, princomp or PCA')
  }

  # Scores
  df.u <- as.data.frame(sweep(u[,choices], 2, d[choices]^obs.scale, FUN='*'))

  # Directions
  v <- sweep(v, 2, d^var.scale, FUN='*')
  df.v <- as.data.frame(v[, choices])

  names(df.u) <- c('xvar', 'yvar')
  names(df.v) <- names(df.u)

  if(pc.biplot) {
    df.u <- df.u * nobs.factor
  }

  # Scale the radius of the correlation circle so that it corresponds to 
  # a data ellipse for the standardized PC scores
  r <- 1

  # Scale directions
  v.scale <- rowSums(v^2)
  df.v <- df.v / sqrt(max(v.scale))

  ## Scale Scores
  r.scale=sqrt(max(df.u[,1]^2+df.u[,2]^2))
  df.u=.99*df.u/r.scale

  # Change the labels for the axes
  if(obs.scale == 0) {
    u.axis.labs <- paste('standardized PC', choices, sep='')
  } else {
    u.axis.labs <- paste('PC', choices, sep='')
  }

  # Append the proportion of explained variance to the axis labels
  u.axis.labs <- paste(u.axis.labs, 
                       sprintf('(%0.1f%% explained var.)', 
                               100 * pcobj$sdev[choices]^2/sum(pcobj$sdev^2)))

  # Score Labels
  if(!is.null(labels)) {
    df.u$labels <- labels
  }

  # Grouping variable
  if(!is.null(groups)) {
    df.u$groups <- groups
  }

  # Variable Names
  if(varname.abbrev) {
    df.v$varname <- abbreviate(rownames(v))
  } else {
    df.v$varname <- rownames(v)
  }

  # Variables for text label placement
  df.v$angle <- with(df.v, (180/pi) * atan(yvar / xvar))
  df.v$hjust = with(df.v, (1 - varname.adjust * sign(xvar)) / 2)

  # Base plot
  g <- ggplot(data = df.u, aes(x = xvar, y = yvar)) + 
    xlab(u.axis.labs[1]) + ylab(u.axis.labs[2]) + coord_equal()

  if(var.axes) {
    # Draw circle
    if(circle) 
    {
      theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50))
      circle <- data.frame(xvar = r * cos(theta), yvar = r * sin(theta))
      g <- g + geom_path(data = circle, color = muted('white'), 
                         size = 1/2, alpha = 1/3)
    }

    # Draw directions
    g <- g +
      geom_segment(data = df.v,
                   aes(x = 0, y = 0, xend = xvar, yend = yvar),
                   arrow = arrow(length = unit(1/2, 'picas')), 
                   color = muted('red'))
  }

  # Draw either labels or points
  if(!is.null(df.u$labels)) {
    if(!is.null(df.u$groups)) {
      g <- g + geom_text(aes(label = labels, color = groups), 
                         size = labels.size)
    } else {
      g <- g + geom_text(aes(label = labels), size = labels.size)      
    }
  } else {
    if(!is.null(df.u$groups)) {
      g <- g + geom_point(aes(color = groups), alpha = alpha)
    } else {
      g <- g + geom_point(alpha = alpha)      
    }
  }

  # Overlay a concentration ellipse if there are groups
  if(!is.null(df.u$groups) && ellipse) {
    theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50))
    circle <- cbind(cos(theta), sin(theta))

    ell <- ddply(df.u, 'groups', function(x) {
      if(nrow(x) < 2) {
        return(NULL)
      } else if(nrow(x) == 2) {
        sigma <- var(cbind(x$xvar, x$yvar))
      } else {
        sigma <- diag(c(var(x$xvar), var(x$yvar)))
      }
      mu <- c(mean(x$xvar), mean(x$yvar))
      ed <- sqrt(qchisq(ellipse.prob, df = 2))
      data.frame(sweep(circle %*% chol(sigma) * ed, 2, mu, FUN = '+'), 
                 groups = x$groups[1])
    })
    names(ell)[1:2] <- c('xvar', 'yvar')
    g <- g + geom_path(data = ell, aes(color = groups, group = groups))
  }

  # Label the variable axes
  if(var.axes) {
    g <- g + 
      geom_text(data = df.v, 
                aes(label = varname, x = xvar, y = yvar, 
                    angle = angle, hjust = hjust), 
                color = 'darkred', size = varname.size)
  }
  # Change the name of the legend for groups
  # if(!is.null(groups)) {
  #   g <- g + scale_color_brewer(name = deparse(substitute(groups)), 
  #                               palette = 'Dark2')
  # }

  # TODO: Add a second set of axes

  return(g)
}

Sibylle answered 16/8, 2013 at 13:53 Comment(1)
+1, this fx is much improved now, all points inside, but my question, can this function accept principal object of psych package? I end up with different figures using stats::prcomp().Pentose

© 2022 - 2024 — McMap. All rights reserved.