Plotting confidence intervals with NA values
Asked Answered
V

1

17

I would like to plot confidence intervals to a data with NAs, using Gviz package. I modified manual example to expose my problem. First as the manual expose:

library(Gviz)

## Loading GRanges object
data(twoGroups)

## Plot data without NAs
dTrack <- DataTrack(twoGroups, name = "uniform")
tiff("Gviz_original.tiff", units="in", width=11, height=8.5, res=200, compress="lzw")
plotTracks(dTrack, groups = rep(c("control", "treated"),
                                each = 3), type = c("a", "p", "confint"))
graphics.off()

enter image description here Now, using data with NA values and na.rm=TRUE statment:

  ## Transforming in data frame
  df <- as.data.frame(twoGroups)

## Input NAs to look like my real data
df[ df <= 0 ] = NA
df <- df[,-4]
df <- df[,-4]
names(df) <- c("chr", "start", "end", "control", "control.1", "control.2", "treated", "treated.1", "treated.2")

## Plot with NA
library(GenomicRanges)
df <- makeGRangesFromDataFrame(df, TRUE)
dftrack <- DataTrack(df, name = "uniform")
tiff("Gviz_NA.tiff", units="in", width=11, height=8.5, res=200, compress="lzw")
plotTracks(dftrack, groups = rep(c("control", "treated"),
                                 each = 3), type = c("a", "p", "confint"), na.rm=TRUE)
graphics.off()

enter image description here

Please note that I included the na.rm=TRUE statment in plotTracks function, which allowed the computation of the line following the mean. However, the shaded area which represents the confidence interval, can´t be estimated where I have NA values, even with the na.rm=TRUE.
Any ideas to deal with this issue? Thank you!

UPDATE to @rbatt:

> dput(twoGroups)
new("GRanges"
    , seqnames = new("Rle"
    , values = structure(1L, .Label = "chrX", class = "factor")
    , lengths = 25L
    , elementMetadata = NULL
    , metadata = list()
)
    , ranges = new("IRanges"
    , start = c(1L, 42L, 84L, 125L, 167L, 209L, 250L, 292L, 334L, 375L, 417L, 
458L, 500L, 542L, 583L, 625L, 667L, 708L, 750L, 791L, 833L, 875L, 
916L, 958L, 1000L)
    , width = c(30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 
30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L
)
    , NAMES = NULL
    , elementType = "integer"
    , elementMetadata = NULL
    , metadata = list()
)
    , strand = new("Rle"
    , values = structure(3L, .Label = c("+", "-", "*"), class = "factor")
    , lengths = 25L
    , elementMetadata = NULL
    , metadata = list()
)
    , elementMetadata = new("DataFrame"
    , rownames = NULL
    , nrows = 25L
    , listData = structure(list(control = c(-8.96125989500433, -4.2114706709981, 
2.28711236733943, 9.20983788557351, 0.406841854564846, 5.90989288408309, 
5.20958516281098, 2.78549935668707, -8.57040509115905, -8.43395926523954, 
-8.77848833333701, -2.30348631739616, 0.988166537135839, -0.557612692937255, 
-7.67730884253979, -5.16523499507457, -3.01896842662245, -3.11802179086953, 
-7.91133752092719, 3.95565569866449, 2.71242363378406, 0.727043347433209, 
7.3868807638064, -5.54162500426173, -1.13912807777524), control.1 = c(-7.65790161676705, 
4.6882571419701, 8.01326935179532, -6.23242623638362, -7.05442394595593, 
-5.10347711388022, -9.60906079504639, -4.69888434745371, -5.72342518251389, 
5.06623945198953, -2.53558184020221, 5.75232566334307, -7.08328293636441, 
-5.78988547902554, 1.57217930071056, -6.07197678647935, -7.39777445793152, 
5.28266688808799, -0.175534035079181, 5.19415136426687, 7.53853759262711, 
-0.950022372417152, 4.8170017497614, -2.23117967601866, 2.86112546455115
), control.2 = c(9.87956526689231, -1.0533055011183, -7.1219984581694, 
8.59682233538479, -0.551973707042634, 1.56467542983592, -0.415736702270806, 
1.69801083859056, 3.67223800625652, -1.30616669543087, -5.99444826599211, 
-0.745276440866292, -4.42522280383855, -9.33690558653325, 3.56628117151558, 
8.04066675715148, 5.54990579374135, 7.0927129406482, -2.37754446454346, 
-5.13221249915659, 6.56280730385333, -7.63786241877824, 3.64003846421838, 
-4.65625441167504, 8.1775445304811), treated = c(-5.84375557024032, 
1.03083667811006, -4.46718293242157, -6.32041404955089, 9.36362744309008, 
-0.488725560717285, -9.12991860881448, 6.98352626990527, 3.66103118285537, 
6.59625696251169, 26.3747013662942, 4.21735171694309, 23.1465750234202, 
5.14831536915153, 16.2545943120494, -2.77631865814328, 8.87154446449131, 
4.34142326004803, 0.0693343719467521, -5.7483538496308, -3.42396105173975, 
-28.9633466186933, -7.59088161867112, 7.04729768447578, -5.34924863371998
), treated.1 = c(9.71352839842439, -6.77430204115808, -4.05887754634023, 
-1.56806231010705, -4.88056596834213, 6.99816173873842, 4.07760242931545, 
-9.04069183394313, 23.9087636698969, 20.8488084585406, 24.4913479057141, 
9.37918818555772, 21.6068591410294, 0.408056953456253, 20.2703413087875, 
-3.44990291167051, -9.94784070644528, 5.36248424556106, 5.6652726046741, 
-20.9520940342918, -25.0159116648138, -15.0660670618527, 5.14691891148686, 
-7.55597376730293, 0.874496018514037), treated.2 = c(9.99328563921154, 
0.593712376430631, 8.05319488979876, 3.5114610241726, 1.55288028530777, 
-2.03484911937267, 3.07067603804171, -2.71020049229264, 21.1088214861229, 
11.0598625196144, 10.9187916945666, 7.2046619025059, 29.7064534015954, 
1.79014495806769, 7.76732922066003, 8.54645798448473, 5.30277661513537, 
-4.55057015176862, 8.73211439698935, -20.1880806474946, -14.8638874059543, 
-26.3618095312268, -5.80431585200131, -8.46893921960145, -6.32030902896076
)), .Names = c("control", "control.1", "control.2", "treated", 
"treated.1", "treated.2"))
    , elementType = "ANY"
    , elementMetadata = NULL
    , metadata = list()
)
    , seqinfo = new("Seqinfo"
    , seqnames = "chrX"
    , seqlengths = NA_integer_
    , is_circular = NA
    , genome = "hg19"
)
    , metadata = list()
)
Verrucose answered 2/8, 2015 at 14:30 Comment(10)
Pre-process the data to remove the NA's.Garett
I am not sure how to remove NA´s in 'df' object producing a suitable matrix to my target plot. Have you some code idea?Verrucose
Try data[complete.cases(data),]Osteomyelitis
@rbatt, I lost all rows: ## Transforming in data frame df <- as.data.frame(twoGroups) ## Input NAs to look like my real data df[ df <= 0 ] = NA df <- df[,-4] df <- df[,-4] names(df) <- c("chr", "start", "end", "control", "control.1", "control.2", "treated", "treated.1", "treated.2") ## Plot with NA df <- df[complete.cases(df),] > nrow(df) [1] 0Verrucose
Any chance you could share some of the output of dput(twoGroups)? I'm not terribly familiar with the package you're using, and may have to experiment.Osteomyelitis
Installing Gviz package you can load the data just by data(twoGroups). However, I included the dput(twoGroups) result in question main text. Please check there!Verrucose
What's the desired behaviour—are you looking to remove the data points that don't have corresponding intervals entirely? Having the points plotted with no interval in those domains seems acceptable to me, but I'm not sure what you're looking for.Trudge
@rensa, each row in 'df' have an coordinate with 6 values (3 control and 3 treated). When one of this values is NA, the shaded area is not ploted because confidence interval (CI) is not computated. I would like to allow the computation of the CI even when I get NA values, that´s my problem.Verrucose
Ah, I see—and I presume Gviz is calculating the CIs based on those values, rather than you precalculating the CIs and feeding them in?Trudge
Yes, the line for example represents the mean, but it just is ploted if 'rm.na=TRUE'. I am wondering if there exist a similiar statment for CI calculation.Verrucose
M
1

You could just drop the NAs from the dataframe before you go to plot or impute the values if you're willing to modify the data's structure. You might have to remove the NA's column by column.

Like this:

First, make a pretty dataframe:

df<- data.frame(userid=seq(1,100,1), numVarA=rnorm(100, mean=0, sd=1), numVarB=rnorm(100, mean=2, sd=1),  wholeNumVar=seq(from=1, to=300, by=3), Sex=rep(c("Male", "Female"), 50), Age=floor(runif(100, min=30, max=55)))

Next, punch some holes in it.

df$numVarA[c(1, 10, 15, 20, 25, 27, 29, 44, 69, 96, 45)]<- NA
df$numVarB[c(12, 80, 17, 19, 77, 71, 74, 76)]<- NA

Third, drop the NA's

df<- df[!is.na(df$numVarA), ]
df<- df[!is.na(df$numVarB), ]

Then try to plot everything again. Hopefully this helps. Best, NF

Megaphone answered 14/8, 2015 at 20:12 Comment(0)

© 2022 - 2024 — McMap. All rights reserved.