ggplot with missing values - r

I have a dataset like this:
data <- data.frame(Time = as.Date(c("2007-01-31", "2007-02-28", "2007-03-31", "2007-04-30",
                                    "2007-05-31", "2007-06-30", "2007-07-31", "2007-08-31",
                                    "2007-09-30", "2007-10-31", "2007-11-30", "2007-12-31"),
                                  format = "%Y-%m-%d"),
                   a = c(NA, NA, 1.201, NA, NA, 1.206, NA, NA, 1.212, NA, NA, 1.237),
                   b = c(1.187, 1.201, 1.206, 1.219, 1.211, 1.187, 1.202, 1.213, 1.209,
                         1.208, 1.219, 1.237),
                   c = c(1.198, 1.201, 1.203, 1.206, 1.207, 1.207, 1.208, 1.21, 1.214,
                         1.22, 1.228, 1.236))
> data
Time a b c
1 2007-01-31 NA 1.187 1.198
2 2007-02-28 NA 1.201 1.201
3 2007-03-31 1.201 1.206 1.203
4 2007-04-30 NA 1.219 1.206
5 2007-05-31 NA 1.211 1.207
6 2007-06-30 1.206 1.187 1.207
7 2007-07-31 NA 1.202 1.208
8 2007-08-31 NA 1.213 1.210
9 2007-09-30 1.207 1.209 1.214
10 2007-10-31 NA 1.208 1.220
11 2007-11-30 NA 1.219 1.228
12 2007-12-31 1.219 1.237 1.236
I want a ggplot with points for a and lines for b & c. I came up with
ggplot(data, aes(x = Time)) +
  geom_point(aes(y = a, colour = "a"), size = 3) +
  geom_line(aes(y = b, colour = "b")) +
  geom_line(aes(y = c, colour = "c"))
The problem is that I have many more variables like b & c that I want to plot. Is there a more efficient way to do this?

One way of doing it is with the tidyr package (and some dplyr):
library(dplyr)
library(tidyr)
data2 <- gather(data, grp, value, a:c)
ggplot(data2, aes(x = Time, y = value, color = grp, group = grp)) +
  geom_point(data = filter(data2, grp == 'a'), size = 3) +
  geom_line()
The plot
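As an aside, gather() has since been superseded in tidyr; the same reshaping can be written with pivot_longer(), and the ggplot call stays the same:
data2 <- pivot_longer(data, a:c, names_to = "grp", values_to = "value")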


Convert cox regression table to forest plot

I want to convert a Cox table to a forest plot, as shown below. Unfortunately I've lost my original data (the coxph object), so I have to use the data from the table. The data below are just examples:
Desired output:
Reprex for the two tables:
GRP1<-tibble::tribble(
~Variable, ~Level, ~Number, ~`HR.(univariable)`, ~`HR.(multivariable)`,
"Sex", "Female", "2204 (100.0)", NA, NA,
NA, "Male", "2318 (100.0)", "1.13 (0.91-1.40, p=0.265)", "1.13 (0.91-1.40, p=0.276)",
"Score", "1", "2401 (100.0)", NA, NA,
NA, "1-2", "1637 (100.0)", "1.49 (1.19-1.87, p=0.001)", "1.15 (0.90-1.47, p=0.250)",
NA, "3-4", "412 (100.0)", "1.71 (1.14-2.56, p=0.010)", "1.09 (0.71-1.67, p=0.710)",
NA, ">=5", "42 (100.0)", "1.67 (0.53-5.21, p=0.381)", "0.96 (0.30-3.05, p=0.943)",
"Treatment", "A", "1572 (100.0)", NA, NA,
NA, "B", "2951 (100.0)", "1.74 (1.26-2.40, p=0.001)", "1.53 (1.09-2.13, p=0.013)"
)
GRP2<-tibble::tribble(
~Variable, ~Level, ~Number, ~`HR.(univariable)`, ~`HR.(multivariable)`,
"Sex", "Female", "2204 (100.0)", NA, NA,
NA, "Male", "2318 (100.0)", "1.70 (1.36-2.13, p<0.001)", "1.62 (1.28-2.04, p<0.001)",
"Score", "1", "2401 (100.0)", NA, NA,
NA, "1-2", "1637 (100.0)", "2.76 (1.21-6.29, p=0.016)", "2.69 (1.18-6.13, p=0.019)",
NA, "3-4", "412 (100.0)", "5.11 (2.26-11.58, p<0.001)", "4.46 (1.95-10.23, p<0.001)",
NA, ">=5", "42 (100.0)", "5.05 (2.19-11.64, p<0.001)", "4.08 (1.73-9.59, p=0.001)",
"Treatment", "A", "1572 (100.0)", NA, NA,
NA, "B", "2951 (100.0)", "1.48 (1.16-1.88, p=0.001)", "1.23 (0.95-1.59, p=0.114)"
)
Is it doable?
Best regards, H
The difficult thing about this task is not making the plot; it is converting your data from a bunch of text strings into a single long-format data frame that can be used for plotting. This involves using regular expressions to capture the appropriate number for each column, pivoting the result, then repeating that process for the second data frame before binding the two frames together. This is unavoidably ugly and complicated, but that is one of the reasons why having data stored in the correct format is so important.
Anyway, the following code performs the necessary operations:
library(dplyr)
wrangler <- function(data){
  # Record which table (GRP1 or GRP2) was passed in, to label the rows later
  grp <- as.character(match.call()$data)
  data %>%
    tidyr::fill(Variable) %>%
    mutate(Variable = paste(Variable, Level),
           # Pull the count, HR, CI bounds and p value out of the text strings
           Number = as.numeric(gsub("^(\\d+).*$", "\\1", Number)),
           univariable_HR = as.numeric(gsub("^((\\d+|\\.)+).*$", "\\1", `HR.(univariable)`)),
           univariable_lower = as.numeric(gsub("^.+? \\((.+?)-.*$", "\\1", `HR.(univariable)`)),
           univariable_upper = as.numeric(gsub("^.+?-(.+?),.*$", "\\1", `HR.(univariable)`)),
           univariable_p = gsub("^.+?p=*(.+?)\\).*$", "\\1", `HR.(univariable)`),
           multivariable_HR = as.numeric(gsub("^((\\d+|\\.)+).*$", "\\1", `HR.(multivariable)`)),
           multivariable_lower = as.numeric(gsub("^.+? \\((.+?)-.*$", "\\1", `HR.(multivariable)`)),
           multivariable_upper = as.numeric(gsub("^.+?-(.+?),.*$", "\\1", `HR.(multivariable)`)),
           multivariable_p = gsub("^.+?p=*(.+?)\\).*$", "\\1", `HR.(multivariable)`),
           group = grp) %>%
    filter(!is.na(univariable_HR)) %>%
    select(-Level, -`HR.(multivariable)`, -`HR.(univariable)`) %>%
    tidyr::pivot_longer(cols = -(c(1:2, 11)),
                        names_sep = "_",
                        names_to = c("type", ".value"))
}
df <- rbind(wrangler(GRP1), wrangler(GRP2))
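To see what each regular expression extracts, it helps to apply them to a single example cell:
x <- "1.13 (0.91-1.40, p=0.265)"
gsub("^((\\d+|\\.)+).*$", "\\1", x)   # "1.13"  - the hazard ratio
gsub("^.+? \\((.+?)-.*$", "\\1", x)   # "0.91"  - the lower CI bound
gsub("^.+?-(.+?),.*$", "\\1", x)      # "1.40"  - the upper CI bound
gsub("^.+?p=*(.+?)\\).*$", "\\1", x)  # "0.265" - the p value ("p<0.001" also works, as the = is optional)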
This now gives us the data in the correct format for plotting. Each row will become a single pointrange in our plot, so it needs a hazard ratio, a lower confidence bound, an upper confidence bound, a variable label, the type (multivariable versus univariable), and the group it originally came from (GRP1 or GRP2):
df
#> # A tibble: 20 x 8
#> Variable Number group type HR lower upper p
#> <chr> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr>
#> 1 Sex Male 2318 GRP1 univariable 1.13 0.91 1.4 0.265
#> 2 Sex Male 2318 GRP1 multivariable 1.13 0.91 1.4 0.276
#> 3 Score 1-2 1637 GRP1 univariable 1.49 1.19 1.87 0.001
#> 4 Score 1-2 1637 GRP1 multivariable 1.15 0.9 1.47 0.250
#> 5 Score 3-4 412 GRP1 univariable 1.71 1.14 2.56 0.010
#> 6 Score 3-4 412 GRP1 multivariable 1.09 0.71 1.67 0.710
#> 7 Score >=5 42 GRP1 univariable 1.67 0.53 5.21 0.381
#> 8 Score >=5 42 GRP1 multivariable 0.96 0.3 3.05 0.943
#> 9 Treatment B 2951 GRP1 univariable 1.74 1.26 2.4 0.001
#> 10 Treatment B 2951 GRP1 multivariable 1.53 1.09 2.13 0.013
#> 11 Sex Male 2318 GRP2 univariable 1.7 1.36 2.13 <0.001
#> 12 Sex Male 2318 GRP2 multivariable 1.62 1.28 2.04 <0.001
#> 13 Score 1-2 1637 GRP2 univariable 2.76 1.21 6.29 0.016
#> 14 Score 1-2 1637 GRP2 multivariable 2.69 1.18 6.13 0.019
#> 15 Score 3-4 412 GRP2 univariable 5.11 2.26 11.6 <0.001
#> 16 Score 3-4 412 GRP2 multivariable 4.46 1.95 10.2 <0.001
#> 17 Score >=5 42 GRP2 univariable 5.05 2.19 11.6 <0.001
#> 18 Score >=5 42 GRP2 multivariable 4.08 1.73 9.59 0.001
#> 19 Treatment B 2951 GRP2 univariable 1.48 1.16 1.88 0.001
#> 20 Treatment B 2951 GRP2 multivariable 1.23 0.95 1.59 0.114
Now that we have the data in this format, the plot itself is straightforward:
library(ggplot2)
ggplot(df, aes(HR, Variable)) +
  geom_pointrange(aes(xmin = lower, xmax = upper, colour = type),
                  position = position_dodge(width = 0.5)) +
  facet_grid(group ~ ., switch = "y") +
  geom_vline(xintercept = 1, linetype = 2) +  # reference line at HR = 1 (no effect)
  theme_bw() +
  theme(strip.placement = "outside",
        strip.text = element_text(angle = 180),
        strip.background = element_blank(),
        panel.spacing = unit(0, "mm"))
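Since hazard ratios are multiplicative, putting the x axis on a log scale is often worth trying as well; one optional extra layer:
last_plot() + scale_x_log10()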
Created on 2021-11-01 by the reprex package (v2.0.0)

Calculating cumulated values using grouping

I am trying to calculate cumulated acetone and acetaldehyde emission from different soil incubations across three time points. Emission of the compounds was measured from six soils (of different soil_types) on three days. I wish to calculate the cumulated emission for each soil for each time point.
The end goal is to calculate the average emission from all soils and present a graph similar to this one (except there should be error bars on my graph):
Can anyone spot where I'm going wrong?
Here's the code:
library(tidyverse)
library(plotrix)
df %>%
  group_by(soil, compound, days) %>%
  mutate(cum_emission = cumsum(emission)) %>%
  summarise(mean = mean(cum_emission, na.rm = TRUE),
            sd = sd(cum_emission, na.rm = TRUE),
            se = std.error(cum_emission, na.rm = TRUE))
Here's the data:
df <- structure(list(days = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4,
4, 4, 4, 4, 4, 4, 4, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 4, 4, 4, 4), soil = c(12, 12, 2, 2, 1, 1, 9, 9, 13, 13,
3, 3, 12, 12, 2, 2, 1, 1, 9, 9, 12, 12, 2, 2, 1, 1, 9, 9, 13,
13, 3, 3, 13, 13, 3, 3), soil_type = c("organic", "organic",
"mineral", "mineral", "mineral", "mineral", "organic", "organic",
"organic", "organic", "mineral", "mineral", "organic", "organic",
"mineral", "mineral", "mineral", "mineral", "organic", "organic",
"organic", "organic", "mineral", "mineral", "mineral", "mineral",
"organic", "organic", "organic", "organic", "mineral", "mineral",
"organic", "organic", "mineral", "mineral"), compound = c("Acetone",
"Acetaldehyde", "Acetone", "Acetaldehyde", "Acetone", "Acetaldehyde",
"Acetone", "Acetaldehyde", "Acetone", "Acetaldehyde", "Acetone",
"Acetaldehyde", "Acetone", "Acetaldehyde", "Acetone", "Acetaldehyde",
"Acetone", "Acetaldehyde", "Acetone", "Acetaldehyde", "Acetone",
"Acetaldehyde", "Acetone", "Acetaldehyde", "Acetone", "Acetaldehyde",
"Acetone", "Acetaldehyde", "Acetone", "Acetaldehyde", "Acetone",
"Acetaldehyde", "Acetone", "Acetaldehyde", "Acetone", "Acetaldehyde"
), emission = c(0.01, 0, 0.03, 0.03, 0.07, 0.06, 0.33, 0.1, 0.02,
0.01, 0.01, 0, 0.02, 0.01, 0.07, 0.08, 0.09, 0.07, 0.32, 0.22,
0.01, 0, 0.06, 0.06, 0.08, 0.06, 0.23, 0.14, 0.4, 0.04, 0.14,
0, 0.05, 0.05, 0.14, 0)), row.names = c(NA, -36L), class = c("tbl_df",
"tbl", "data.frame"))
This only addresses the setup of the data, not the plotting (sorry for the partial answer!).
You wrote that you wanted to group by soil, compound, days; did you mean soil_type, compound, days? As @maarvd pointed out, grouping by soil makes every row unique.
When I modified the code to
df %>%
  group_by(soil_type, compound, days) %>%
  mutate(cum_emission = cumsum(emission)) %>%
  summarise(mean = mean(cum_emission, na.rm = TRUE),
            sd = sd(cum_emission, na.rm = TRUE),
            se = std.error(cum_emission, na.rm = TRUE))
I was able to produce the following results:
# A tibble: 12 x 6
# Groups: soil_type, compound [4]
soil_type compound days mean sd se
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 mineral Acetaldehyde 0 0.0700 0.0346 0.02
2 mineral Acetaldehyde 4 0.127 0.0404 0.0233
3 mineral Acetaldehyde 10 0.10 0.0346 0.02
4 mineral Acetone 0 0.08 0.0436 0.0252
5 mineral Acetone 4 0.177 0.116 0.0669
6 mineral Acetone 10 0.16 0.111 0.0643
7 organic Acetaldehyde 0 0.07 0.0608 0.0351
8 organic Acetaldehyde 4 0.173 0.144 0.0829
9 organic Acetaldehyde 10 0.107 0.0945 0.0546
10 organic Acetone 0 0.237 0.197 0.113
11 organic Acetone 4 0.25 0.201 0.116
12 organic Acetone 10 0.297 0.319 0.184
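Since the answer stops at the data setup, here is a minimal sketch of the plotting step, assuming the summary above is stored as df_sum (std.error() is from plotrix, loaded earlier):
library(ggplot2)
df_sum <- df %>%
  group_by(soil_type, compound, days) %>%
  mutate(cum_emission = cumsum(emission)) %>%
  summarise(mean = mean(cum_emission, na.rm = TRUE),
            se = std.error(cum_emission, na.rm = TRUE),
            .groups = "drop")
ggplot(df_sum, aes(days, mean, colour = soil_type)) +
  geom_line() +
  geom_point() +
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.4) +  # mean +/- 1 SE
  facet_wrap(~ compound) +
  labs(x = "Days", y = "Cumulated emission")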
Edit: changes based on @Tiptop's comment
If you're looking for cumulative moving averages, how about this?
I'm sure some of this I didn't originally write, but wherever it originated, I've repurposed it many times.
You won't need plotrix, but you will need the tidyquant library.
library(tidyverse)
library(tidyquant)

UDF_roll <- function(x, na.rm = TRUE) {
  m <- mean(x, na.rm = na.rm)  # the rolling average
  s <- sd(x, na.rm = na.rm)    # the sd, to find the confidence interval
  hi <- m + 2 * s              # CI high
  lo <- m - 2 * s              # CI low
  vals <- c(Mean = m, SD = s, HI.95 = hi, LO.95 = lo)
  return(vals)
}

# Loop over each compound (I'm assuming that the data you provided is a sample and you have more)
trends <- vector("list")   # empty list to store the results
cp <- unique(df$compound)  # unique compound names
for (i in seq_along(cp)) {
  trends[[i]] <- df %>% as.data.frame() %>%  # add results to the list
    filter(compound == cp[i]) %>%            # one compound at a time
    arrange(days) %>%
    # the rolling functions require a time series with dates, so dummy dates are added
    mutate(time = seq(as.Date("2010/1/1"), by = "month", length.out = nrow(.)),
           cum_emission = cumsum(emission)) %>%
    arrange(compound, -days) %>%        # most recent on top for the TS
    tq_mutate(select = cum_emission,    # collect mean, sd, CI bounds
              mutate_fun = rollapply,
              width = 2,                # 2: current & previous reading
              align = "right",
              by.column = FALSE,
              FUN = UDF_roll,           # calls the function above
              na.rm = TRUE) %>%
    ggplot(aes(x = seq_along(time))) +
    geom_point(aes(y = cum_emission), color = "black", alpha = 0.2) +             # cumulative emission
    geom_ribbon(aes(ymin = LO.95, ymax = HI.95), fill = "azure3", alpha = 0.4) +  # confidence interval
    geom_jitter(aes(y = Mean, color = Mean), size = 1, alpha = 0.9) +             # rolling average
    labs(title = paste0(cp[[i]], ": Trends and Volatility\nIncremental Moving Average with 95% CI Bands (+/-2 SD)"),
         x = "", y = "Soil Emissions") +
    scale_color_viridis_c(end = .8) +
    theme_bw() +
    theme(legend.position = "none")
}
trends[[1]]
trends[[2]]
trends[[1]]$data # you can NULL the time column if you use the data another way
This turns the data into a time series. The plots:
The data is shown below. If you wanted to group it differently, you'll have to add the argument .groups = "drop" to the summarise() call, or you won't be able to get it through tq_mutate.
# A tibble: 18 x 11
days soil soil_type compound emission time cum_emission Mean SD HI.95 LO.95
<dbl> <dbl> <chr> <chr> <dbl> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 12 organic Acetone 0.01 2010-01-01 0.01 NA NA NA NA
2 0 2 mineral Acetone 0.03 2010-02-01 0.04 0.025 0.0212 0.0674 -0.0174
3 0 1 mineral Acetone 0.07 2010-03-01 0.11 0.075 0.0495 0.174 -0.0240
4 0 9 organic Acetone 0.33 2010-04-01 0.44 0.275 0.233 0.742 -0.192
5 0 13 organic Acetone 0.02 2010-05-01 0.46 0.45 0.0141 0.478 0.422
6 0 3 mineral Acetone 0.01 2010-06-01 0.47 0.465 0.00707 0.479 0.451
7 4 12 organic Acetone 0.02 2010-07-01 0.49 0.48 0.0141 0.508 0.452
8 4 2 mineral Acetone 0.07 2010-08-01 0.56 0.525 0.0495 0.624 0.426
9 4 1 mineral Acetone 0.09 2010-09-01 0.65 0.605 0.0636 0.732 0.478
10 4 9 organic Acetone 0.32 2010-10-01 0.97 0.81 0.226 1.26 0.357
11 4 13 organic Acetone 0.05 2010-11-01 1.02 0.995 0.0354 1.07 0.924
12 4 3 mineral Acetone 0.14 2010-12-01 1.16 1.09 0.0990 1.29 0.892
13 10 12 organic Acetone 0.01 2011-01-01 1.17 1.16 0.00707 1.18 1.15
14 10 2 mineral Acetone 0.06 2011-02-01 1.23 1.2 0.0424 1.28 1.12
15 10 1 mineral Acetone 0.08 2011-03-01 1.31 1.27 0.0566 1.38 1.16
16 10 9 organic Acetone 0.23 2011-04-01 1.54 1.42 0.163 1.75 1.10
17 10 13 organic Acetone 0.4 2011-05-01 1.94 1.74 0.283 2.31 1.17
18 10 3 mineral Acetone 0.14 2011-06-01 2.08 2.01 0.0990 2.21 1.81

Modify columns' values depending on other columns

My dataset contains NDVI values and NDVI quality-descriptor values (PixelQa) for different areas on different dates. I basically want to erase (set to NA) the NDVI values that are associated with a bad quality descriptor (PixelQa). The numeric suffix of the column names links the two: PixelQa_1 relates to NDVI_1, and so on.
Therefore, to "clean" my data, I have to check each PixelQa value in order to decide whether its related NDVI value has to change. There are 3 possible situations:
PixelQa is NA -> the NDVI should also be NA.
PixelQa is 66±0.5 OR 130±0.5 -> the NDVI value stays the same.
PixelQa is anything other than 66±0.5 OR 130±0.5 -> the NDVI value is set to NA (this is bad-quality data which needs to be ignored).
My dataset could be:
DataNDVI_split <- data.frame("21feb1987_NDVI" = c(0.123, NA, 0.192, 0.234, NA),
                             "21feb1987_PixelQa" = c(66.30, NA, 66.00, 79.87, NA),
                             "18jul1987_NDVI" = c(0.223, NA, 0.230, 0.334, NA),
                             "18jul1987_PixelQa" = c(66.30, NA, 66.00, 79.87, NA),
                             stringsAsFactors = FALSE)
DataNDVI_split
X21feb1987_NDVI X21feb1987_PixelQa X18jul1987_NDVI X18jul1987_PixelQa
1 0.123 66.30 0.223 66.30
2 NA NA NA NA
3 0.192 66.00 0.230 66.00
4 0.234 79.87 0.334 79.87
5 NA NA NA NA
And "clean" it should look like:
DataNDVI_split <- data.frame("21feb1987_NDVI" = c(0.123, NA, 0.192, 0.234, NA), "21feb1987_PixelQa" = c(66.30, NA, 66.00, 79.87, NA), "18jul1987_NDVI" = c(0.223, NA, 0.230, 0.334, NA), "21feb1987_PixelQa" = c(66.30, NA, 66.00, 79.87, NA), stringsAsFactors = FALSE)
DataNDVI_split
X21feb1987_NDVI X21feb1987_PixelQa X18jul1987_NDVI X18jul1987_PixelQa
1 0.123 66.30 0.223 66.30
2 NA NA NA NA
3 0.192 66.00 0.230 66.00
4 NA 79.87 NA 79.87
5 NA NA NA NA
Here's a tentative solution.
First, I'd split up the data into two separate dataframes, thus:
df_ndvi <- DataNDVI[grepl("NDVI", DataNDVI$Data), ]
df_ndvi
Data X21feb1987 X18jul1987
1 NDVI1 0.123 0.223
2 NDVI2 NA NA
3 NDVI3 0.192 0.230
4 NDVI4 0.234 0.334
5 NDVI5 NA NA
df_pixel <- DataNDVI[!grepl("NDVI", DataNDVI$Data), ]
df_pixel
Data X21feb1987 X18jul1987
6 PixelQa1 66.30 66.00
7 PixelQa2 NA NA
8 PixelQa3 66.00 124.23
9 PixelQa4 79.87 86.00
10 PixelQa5 NA NA
To perform the desired changes, there are many possible ways. One way is to use a for loop over all the columns in df_ndvi (except the first!) with an ifelse() that checks whether each PixelQa value falls in one of the good-quality bands (66±0.5 or 130±0.5) and blanks the NDVI value when it does not:
for(i in 2:3){
  good <- abs(df_pixel[, i] - 66) <= 0.5 | abs(df_pixel[, i] - 130) <= 0.5
  df_ndvi[, i] <- ifelse(good, df_ndvi[, i], NA)  # an NA in PixelQa also yields NA
}
This results in these corrections in df_ndvi:
df_ndvi
Data X21feb1987 X18jul1987
1 NDVI1 0.123 0.223
2 NDVI2 NA NA
3 NDVI3 0.192 NA
4 NDVI4 NA NA
5 NDVI5 NA NA
EDIT:
If you prefer to split up the data in this way:
DataNDVI_split <- data.frame("21feb1987_NDVI" = c(0.123, NA, 0.192, 0.234, NA),
                             "21feb1987_PixelQa" = c(66.30, NA, 66.00, 79.87, NA),
                             "18jul1987_NDVI" = c(0.223, NA, 0.230, 0.334, NA),
                             "18jul1987_PixelQa" = c(66.30, NA, 66.00, 79.87, NA),
                             stringsAsFactors = FALSE)
DataNDVI_split
X21feb1987_NDVI X21feb1987_PixelQa X18jul1987_NDVI X18jul1987_PixelQa
1 0.123 66.30 0.223 66.30
2 NA NA NA NA
3 0.192 66.00 0.230 66.00
4 0.234 79.87 0.334 79.87
5 NA NA NA NA
then the for loop could be adapted thus:
for(i in c(1, 3)){
  good <- abs(DataNDVI_split[, i + 1] - 66) <= 0.5 | abs(DataNDVI_split[, i + 1] - 130) <= 0.5
  DataNDVI_split[, i] <- ifelse(good, DataNDVI_split[, i], NA)
}
The result is this:
DataNDVI_split
X21feb1987_NDVI X21feb1987_PixelQa X18jul1987_NDVI X18jul1987_PixelQa
1 0.123 66.30 0.223 66.30
2 NA NA NA NA
3 0.192 66.00 0.230 66.00
4 NA 79.87 NA 79.87
5 NA NA NA NA
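If the real dataset has many more date/ID pairs, the same idea generalises. A sketch, assuming (as above) that each NDVI column is immediately followed by its PixelQa column:
ndvi_cols <- grep("NDVI", names(DataNDVI_split))  # positions of the NDVI columns
for (i in ndvi_cols) {
  qa <- DataNDVI_split[[i + 1]]  # the paired PixelQa column
  good <- abs(qa - 66) <= 0.5 | abs(qa - 130) <= 0.5
  DataNDVI_split[[i]] <- ifelse(good, DataNDVI_split[[i]], NA)
}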

R - Matching columns in dataframe

I have this NDVI timeseries dataset where the first column is dates and the next three are NDVI data for three different IDs (59231, 158157, 282302):
Date X59231 X158157 X282302
1 13149 NA 0.398 NA
2 13157 0.344 0.267 0.327
3 13165 NA 0.431 NA
. ..... ..... ..... .....
Here's the dput:
structure(list(Date = c(13149L, 13157L, 13165L, 13173L, 13181L,
13189L, 13197L, 13205L, 13213L, 13221L, 13229L, 13237L, 13245L,
13253L, 13261L, 13269L, 13277L, 13285L, 13293L, 13301L, 13309L,
13317L, 13325L, 13333L, 13341L, 13349L, 13357L, 13365L, 13373L,
13381L, 13389L, 13397L, 13405L, 13413L, 13421L, 13429L, 13437L,
13445L, 13453L, 13461L, 13469L, 13477L, 13485L, 13493L, 13501L,
13509L), X59231 = c(NA, 0.344, NA, 0.398, NA, 0.587, NA, NA,
0.451, 0.597, 0.593, 0.556, 0.559, 0.375, 0.374, 0.386, 0.425,
0.383, 0.349, 0.315, 0.282, 0.323, 0.315, 0.359, 0.292, 0.271,
0.297, 0.307, 0.322, 0.344, 0.297, 0.285, 0.273, 0.282, 0.281,
0.304, 0.314, NA, 0.391, 0.601, 0.65, NA, 0.653, 0.666, 0.519,
0.625), X158157 = c(0.398, 0.267, 0.431, NA, 0.36, 0.434, 0.434,
0.465, 0.447, 0.521, 0.539, 0.563, 0.595, 0.541, 0.553, 0.381,
0.533, 0.505, 0.551, NA, 0.546, 0.535, 0.523, 0.501, 0.508, 0.51,
0.506, 0.51, 0.514, 0.526, 0.555, 0.545, 0.53, 0.539, 0.531,
0.53, NA, 0.585, 0.597, 0.32, 0.569, 0.601, NA, NA, 0.52, 0.532
), X282302 = c(NA, 0.327, NA, 0.282, 0.26, 0.293, 0.25, 0.288,
0.336, 0.299, 0.29, 0.28, NA, 0.305, 0.319, NA, 0.255, 0.292,
0.294, NA, NA, 0.367, 0.331, 0.344, 0.283, 0.284, 0.291, 0.273,
0.239, 0.285, 0.249, 0.285, 0.247, 0.288, 0.276, NA, 0.317, 0.375,
0.38, 0.417, 0.374, 0.491, NA, NA, NA, 0.471)), class = "data.frame", row.names = c(NA,
-46L))
I run the following code to smooth the timeseries (get rid of noise) and find the multiple maxs and mins for each ID's NDVI timeseries.
rm(list = ls())

# Read in CSV data
df <- read.csv("Data.csv", header = TRUE)
date_col <- df[, 1]
num_cols <- length(df[1, ])  # count the number of columns
num_Dcols <- num_cols - 1    # number of data columns, excluding the date (first) column

# Function to append columns of unequal length to a data frame
cbind.fill <- function(...){
  nm <- list(...)
  nm <- lapply(nm, as.matrix)
  n <- max(sapply(nm, nrow))
  do.call(cbind, lapply(nm, function(x)
    rbind(x, matrix(, n - nrow(x), ncol(x)))))
}

# Function to calculate all the maximums
ts_max <- function(signal){
  points_max <- which(diff(sign(diff(signal))) == -2) + 1
  return(points_max)
}

# Function to calculate all the minimums
ts_min <- function(signal){
  points_min <- which(diff(sign(diff(-signal))) == -2) + 1
  return(points_min)
}

# Create an empty data frame and an empty vector for column names
finalDF <- data.frame(matrix(ncol = 0, nrow = 0))
CNames <- c()

for (i in c(1:num_Dcols)) {
  df_sub <- df[, c(1, i + 1)]  # data frame of the date column and the (i+1)-th column
  df_removeNA <- na.omit(df_sub)

  # Append the date column to the final data frame
  df_date <- df_removeNA[, 1]
  finalDF <- cbind.fill(finalDF, df_date)

  # Append the NDVI timeseries column to the final data frame
  df_data <- df_removeNA[, 2]
  finalDF <- cbind.fill(finalDF, df_data)

  # Smooth the timeseries
  stl_1 <- stl(ts(df_data, frequency = 4), "periodic")
  trend_1 <- as.numeric(stl_1$time.series[, 2])

  # Find maxs and mins of the smoothed timeseries
  max_1 <- ts_max(trend_1)
  min_1 <- ts_min(trend_1)

  # Append maxs and mins to the final data frame
  finalDF <- cbind.fill(finalDF, df_data[max_1])
  finalDF <- cbind.fill(finalDF, df_data[min_1])

  # Append column names to the column names vector
  CNames <- c(CNames, toString(colnames(df_sub[1])))
  CNames <- c(CNames, toString(colnames(df_sub[2])))
  CNames <- c(CNames, paste(c(toString(colnames(df_sub[2])), "_Max"), collapse = ''))
  CNames <- c(CNames, paste(c(toString(colnames(df_sub[2])), "_Min"), collapse = ''))

  # Plot the results
  plot(df_date, trend_1, type = 'l')
  abline(v = df_date[max_1], col = "red")
  abline(v = df_date[min_1], col = "blue")
}

# Rename the final data frame's columns and export to CSV
colnames(finalDF) <- CNames
write.csv(finalDF, file = "finalDF_smooth.csv")
Here's an image of all the maxs and mins for the first column of NDVI timeseries data.
What I'm trying to figure out is how to add two new columns next to each ID column in the original (or a new) data frame, where I can store the maximums and minimums. Each maximum and minimum needs to be placed in the cell that matches its corresponding date. In other words, I need two duplicates of each ID column, inserted next to it, with all values replaced by NA except the maximums and minimums, both of which were calculated in the smoothing code above. For example, this is what I need the final dataframe to look like:
Date 59231 59231_Max 59231_Min 158157 158157_Max 158157_Min 282302 282302_Max 282302_Min
13149 NA NA NA 0.398 NA NA NA NA NA
13157 0.344 NA NA 0.267 NA NA 0.327 NA NA
13165 NA NA NA 0.431 NA NA NA NA NA
13173 0.398 NA NA NA NA NA 0.282 NA NA
13181 NA NA NA 0.360 NA NA 0.260 NA NA
13189 0.587 NA NA 0.434 NA NA 0.293 NA 0.293
13197 NA NA NA 0.434 NA NA 0.25 NA NA
13205 NA NA NA 0.465 NA NA 0.288 NA NA
13213 0.451 NA NA 0.447 NA NA 0.336 NA NA
13221 0.597 NA NA 0.521 NA NA 0.299 0.299 NA
... ... .. .. ... .. .. ... ... ..
This is what it looks like right now.
Date 59231 59231_Max 59231_Min Date 158157 158157_Max 158157_Min Date 282302 282302_Max 282302_Min
13157 0.344 0.593 0.386 13149 0.398 0.595 0.533 13157 0.327 0.299 0.293
13173 0.398 0.425 0.282 13157 0.267 0.546 0.508 13173 0.282 0.331 0.255
13189 0.587 0.315 0.297 13165 0.431 0.545 0.539 13181 0.260 NA 0.285
13213 0.451 0.322 0.273 13181 0.360 0.530 0.320 13189 0.293 NA NA
13221 0.597 0.653 NA 13189 0.434 NA NA 13197 0.250 NA NA
13229 0.593 NA NA 13197 0.434 NA NA 13205 0.288 NA NA
13237 0.556 NA NA 13205 0.465 NA NA 13213 0.336 NA NA
13245 0.559 NA NA 13213 0.447 NA NA 13221 0.299 NA NA
13253 0.375 NA NA 13221 0.521 NA NA 13229 0.290 NA NA
13261 0.374 NA NA 13229 0.539 NA NA 13237 0.280 NA NA
..... ... .. .. ..... ..... .. .. ..... ..... ... ..
Note: I had to omit NAs during each loop, so the code produces a CSV file with a unique subset date column for each ID. I would love to have just one date column, as in the ideal table above.
In my code I started to create a new data frame, appending columns after each loop, but I can't figure out how to match up the maxs and mins with the right cells. Right now all the maxs and mins are stacked at the top of their columns. Any ideas? Thanks.
How about this? It adds the min and max columns.
df$max <- apply(df[2:4], 1, max, na.rm = TRUE)  # row-wise max across the three ID columns
df$min <- apply(df[2:4], 1, min, na.rm = TRUE)  # row-wise min
head(df)
Which produces:
ID X59231 X158157 X282302 max min
1 13149 NA 0.398 NA 0.398 0.398
2 13157 0.344 0.267 0.327 0.344 0.267
3 13165 NA 0.431 NA 0.431 0.431
4 13173 0.398 NA 0.282 0.398 0.282
5 13181 NA 0.360 0.260 0.360 0.260
6 13189 0.587 0.434 0.293 0.587 0.293
I have added this based on the clarification that you have provided. You can ignore the bit above:
This will produce what you want. I have only done it for the first column, but you can just change the variables to get the other columns.
library(dplyr)
df2 <- as_tibble(df)
df2 <- df2 %>%
  mutate(X59231_min = min(X59231, na.rm = TRUE)) %>%
  mutate(X59231_min = ifelse(X59231 == X59231_min, X59231_min, NA)) %>%
  mutate(X59231_max = max(X59231, na.rm = TRUE)) %>%
  mutate(X59231_max = ifelse(X59231 == X59231_max, X59231_max, NA))
So:
df2 %>% filter(!is.na(X59231_min))
gives us:
# A tibble: 1 x 6
ID X59231 X158157 X282302 X59231_min X59231_max
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 13349 0.271 0.51 0.284 0.271 NA
And:
df2 %>% filter(!is.na(X59231_max))
Shows:
# A tibble: 1 x 6
ID X59231 X158157 X282302 X59231_min X59231_max
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 13493 0.666 NA NA NA 0.666
You should be able to do it for the other columns.
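To avoid copy-pasting the mutate() block per column, a loop over the ID columns is one possible generalisation. A sketch (it keeps the same global min/max logic as the dplyr code above, not the smoothed local extrema):
id_cols <- c("X59231", "X158157", "X282302")
for (nm in id_cols) {
  x <- df[[nm]]
  df[[paste0(nm, "_Max")]] <- ifelse(x == max(x, na.rm = TRUE), x, NA)  # NA everywhere except the max
  df[[paste0(nm, "_Min")]] <- ifelse(x == min(x, na.rm = TRUE), x, NA)  # NA everywhere except the min
}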

How to count rows in every column based on a threshold

I have the following data set:
dat <- structure(list(Probes = structure(1:6, .Label = c("1415670_at",
"1415671_at", "1415672_at", "1415673_at", "1415674_a_at", "1415675_at"
), class = "factor"), Genes = structure(c(2L, 1L, 4L, 5L, 6L,
3L), .Label = c("Atp6v0d1", "Copg1", "Dpm2", "Golga7", "Psph",
"Trappc4"), class = "factor"), bCD.ID.LN = c(1.133, 1.068, 1.01,
0.943, 1.048, 1.053), bCD.ID.LV = c(1.049, 1.006, 0.883, 0.799,
0.96, 1.104), bCD.ID.SP = c(1.124, 1.234, 1.029, 1.064, 1.118,
1.057), bCD.IP.LV = c(1.013, 1.082, 1.061, 0.982, 1.191, 1.053
), bCD.IP.SP = c(0.986, 1.102, 1.085, 0.997, 1.141, 1.041)), .Names = c("Probes",
"Genes", "bCD.ID.LN", "bCD.ID.LV", "bCD.ID.SP", "bCD.IP.LV",
"bCD.IP.SP"), row.names = c(NA, 6L), class = "data.frame")
It looks like this:
> dat
Probes Genes bCD.ID.LN bCD.ID.LV bCD.ID.SP bCD.IP.LV bCD.IP.SP
1 1415670_at Copg1 1.133 1.049 1.124 1.013 0.986
2 1415671_at Atp6v0d1 1.068 1.006 1.234 1.082 1.102
3 1415672_at Golga7 1.010 0.883 1.029 1.061 1.085
4 1415673_at Psph 0.943 0.799 1.064 0.982 0.997
5 1415674_a_at Trappc4 1.048 0.960 1.118 1.191 1.141
6 1415675_at Dpm2 1.053 1.104 1.057 1.053 1.041
What I want to do is, for the 3rd column onward, count the rows where the value is > 1.1.
So in the end it looks like this:
bCD.ID.LN 1
bCD.ID.LV 1
bCD.ID.SP 3
bCD.IP.LV 1
bCD.IP.SP 2
How can I do that?
We can try colSums on a logical matrix based on the numeric columns in the dataset.
Count <- colSums(dat[-(1:2)] > 1.1, na.rm=TRUE)
If we need it as a data.frame
d1 <- data.frame(Cnames = names(Count), Count=unname(Count))
If it is a large dataset, converting to a logical matrix may not be memory efficient; in that case, it would be better to loop using vapply:
vapply(dat[-(1:2)], function(x) sum(x > 1.1, na.rm=TRUE), 0)
Yet another version, this time using dplyr
dat %>%
  select(-c(Probes, Genes)) %>%
  summarise_each(funs(sum(. > 1.1)))
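Note that summarise_each() and funs() have since been deprecated in dplyr; with current dplyr the same counts can be obtained with across():
dat %>%
  summarise(across(where(is.numeric), ~ sum(.x > 1.1, na.rm = TRUE)))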
Here's an alternative version using lapply()
lapply(dat[-c(1:2)], function(x) length(which(x > 1.1)))
or this if you want it as a data.frame()
data.frame( lapply(dat[-c(1:2)], function(x) length(which(x > 1.1))))
