My dataframe looks like this
ID t1 obs1 t2 obs2 t3 obs3
1 0 a 11 d 0 g
2 0 b 13 e 11 i
3 0 c 0 f 0 h
I need to make sure each ID has at least one t above 10 (and delete the row if not). Then I want to save the lowest t value above 10, and also save the corresponding obs in new columns. (The complicated part of my question is that the lowest t above 10 could be in any column.) The obs corresponding to a given t is always located in the next column, which helps. So my resulting data frame would look like this:
ID t1 obs1 t2 obs2 t3 obs3 lowesttabove10 correspondingobs
1 0 a 11 d 0 g 11 d
2 0 b 13 e 11 i 11 i
With data.table, go to long format:
library(data.table)
setDT(DT)
dat = melt(DT, measure.vars = patterns("^t\\d+$", "^obs\\d+$"), value.name = c("t", "obs"))
setorder(dat, ID, variable)
# ID variable t obs
# 1: 1 1 0 a
# 2: 1 2 11 d
# 3: 1 3 0 g
# 4: 2 1 0 b
# 5: 2 2 13 e
# 6: 2 3 11 i
# 7: 3 1 0 c
# 8: 3 2 0 f
# 9: 3 3 0 h
Find max value per group and mark groups to keep:
IDDT = dat[order(-t),
  .(max.variable = first(variable), max.t = first(t), max.obs = first(obs)),
  by = ID]
IDDT[, keep := max.t > 10]
# ID max.variable max.t max.obs keep
# 1: 2 2 13 e TRUE
# 2: 1 2 11 d TRUE
# 3: 3 1 0 c FALSE
Find min value over 10 per kept group using a rolling update join:
IDDT[(keep), c("my.variable", "my.t", "my.obs") := {
m = .(ID = ID, t_thresh = 10)
dat[m, on=.(ID, t = t_thresh), roll=-Inf, .(x.variable, x.t, x.obs)]
}]
# ID max.variable max.t max.obs keep my.variable my.t my.obs
# 1: 2 2 13 e TRUE 3 11 i
# 2: 1 2 11 d TRUE 2 11 d
# 3: 3 1 0 c FALSE NA NA NA
I would stop here, with the main data in long format dat and the ID level variables in the separate table IDDT. To filter dat to groups that should be kept: dat[IDDT[(keep), .(ID)], on=.(ID)]. See ?data.table and the other intro materials mentioned when you load the package for details on the syntax.
See ?dcast if you insist on going back to wide.
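For completeness, a minimal sketch of that round trip, assuming the dat and IDDT tables built above (note that dcast with multiple value.var columns names the result t_1, obs_1, ..., not t1, obs1):
# cast back to wide, then join the per-ID summary columns on
wide = dcast(dat, ID ~ variable, value.var = c("t", "obs"))
wide[IDDT[(keep), .(ID, lowesttabove10 = my.t, correspondingobs = my.obs)], on = .(ID)]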
Using base R:
Drop all rows with no t-values above 10:
df1 <- df1[rowSums(df1[, grepl("^t", colnames(df1))] >10) > 0, ]
Determine the group that contains the lowest value above 10 and then retrieve values:
df1$group <- apply(df1[grepl("^t", names(df1))], 1, function(x) which(x == min(x[x > 10])))
df1 <- cbind(df1, do.call(rbind, lapply(seq_len(nrow(df1)),
function(x) setNames(df1[x, paste0(c("t", "obs"), df1$group[x])],
c("lowesttabove10", "correspondingobs")))))
> df1
ID t1 obs1 t2 obs2 t3 obs3 group lowesttabove10 correspondingobs
1 1 0 a 11 d 0 g 2 11 d
2 2 0 b 13 e 11 i 3 11 i
My approach is not neat, but it still works; you can try it.
library(dplyr)
library(reshape)
library(stringr)  # for str_detect() below
df1 = melt(df, id = 'ID')
# value is character after melting the mixed t/obs columns, hence as.numeric()
df2 = df1 %>% group_by(ID) %>% filter(as.numeric(value) > 10) %>%
  dplyr::slice(which.min(as.numeric(value))) %>% na.omit()
> df2
# A tibble: 2 x 3
# Groups: ID [2]
ID variable value
<int> <fctr> <chr>
1 1 t2 11
2 2 t3 11
df2$variable=as.character(df2$variable)
C=as.numeric(gsub("[[:alpha:]]", "", df2$variable))
df=df[df$ID%in%df2$ID,]
for (i in 1:length(C)){
  DF1 = df[i, str_detect(names(df), as.character(C[i]))]
  names(DF1) = c('lowesttabove10', 'correspondingobs')
  if (i == 1){ DFF = DF1 } else { DFF = rbind(DFF, DF1) }
}
cbind(df,DFF)
ID t1 obs1 t2 obs2 t3 obs3 lowesttabove10 correspondingobs
1 1 0 a 11 d 0 g 11 d
2 2 0 b 13 e 11 i 11 i
This solution uses dplyr and tidyr in one pipeline. dt is the original data, and dt2 is the final output.
library(dplyr)
library(tidyr)
dt2 <- dt %>%
gather(t_group, t_value, starts_with("t")) %>%
gather(obs_group, obs_value, starts_with("obs")) %>%
filter(gsub("t", "", t_group) == gsub("obs", "", obs_group)) %>%
filter(t_value >= 10) %>%
filter(t_value == min(t_value)) %>%
select(ID, lowesttabove10 = t_value, correspondingobs = obs_value) %>%
inner_join(dt, by = "ID") %>%
select(colnames(dt), lowesttabove10, correspondingobs)
dt2
ID t1 obs1 t2 obs2 t3 obs3 lowesttabove10 correspondingobs
1 1 0 a 11 d 0 g 11 d
2 2 0 b 13 e 11 i 11 i
Data:
dt <- read.table(text = "ID t1 obs1 t2 obs2 t3 obs3
1 0 a 11 d 0 g
2 0 b 13 e 11 i
3 0 c 0 f 0 h",
header = TRUE, stringsAsFactors = FALSE)
group_ID <- c("a","a","a","a","a","b","b","b","b","b","b","b","b")
class <- c("p","q","q","q","q","p","p","p","q","q","q","q","q")
var1 <- c(3,1,1,1,1,3,2,1,1,2,2,4,1)
my_table <- data.frame(group_ID,class,var1)
I have the following table.
group_ID class var1
a p 3
a q 1
a q 1
a q 1
a q 1
b p 3
b p 2
b p 1
b q 1
b q 2
b q 2
b q 4
b q 1
I want to create a new column by dividing the median of var1 for class p by the median of var1 for class q, within each group. The expected output is shown below.
group_ID class var1 var1_ratio
a p 3 3
a q 1 3
a q 1 3
a q 1 3
a q 1 3
b p 3 1
b p 2 1
b p 1 1
b q 1 1
b q 2 1
b q 2 1
b q 4 1
b q 1 1
Link: This question seems to be the most similar to mine, and I tried using group_by() and mutate_each() as below, but I can't get it to work.
my_table <- my_table %>%
group_by(group_ID,class) %>%
mutate_each(funs(./median(.[class == "p"])), var1)
I also tried: Link1, Link2, Link3.
Thanks!
We don't need mutate_each for this:
library(dplyr)
my_table %>%
# // grouped by group_ID, class
group_by(group_ID, class) %>%
# // create a median column
mutate(Median= median(var1)) %>%
# // reset the grouping by removing class
group_by(group_ID) %>%
# // divide the first Median value of class 'p' by that of class 'q'
mutate(var1_ratio = first(Median[class == 'p'])/first(Median[class == 'q']),
Median = NULL)
# A tibble: 13 x 4
# Groups: group_ID [2]
# group_ID class var1 var1_ratio
# <chr> <chr> <dbl> <dbl>
# 1 a p 3 3
# 2 a q 1 3
# 3 a q 1 3
# 4 a q 1 3
# 5 a q 1 3
# 6 b p 3 1
# 7 b p 2 1
# 8 b p 1 1
# 9 b q 1 1
#10 b q 2 1
#11 b q 2 1
#12 b q 4 1
#13 b q 1 1
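The same idea also works in a single grouped mutate, without the helper column (a compact variant of the above):
my_table %>%
  group_by(group_ID) %>%
  mutate(var1_ratio = median(var1[class == 'p']) / median(var1[class == 'q']))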
Here is a base R solution. It uses aggregate twice: once to compute the per-class medians and once to compute the ratios (this step relies on the class-p row preceding the class-q row within each group, which aggregate's output ordering guarantees here). Then it merges with the original to put the new column values in their place.
agg <- aggregate(var1 ~ ., my_table, median)
agg <- aggregate(var1 ~ group_ID, agg, function(x) x[1]/x[2])
names(agg)[2] <- "var1_ratio"
merge(my_table, agg)
You can also try creating summaries and then joining to the original data:
library(tidyverse)
my_table %>% left_join(my_table %>%
group_by(group_ID,class) %>%
summarise(Median=median(var1)) %>%
pivot_wider(names_from = class,values_from = Median,
names_prefix = 'Median.')) %>%
mutate(Ratio=Median.p/Median.q) %>% select(-c(Median.p,Median.q))
Output:
group_ID class var1 Ratio
1 a p 3 3
2 a q 1 3
3 a q 1 3
4 a q 1 3
5 a q 1 3
6 b p 3 1
7 b p 2 1
8 b p 1 1
9 b q 1 1
10 b q 2 1
11 b q 2 1
12 b q 4 1
13 b q 1 1
I have a dataset in R that looks like the one below. I found similar posts (like Counting number of times a value occurs), but none exactly the same.
id <- c(1,1,1, 2,2,2, 3,3,3,3)
cat.1 <- c("a","a","a","b","b","b","c","c","c","c")
cat.2 <- c("m","m","m","f","f","f","m","m","m","m")
score <- c(-1,0,-1, 1,0,1, -1,0,1,1)
data <- data.frame("id"=id, "cat.1"=cat.1, "cat.2"=cat.2, "score"=score)
data
id cat.1 cat.2 score
1 1 a m -1
2 1 a m 0
3 1 a m -1
4 2 b f 1
5 2 b f 0
6 2 b f 1
7 3 c m -1
8 3 c m 0
9 3 c m 1
10 3 c m 1
I would like to count number of -1 values in the score variable within each id. Also, I would like to keep the cat.1 and cat.2 variables. Desired output would be:
id cat.1 cat.2 count(-1)
1 1 a m 2
2 2 b f 0
3 3 c m 1
Do you have any suggestions?
Thanks!
This is something we can use dplyr for:
data %>%
group_by(id, cat.1, cat.2) %>% # or: group_by_at(vars(-score))
summarise(count_neg_1 = sum(score == -1))
# id cat.1 cat.2 count_neg_1
# 1 1 a m 2
# 2 2 b f 0
# 3 3 c m 1
You can change the name of the calculated column if you so desire. I generally avoid anything other than a letter, number, or underscore in my variable names.
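For instance, if you really want the count(-1) header from the question, backticks allow such a non-syntactic name (a small sketch):
data %>%
  group_by(id, cat.1, cat.2) %>%
  summarise(`count(-1)` = sum(score == -1))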
One base R possibility could be:
aggregate(score ~ ., FUN = function(x) sum(x == -1), data = data)
id cat.1 cat.2 score
1 2 b f 0
2 1 a m 2
3 3 c m 1
If you have more variables in your data and you want to group by just these three, then you can specify that explicitly with aggregate(score ~ id + cat.1 + cat.2, ...).
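Spelled out with the same FUN and data arguments as above, that call would be:
aggregate(score ~ id + cat.1 + cat.2, FUN = function(x) sum(x == -1), data = data)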
library(data.table)
setDT(data)[ , sum(score == -1), by=c('id', 'cat.1', 'cat.2')]
# id cat.1 cat.2 V1
# 1: 1 a m 2
# 2: 2 b f 0
# 3: 3 c m 1
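To get a named column instead of the default V1, wrap the computation in .() (a small variant of the above):
setDT(data)[, .(count_neg_1 = sum(score == -1)), by = .(id, cat.1, cat.2)]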
Another option is count
library(dplyr)
data %>%
mutate(score = score == -1) %>%
dplyr::count(id, cat.1, cat.2, wt = score)
# A tibble: 3 x 4
# id cat.1 cat.2 n
# <dbl> <fct> <fct> <int>
#1 1 a m 2
#2 2 b f 0
#3 3 c m 1
How can I create a new data frame with the smallest values, grouped by a column?
For example this df:
df <- read.table(header = TRUE, text = 'Gene Value
A 12
A 10
B 3
B 0
B 6
C 1
D 0
D 4')
Now with:
test <- setDT(df)[, .SD[which.min(Value)], by=Gene]
I get this:
> test
Gene Value
1: A 10
2: B 0
3: C 1
4: D 0
But how can I use a second condition for Value > 0 here? I want to have this output:
> test
Gene Value
1: A 10
2: B 3
3: C 1
4: D 4
Could do:
setDT(df)[, .(Value = min(Value[Value > 0])), by=Gene]
Output:
Gene Value
1: A 10
2: B 3
3: C 1
4: D 4
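If you would rather keep the .SD[which.min(...)] form from the question, filtering in i first works too (a small sketch):
test <- setDT(df)[Value > 0, .SD[which.min(Value)], by=Gene]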
Using tidyverse you can group, filter and then summarize the min value:
library(tidyverse)
df2 <- df %>%
group_by(Gene) %>%
filter(Value > 0) %>%
summarise(Value = min(Value))
# A tibble: 4 x 2
Gene Value
<fct> <dbl>
1 A 10
2 B 3
3 C 1
4 D 4
Using aggregate from base R
aggregate(Value ~ Gene, subset(df, Value > 0), min)
# Gene Value
#1 A 10
#2 B 3
#3 C 1
#4 D 4
Consider the following two tibbles:
library(tidyverse)
a <- tibble(time = c(-1, 0), value = c(100, 200))
b <- tibble(id = rep(letters[1:2], each = 3), time = rep(1:3, 2), value = 1:6)
So a and b have the same columns and b has an additional column called id.
I want to do the following: group b by id and then add tibble a on top of each group.
So the output should look like this:
# A tibble: 10 x 3
id time value
<chr> <int> <int>
1 a -1 100
2 a 0 200
3 a 1 1
4 a 2 2
5 a 3 3
6 b -1 100
7 b 0 200
8 b 1 4
9 b 2 5
10 b 3 6
Of course there are multiple workarounds to achieve this (loops, for example). But in my case I have a large number of IDs and a very large number of columns.
I would be grateful if anyone could point me in the direction of a solution within the tidyverse.
Thank you
We can expand the data frame a with id from b and then bind_rows them together.
library(tidyverse)
a2 <- expand(a, id = b$id, nesting(time, value))
b2 <- bind_rows(a2, b) %>% arrange(id, time)
b2
# # A tibble: 10 x 3
# id time value
# <chr> <dbl> <dbl>
# 1 a -1 100
# 2 a 0 200
# 3 a 1 1
# 4 a 2 2
# 5 a 3 3
# 6 b -1 100
# 7 b 0 200
# 8 b 1 4
# 9 b 2 5
# 10 b 3 6
split from base R will divide a data frame into a list of subsets based on an index. Note that this appends a after each group's rows (as the output below shows); add arrange(id, time) at the end if you want the row order from the desired output.
b %>%
split(b[["id"]]) %>%
lapply(bind_rows, a) %>%
lapply(select, -"id") %>%
bind_rows(.id = "id")
# # A tibble: 10 x 3
# id time value
# <chr> <dbl> <dbl>
# 1 a 1 1
# 2 a 2 2
# 3 a 3 3
# 4 a -1 100
# 5 a 0 200
# 6 b 1 4
# 7 b 2 5
# 8 b 3 6
# 9 b -1 100
# 10 b 0 200
One base R idea is to split your data frame by id, rbind a (with the id column added) on top of each piece, and then rbind the pieces back together, i.e.
df = do.call(rbind, lapply(split(b, b$id), function(i)rbind(data.frame(id = i$id[1], a), i)))
which gives
id time value
a.1 a -1 100
a.2 a 0 200
a.3 a 1 1
a.4 a 2 2
a.5 a 3 3
b.1 b -1 100
b.2 b 0 200
b.3 b 1 4
b.4 b 2 5
b.5 b 3 6
NOTE: You can remove the rownames by simply calling rownames(df) <- NULL
We can nest and add the relevant rows to each nested item :
library(tidyverse)
b %>%
nest(-id) %>%
mutate(data= map(data,~bind_rows(a,.x))) %>%
unnest
# # A tibble: 10 x 3
# id time value
# <chr> <dbl> <dbl>
# 1 a -1 100
# 2 a 0 200
# 3 a 1 1
# 4 a 2 2
# 5 a 3 3
# 6 b -1 100
# 7 b 0 200
# 8 b 1 4
# 9 b 2 5
# 10 b 3 6
Maybe not the most efficient way, but easy to follow:
library(tidyverse)
a <- tibble(time = c(-1, 0), value = c(100, 200))
b <- tibble(id = rep(letters[1:2], each = 3), time = rep(1:3, 2), value =
1:6)
a.a <- a %>% add_column(id = rep("a",length(a)))
a.b <- a %>% add_column(id = rep("b",length(a)))
joint <- bind_rows(b,a.a,a.b)
(joint <- arrange(joint,id))
I need to process the rows of a data frame in order, but for certain rows I need to look back at results from earlier rows. Here is an approximate example:
library(dplyr)
d <- data_frame(trial = rep(c("A","a","b","B","x","y"),2))
d <- d %>%
mutate(cond = rep('', n()), num = as.integer(rep(0,n())))
for (i in 1:nrow(d)){
if(d$trial[i] == "A"){
d$num[i] <- 0
d$cond[i] <- "A"
}
else if(d$trial[i] == "B"){
d$num[i] <- 0
d$cond[i] <- "B"
}
else{
d$num[i] <- d$num[i-1] +1
d$cond[i] <- d$cond[i-1]
}
}
The resulting data-frame looks like
> d
Source: local data frame [12 x 3]
trial cond num
1 A A 0
2 a A 1
3 b A 2
4 B B 0
5 x B 1
6 y B 2
7 A A 0
8 a A 1
9 b A 2
10 B B 0
11 x B 1
12 y B 2
What is the proper way of doing this using dplyr?
dplyr-only solution:
d %>%
group_by(i=cumsum(trial %in% c('A','B'))) %>%
mutate(cond=trial[1],num=seq(n())-1) %>%
ungroup() %>%
select(-i)
# trial cond num
# 1 A A 0
# 2 a A 1
# 3 b A 2
# 4 B B 0
# 5 x B 1
# 6 y B 2
# 7 A A 0
# 8 a A 1
# 9 b A 2
# 10 B B 0
# 11 x B 1
# 12 y B 2
Try
d %>%
mutate(cond = zoo::na.locf(ifelse(trial=="A"|trial=="B", trial, NA))) %>%
group_by(id=rep(1:length(rle(cond)$values), rle(cond)$lengths)) %>%
mutate(num = 0:(n()-1)) %>% ungroup %>%
select(-id)
Here is one way. The first thing was to add A or B in cond using ifelse(). Then I employed na.locf() from the zoo package to fill each NA with the last A or B seen. Before taking care of num, I assigned a temporary group ID by borrowing rleid() from the data.table package. Grouping the data by that temporary ID (i.e., foo), I used row_number(), one of the window functions in the dplyr package, to number rows within each run. Note that select(-foo) will not remove the column here: foo is a grouping variable, and dplyr keeps grouping variables in the result; ungroup() first if you want to drop it.
library(zoo)
library(dplyr)
library(data.table)
d <- data_frame(trial = rep(c("A","a","b","B","x","y"),2))
mutate(d, cond = ifelse(trial == "A" | trial == "B", trial, NA),
cond = na.locf(cond),
foo = rleid(cond)) %>%
group_by(foo) %>%
mutate(num = row_number() - 1)
# trial cond foo num
#1 A A 1 0
#2 a A 1 1
#3 b A 1 2
#4 B B 2 0
#5 x B 2 1
#6 y B 2 2
#7 A A 3 0
#8 a A 3 1
#9 b A 3 2
#10 B B 4 0
#11 x B 4 1
#12 y B 4 2
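Following the note above, dropping foo requires ungrouping first — assuming the pipeline result above is saved as res (not named in the original):
res %>% ungroup() %>% select(-foo)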