The data set contains three variables: id, sex, and grade (factor).
mydata <- data.frame(
  id    = c(1,1,1,1,1, 2,2,2,2, 3,3,3,3, 4,4,4,4,4),
  sex   = c(1,1,1,1,1, 0,0,0,0, 0,0,0,0, 1,1,1,1,1),
  grade = c("a","b","c","d","e", "x","y","y","x", "q","q","q","q", "a","a","a",NA,"b")
)
For each id, I need to count how many unique grades there are and then create a new column (call it N) to record that frequency. For instance, for id = 1 we have five unique values of grade, so N = 5; for id = 2 we have two unique values, so N = 2; for id = 4 we have two unique values (ignoring NA), so N = 2.
The final data set is:
mydata <- data.frame(
  id    = c(1,1,1,1,1, 2,2,2,2, 3,3,3,3, 4,4,4,4,4),
  sex   = c(1,1,1,1,1, 0,0,0,0, 0,0,0,0, 1,1,1,1,1),
  grade = c("a","b","c","d","e", "x","y","y","x", "q","q","q","q", "a","a","a",NA,"b")
)
mydata$N <- c(5,5,5,5,5, 2,2,2,2, 1,1,1,1, 2,2,2,2,2)
New answer:
The uniqueN function of data.table has an na.rm argument, which we can use as follows:
library(data.table)
setDT(mydata)[, n := uniqueN(grade, na.rm = TRUE), by = id]
which gives:
> mydata
id sex grade n
1: 1 1 a 5
2: 1 1 b 5
3: 1 1 c 5
4: 1 1 d 5
5: 1 1 e 5
6: 2 0 x 2
7: 2 0 y 2
8: 2 0 y 2
9: 2 0 x 2
10: 3 0 q 1
11: 3 0 q 1
12: 3 0 q 1
13: 3 0 q 1
14: 4 1 a 2
15: 4 1 a 2
16: 4 1 a 2
17: 4 1 NA 2
18: 4 1 b 2
Old answer:
With data.table you could do this as follows:
library(data.table)
setDT(mydata)[, n := uniqueN(grade[!is.na(grade)]), by = id]
or:
setDT(mydata)[, n := uniqueN(na.omit(grade)), by = id]
You could use the package data.table:
library(data.table)
setDT(mydata)
# I have removed NAs; up to you how to count them
mydata[, N_u := length(unique(grade[!is.na(grade)])), by = id]
Very short, readable, and fast. It can also be done in base R:
# lapply(split(grade, id), ...): splits the data into subsets by id
# unlist: creates one vector out of multiple vectors
# rep: makes sure each id is repeated enough times
mydata$N <- unlist(lapply(split(mydata$grade, mydata$id), function(x) {
  rep(length(unique(x[!is.na(x)])), length(x))
}))
Because there was discussion on what is faster, let's do some benchmarking.
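The benchmark code itself isn't shown above; a minimal sketch of how it could be set up with the microbenchmark package (the labels length_unique and uniqueN mirror the result tables below):
library(data.table)
library(microbenchmark)
setDT(mydata)
test1 <- microbenchmark(
  length_unique = mydata[, N_u := length(unique(grade[!is.na(grade)])), by = id],
  uniqueN       = mydata[, n := uniqueN(grade, na.rm = TRUE), by = id],
  times = 100
)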
On the given dataset:
> test1
Unit: milliseconds
expr min lq mean median uq max neval cld
length_unique 3.043186 3.161732 3.422327 3.286436 3.477854 10.627030 100 b
uniqueN 2.481761 2.615190 2.763192 2.738354 2.872809 3.985393 100 a
On a larger dataset (10,000 observations, 1,000 ids):
> test2
Unit: milliseconds
expr min lq mean median uq max neval cld
length_unique 11.84123 24.47122 37.09234 30.34923 47.55632 97.63648 100 a
uniqueN 25.83680 50.70009 73.78757 62.33655 97.33934 210.97743 100 b
A dplyr option that makes use of dplyr::n_distinct and its na.rm argument:
library(dplyr)
mydata %>% group_by(id) %>% mutate(N = n_distinct(grade, na.rm = TRUE))
#Source: local data frame [18 x 4]
#Groups: id [4]
#
# id sex grade N
# (dbl) (dbl) (fctr) (int)
#1 1 1 a 5
#2 1 1 b 5
#3 1 1 c 5
#4 1 1 d 5
#5 1 1 e 5
#6 2 0 x 2
#7 2 0 y 2
#8 2 0 y 2
#9 2 0 x 2
#10 3 0 q 1
#11 3 0 q 1
#12 3 0 q 1
#13 3 0 q 1
#14 4 1 a 2
#15 4 1 a 2
#16 4 1 a 2
#17 4 1 NA 2
#18 4 1 b 2
Looks like we have several votes for data.table, but you could also use the base R function ave():
mydata$N <- ave(as.character(mydata$grade),mydata$id,
FUN = function(x) length(unique(x[!is.na(x)])))
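One caveat worth adding (not part of the original answer): because ave() is fed a character vector here, the new N column is character, not numeric. If you need a numeric count, wrap the call in as.numeric():
mydata$N <- as.numeric(ave(as.character(mydata$grade), mydata$id,
                           FUN = function(x) length(unique(x[!is.na(x)]))))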
Use tapply and a lookup table:
mydata <- data.frame(id=c(1,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,4),
sex=c(1,1,1,1,1,0,0,0,0,0,0,0,0,1,1,1,1,1),
grade=c("a","b","c","d","e", "x","y","y","x", "q",
"q","q","q", "a", "a", "a", NA, "b"))
uniqN <- tapply(mydata$grade, mydata$id, function(x) sum(!is.na(unique(x))))
mydata$N <- uniqN[mydata$id]
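A small robustness note (my addition): uniqN[mydata$id] works here only because the ids happen to be the consecutive integers 1 to 4, so positional indexing and name lookup coincide. For arbitrary id values, indexing the lookup table by name is safer:
mydata$N <- uniqN[as.character(mydata$id)]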
Here is a dplyr method. I kept the summary table separate for tidy reasons.
library(dplyr)
summary <-
  mydata %>%
  distinct(id, grade) %>%
  filter(!is.na(grade)) %>%
  count(id)
mydata %>%
left_join(summary)
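As written, left_join() guesses the key and the count column keeps count()'s default name n; to match the N column asked for in the question and silence the join message, you could be explicit (a minor tweak, not in the original answer):
mydata %>%
  left_join(summary, by = "id") %>%
  rename(N = n)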
C1 C2
------
a 11
a 2
a 2
b 2
b 34
c 2
c 4
c 1
d 4
How can I get the index of a group name's first occurrence?
For example, in column C1 the first occurrence of 'b' is at index 4.
Like that, I need the indexes of the first occurrence of every group.
With the data.table package, you can get it with .I:
as.data.table(dtt)[, .(index = .I[1]), by = .(C1)]
# C1 index
# 1: a 1
# 2: b 4
# 3: c 6
# 4: d 9
If only the indices are needed:
which(!duplicated(dtt$C1))
[1] 1 4 6 9
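If you also want the group labels attached to those indices, one possible variant (my addition) names the result with the corresponding first values:
first_idx <- which(!duplicated(dtt$C1))
setNames(first_idx, dtt$C1[first_idx])
# a b c d
# 1 4 6 9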
library(dplyr)
df <- data.frame(C1 = c("a","a","a","b","b","c","c","c","d"),
C2 = c(11,2,2,2,34,2,4,1,4))
df %>%
mutate(r_number = row_number()) %>%
group_by(C1) %>%
summarise(index = min(r_number))
#> # A tibble: 4 x 2
#> C1 index
#> <chr> <int>
#> 1 a 1
#> 2 b 4
#> 3 c 6
#> 4 d 9
Try tapply + head, as below:
with(
df,
tapply(seq_along(C1), C1, head, 1)
)
which gives
a b c d
1 4 6 9
Or we can use aggregate
> aggregate(cbind(idx = seq_along(C1)) ~ C1, df, head, 1)
C1 idx
1 a 1
2 b 4
3 c 6
4 d 9
To add to the already present answers, with base R, using tapply:
dt$I <- 1:nrow(dt)
tapply(dt$I, dt$C1, function(x) x[1])
a b c d
1 4 6 9
If you want two columns, the group and the index, with dplyr you could use cur_group_rows(), the equivalent of .I in data.table; see https://dplyr.tidyverse.org/reference/context.html?q=grp#data-table
dt %>%
  group_by(C1) %>%
  summarise(index = cur_group_rows()[1])
# A tibble: 4 x 2
C1 index
<fct> <int>
1 a 1
2 b 4
3 c 6
4 d 9
A bit of comparison:
Only the index:
denis = function(){
tapply(dt$I, dt$C1, function(x) x[1])
}
mt1022 = function(){
which(!duplicated(dt$C1))
}
library(microbenchmark)
microbenchmark(mt1022(), denis())
Unit: microseconds
expr min lq mean median uq max neval cld
mt1022() 19.5 23.7 46.705 29.9 48.9 525.2 100 a
denis() 61.7 66.0 124.323 89.5 133.1 735.3 100 b
#mt1022 method is much faster
If you want the two-column table:
library(dplyr)
library(data.table)
mt1022_datatable = function(){
as.data.table(dt)[, .(index = .I[1]), by = .(C1)]
}
jmpivette = function(){
dt %>%
mutate(r_number = row_number()) %>%
group_by(C1) %>%
summarise(r_number[1])
}
denis_dplyr = function(){
dt %>%
group_by(C1) %>%
summarise(index = cur_group_rows()[1])
}
microbenchmark(mt1022_datatable(),jmpivette(),denis_dplyr())
Unit: milliseconds
expr min lq mean median uq max neval cld
mt1022_datatable() 1.4469 1.72520 2.234030 2.01225 2.30720 8.9519 100 a
jmpivette() 6.6528 7.31915 10.029003 7.94435 8.89835 56.7763 100 c
denis_dplyr() 4.4943 4.92120 7.057608 5.38290 6.13925 41.9592 100 b
Here you see the advantage of data.table.
Data:
dt <- read.table(text = "C1 C2
a 11
a 2
a 2
b 2
b 34
c 2
c 4
c 1
d 4
",header = T)
Using ave
with(df, which(as.logical(ave(seq_along(C1), C1,
FUN = function(x) x == x[1]))))
#[1] 1 4 6 9
Here is my data:
df1<-read.table(text= "Group
11Z-23456
12B-10000
22M-2000
12M-1100
33G-100",header=TRUE)
I want to get this data:
A B C Code
1 1 Z 23456
1 2 B 10000
2 2 M 2000
1 2 M 1100
3 3 G 100
As you can see from my data, I want to separate the values in the Group column. This is just a small sample; please consider that the real data set is large.
Here is base R code that you can try:
l <- strsplit(as.character(df1$Group), split = "-")
dfout <- setNames(data.frame(t(mapply(c,
                                      strsplit(sapply(l, `[[`, 1), split = ""),
                                      lapply(l, `[[`, 2)))),
                  c("A", "B", "C", "Code"))
or using the tidyr package:
library(tidyr)
df1 %>%
  separate(Group, c("X", "Code"), sep = "-") %>%
  separate(X, c("A", "B", "C"), sep = 1:2)
such that
> dfout
A B C Code
1 1 1 Z 23456
2 1 2 B 10000
3 2 2 M 2000
4 1 2 M 1100
5 3 3 G 100
Using data.table:
library(data.table)
setDT(df1)
df1[, c("ABC", "Code") := tstrsplit(Group, "-")]
df1[, c("A", "B", "C") := tstrsplit(ABC, "")]
df1[, c("ABC", "Group") := NULL]
df1
# Code A B C
# 1: 23456 1 1 Z
# 2: 10000 1 2 B
# 3: 2000 2 2 M
# 4: 1100 1 2 M
# 5: 100 3 3 G
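Note the column order differs from the target output because Group was dropped after the split. If the order matters, data.table's setcolorder() rearranges the columns by reference (my addition):
setcolorder(df1, c("A", "B", "C", "Code"))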
I have a DF like this:
ID Product
1 1
1 2
2 2
2 3
2 4
3 1
3 2
4 3
Now I need a way to do this in R, like here:
finding products that customers bought together
My output should look like this:
ProductX ProductY Times
1 2 2
2 3 1
3 4 1
A matrix output like this would also be nice:
[,1] [,2] [,3] [,4]
[1,] 1 2 0 0
[2,] 2 1 1 0
[3,] 0 0 1 1
[4,] 0 0 1 1
I have tried it with the reshape2 package, but I don't know how to get this output format.
Here's an answer using data.table and a non-equi self-join. Edit: added allow.cartesian = TRUE to allow it to work with large data sets.
library(data.table)
dt[dt
, on = .(ID = ID, Product < Product)
, .(ProductX = x.Product, ProductY = i.Product)
, nomatch = 0L
, allow.cartesian = TRUE
][, .N, by = .(ProductX, ProductY)]
ProductX ProductY N
1: 1 2 2
2: 2 3 1
3: 2 4 1
4: 3 4 1
A dplyr equivalent:
library(dplyr)
inner_join(tib, tib, by = 'ID')%>%
filter(Product.x < Product.y)%>%
count(Product.x, Product.y)
Product.x Product.y n
<dbl> <dbl> <int>
1 1 2 2
2 2 3 1
3 2 4 1
4 3 4 1
And here's a base R version as well:
aggregate(ID ~ Product.x + Product.y
, data = merge(df, df, by = 'ID')
, subset = Product.x < Product.y
, FUN = length)
# Need to change the names from ID to n
Product.x Product.y ID
1 1 2 2
2 2 3 1
3 2 4 1
4 3 4 1
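As the comment says, the count column is still called ID. Assuming the result is stored in a variable res (my naming), renaming it is a one-liner:
res <- aggregate(ID ~ Product.x + Product.y
                 , data = merge(df, df, by = 'ID')
                 , subset = Product.x < Product.y
                 , FUN = length)
names(res)[names(res) == "ID"] <- "n"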
Performance:
Unit: milliseconds
expr min lq mean median uq max neval
dt_way 3.9149 4.29330 4.593209 4.6597 4.80210 6.2326 100
dplyr_inner_join 1.8218 1.91510 2.058864 2.0572 2.16205 3.0157 100
dplyr_tidyr 13.8107 14.15735 16.020262 14.3571 14.78975 127.9654 100
base_agg 2.3393 2.51215 2.586652 2.5804 2.63865 3.4415 100
And with a larger simulated data set:
n_IDs <- 1E3
n_Sims <- 1E5
ID_big <- sample(1:n_IDs, n_Sims, replace = TRUE)
Product_big <- sample(1:n_Sims, n_Sims, replace = TRUE)
Unit: seconds
expr min lq mean median uq max neval
dt_way 1.633111 1.904460 1.998192 1.986452 2.110937 2.308671 10
dplyr_inner_join 5.606322 6.361026 6.574015 6.606423 6.839273 7.198770 10
dplyr_tidyr 8.385418 9.350730 10.127512 10.372830 10.675809 11.462403 10
Data:
ID <- c(1,1,2,2,2,3,3,4)
Product <- c(1,2,2,3,4,1,2,3)
dt <- data.table(ID, Product)
tib <- tibble(ID, Product)
df <- data.frame(ID, Product)
A different dplyr and tidyr possibility could be:
df %>%
group_by(ID) %>%
expand(Product, Product) %>%
filter(Product < Product1) %>%
ungroup() %>%
count(Product, Product1)
Product Product1 n
<int> <int> <int>
1 1 2 2
2 2 3 1
3 2 4 1
4 3 4 1
However, I'm not sure what the expected output is for IDs that bought the same pair of products together (if such a scenario can occur) on multiple occasions.
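If such repeat purchases should only count once per ID, one option (my suggestion, assuming that interpretation) is to drop duplicate ID/Product rows with distinct() before expanding:
df %>%
  distinct(ID, Product) %>%
  group_by(ID) %>%
  expand(Product, Product) %>%
  filter(Product < Product1) %>%
  ungroup() %>%
  count(Product, Product1)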
Here is one option using dplyr and tidyr. We group_by ID and create a list of combinations, taking the Product values a pair at a time. For IDs that have only one Product we remove the resulting NA values, and finally we count the combinations of Product.
library(dplyr)
library(tidyr)
df %>%
group_by(ID) %>%
summarise(new_col = if (n() == 1) list(as.character(Product)) else
list(combn(sort(Product), 2, paste0, collapse = ","))) %>%
unnest() %>%
separate(new_col, c("ProductX", "ProductY"), sep = ",", fill = "right") %>%
na.omit %>%
count(ProductX, ProductY)
# A tibble: 4 x 3
# ProductX ProductY n
# <chr> <chr> <int>
#1 1 2 2
#2 2 3 1
#3 2 4 1
#4 3 4 1
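None of the answers above produce the matrix form the question also asked about. A minimal base R sketch (my addition) builds a product-by-product co-occurrence matrix with crossprod() on the ID-by-Product incidence table; note its diagonal counts how many IDs bought each product, which differs slightly from the hand-written example in the question:
crossprod(table(df$ID, df$Product))
#     1 2 3 4
#   1 2 2 0 0
#   2 2 3 1 1
#   3 0 1 2 1
#   4 0 1 1 1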
How can I create a new data frame with the smallest values, grouped by a column?
For example this df:
df <- read.table(header = TRUE, text = 'Gene Value
A 12
A 10
B 3
B 0
B 6
C 1
D 0
D 4')
Now with:
test <- setDT(df)[, .SD[which.min(Value)], by=Gene]
I get this:
> test
Gene Value
1: A 10
2: B 0
3: C 1
4: D 0
But how can I use a second condition for Value > 0 here? I want to have this output:
> test
Gene Value
1: A 10
2: B 3
3: C 1
4: D 4
Could do:
setDT(df)[, .(Value = min(Value[Value > 0])), by=Gene]
Output:
Gene Value
1: A 10
2: B 3
3: C 1
4: D 4
Using the tidyverse, you can group, filter, and then summarize the minimum value:
library(tidyverse)
df2 <- df %>%
group_by(Gene) %>%
filter(Value != 0) %>%
summarise(Value = min(Value))
# A tibble: 4 x 2
Gene Value
<fct> <dbl>
1 A 10
2 B 3
3 C 1
4 D 4
Using aggregate from base R
aggregate(Value ~ Gene, subset(df, Value > 0), min)
# Gene Value
#1 A 10
#2 B 3
#3 C 1
#4 D 4
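One edge case worth flagging for all three approaches (my note): if some Gene had no Value greater than 0, min() on the empty vector would return Inf with a warning, and the subset/filter versions would drop that Gene entirely. A defensive base R sketch that returns NA instead:
vals <- split(df$Value, df$Gene)
sapply(vals, function(v) if (any(v > 0)) min(v[v > 0]) else NA)
#  A  B  C  D
# 10  3  1  4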
I have a dataframe (might not be sorted like this) that looks like this:
Group Value
A 1
A 5
A 6
A 11
B 3
B 4
B 5
B 10
And now I want a new column that counts, per Group, how many rows have a value falling within a fixed range of each row's value (say, for this example, between 2 less than the current row's value and the value itself, inclusive). So the result would be:
Group Value New Count
A 1 1 (because there is only 1 row in Group A between -1 and 1, this row)
A 5 1 (because there is only 1 row in Group A between 3 and 5, this row)
A 6 2 (because there are 2 rows in Group A between 4 and 6) ... and so on
A 11 1
B 3 1
B 4 2
B 5 3
B 10 1
I have seen some answers with respect to running-total counters within a group, etc., but I haven't come across this situation in my searching on SO...
Another approach is to use a non-equi join and group on the join conditions:
library(data.table)
delta <- 2  # the fixed range width from the question
setDT(DF)[, New.Count := .SD[.(Group = Group, V1 = Value, V2 = Value - delta),
          on = .(Group, Value <= V1, Value >= V2), .N, by = .EACHI]$N][]
Group Value New.Count
1: A 1 1
2: A 5 1
3: A 6 2
4: A 11 1
5: B 3 1
6: B 4 2
7: B 5 3
8: B 10 1
Data
library(data.table)
DF <- fread(
" Group Value
A 1
A 5
A 6
A 11
B 3
B 4
B 5
B 10"
)
I found a way by looping; I'm not sure how to do it otherwise:
Df <- data.frame(Value = c(1, 5, 8, 11, 3, 4, 5, 10),
                 Group = c("A", "A", "A", "A", "B", "B", "B", "B"))
for (i in 1:nrow(Df)) {
  Df$newcount[i] <- sum(Df$Value <= Df$Value[i] & Df$Value >= Df$Value[i] - 2 &
                        Df$Group == Df$Group[i])
}
It loops over each row and counts the rows meeting the conditions you described: a value between the row's value and that value minus 2, within the same group.
I was looking for a data.table way but didn't manage it.
The output:
Value Group newcount
1 1 A 1
2 5 A 1
3 8 A 1
4 11 A 1
5 3 B 1
6 4 B 2
7 5 B 3
8 10 B 1
Based on what you started (as mentioned in your comment), here is a loop to do it:
df <- data.frame(Group = c(rep("A", 4), rep("B", 4)),
Value = c(1, 5, 6, 11, 3, 4, 5, 10))
require(dplyr)
for(i in seq_along(df$Value)){
df$NewCount[i] <- nrow(df %>% filter(Group == Group[i] &
Value <= Value[i] &
Value >= Value[i]-2))
}
You can achieve this with purrr, but maybe there is a more succinct way. We first create a new variable holding the range we will search. Next we wrap the group's values in a list so they can be compared against each search range. For the result we sum the count of all values which fall into the search range. We can wrap this in a function and re-use it in a convenient way.
library(tidyverse)
find_counts <- function(x, range = 2) {
  # for each value, the window [value - range, value]
  search_range <- map(x, ~seq(.x - range, .x, 1))
  # wrap the group's values so map2 recycles them against each window
  unique_vals <- list(x)
  map2_int(unique_vals, search_range, ~sum(.x %in% .y))
}
Df %>%
group_by(Group) %>%
mutate(result = find_counts(Value))
#> # A tibble: 8 x 3
#> # Groups: Group [2]
#> Group Value result
#> <fctr> <int> <dbl>
#> 1 A 1 1
#> 2 A 5 1
#> 3 A 8 1
#> 4 A 11 1
#> 5 B 3 1
#> 6 B 4 2
#> 7 B 5 3
#> 8 B 10 1
Results from microbenchmark::microbenchmark with the following data:
set.seed(928374)
DF <- data.frame(Group = sample(letters[1:15], 500, replace = T),
Value = sample(1:10, 500, replace = T))
Unit: milliseconds
expr min lq mean median uq max neval cld
ANG 1607.59370 1645.93364 1776.582546 1709.976584 1822.011283 2603.61574 30 c
ThomasK 15.30110 16.11919 19.040010 17.238959 19.550713 54.30369 30 a
denis 155.92567 165.73500 182.563020 171.147209 204.508171 253.26394 30 b
uwe 2.15669 2.46198 3.207837 2.570449 3.114574 13.28832 30 a
Data
Df <- read.table(text = " Group Value
A 1
A 5
A 8
A 11
B 3
B 4
B 5
B 10", header = T)
Only base R:
count_in_range = function(x){
delta = 2
vapply(x,
FUN = function(value) sum(x>=(value - delta) & x<=value, na.rm = TRUE),
FUN.VALUE = numeric(1)
)
}
dfs$newcount = ave(dfs$Value, dfs$Group, FUN = count_in_range)
dfs
# Group Value newcount
# 1 A 1 1
# 2 A 5 1
# 3 A 6 2
# 4 A 11 1
# 5 B 3 1
# 6 B 4 2
# 7 B 5 3
# 8 B 10 1
Benchmark with data.table:
set.seed(928374)
DF <- data.frame(Group = sample(letters[1:15], 500, replace = T),
Value = sample(1:10, 500, replace = T))
library(data.table)
library(microbenchmark)
DT = as.data.table(DF)
delta = 2
microbenchmark(
datatable = {
DT[, New.Count := .SD[.(Group = Group, V1 = Value, V2 = Value - delta),
on = .(Group, Value <= V1, Value >= V2), .N, by = .EACHI]$N][]
},
ave = {
DF$newcount = ave(DF$Value, DF$Group, FUN = count_in_range)
}
)
# Unit: microseconds
# expr min lq mean median uq max neval
# datatable 1424.814 1438.3355 1492.9422 1459.2175 1512.100 1914.575 100
# ave 712.708 737.1955 849.0507 756.7265 789.327 3583.369 100
all.equal(DF$newcount, DT$New.Count) # TRUE