how to get index of first occurrence of group in a column? - r

C1 C2
------
a 11
a 2
a 2
b 2
b 34
c 2
c 4
c 1
d 4
How can I get the index of the first occurrence of each group name?
For example: in column C1 the first occurrence of 'b' is at index 4.
Like that, I need to get the indexes of the first occurrence of every group.

With data.table package, you can get it with .I:
as.data.table(dtt)[, .(index = .I[1]), by = .(C1)]
# C1 index
# 1: a 1
# 2: b 4
# 3: c 6
# 4: d 9
If only the indices are needed:
which(!duplicated(dtt$C1))
[1] 1 4 6 9
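If you also want the matching group labels, a small base R sketch building on the same which(!duplicated(...)) idea:
idx <- which(!duplicated(dtt$C1))
data.frame(C1 = dtt$C1[idx], index = idx)
#   C1 index
# 1  a     1
# 2  b     4
# 3  c     6
# 4  d     9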

library(dplyr)
df <- data.frame(C1 = c("a","a","a","b","b","c","c","c","d"),
C2 = c(11,2,2,2,34,2,4,1,4))
df %>%
mutate(r_number = row_number()) %>%
group_by(C1) %>%
summarise(index = min(r_number))
#> # A tibble: 4 x 2
#> C1 index
#> <chr> <int>
#> 1 a 1
#> 2 b 4
#> 3 c 6
#> 4 d 9

Try tapply + head like below
with(
df,
tapply(seq_along(C1), C1, head, 1)
)
which gives
a b c d
1 4 6 9
Or we can use aggregate
> aggregate(cbind(idx = seq_along(C1)) ~ C1, df, head, 1)
C1 idx
1 a 1
2 b 4
3 c 6
4 d 9

To add to the already present answers, with base R, using tapply:
dt$I <- 1:nrow(dt)
tapply(dt$I, dt$C1, function(x) x[1])
a b c d
1 4 6 9
If you want two columns, the group and the index, with dplyr you could use cur_group_rows, the equivalent of .I in data.table; see https://dplyr.tidyverse.org/reference/context.html?q=grp#data-table
dt %>%
group_by(C1) %>%
summarise(index = cur_group_rows()[1])
# A tibble: 4 x 2
C1 index
<fct> <int>
1 a 1
2 b 4
3 c 6
4 d 9
A bit of comparison:
Only the index:
denis = function(){
tapply(dt$I, dt$C1, function(x) x[1])
}
mt1022 = function(){
which(!duplicated(dt$C1))
}
microbenchmark(mt1022(),denis())
Unit: microseconds
expr min lq mean median uq max neval cld
mt1022() 19.5 23.7 46.705 29.9 48.9 525.2 100 a
denis() 61.7 66.0 124.323 89.5 133.1 735.3 100 b
# mt1022's method is much faster
If you want the two-column table:
library(dplyr)
library(data.table)
mt1022_datatable = function(){
as.data.table(dt)[, .(index = .I[1]), by = .(C1)]
}
jmpivette = function(){
dt %>%
mutate(r_number = row_number()) %>%
group_by(C1) %>%
summarise(r_number[1])
}
denis_dplyr = function(){
dt %>%
group_by(C1) %>%
summarise(index = cur_group_rows()[1])
}
microbenchmark(mt1022_datatable(),jmpivette(),denis_dplyr())
Unit: milliseconds
expr min lq mean median uq max neval cld
mt1022_datatable() 1.4469 1.72520 2.234030 2.01225 2.30720 8.9519 100 a
jmpivette() 6.6528 7.31915 10.029003 7.94435 8.89835 56.7763 100 c
denis_dplyr() 4.4943 4.92120 7.057608 5.38290 6.13925 41.9592 100 b
Here you see the advantage of data.table
data:
dt <- read.table(text = "C1 C2
a 11
a 2
a 2
b 2
b 34
c 2
c 4
c 1
d 4
",header = T)

Using ave
with(df, which(as.logical(ave(seq_along(C1), C1,
FUN = function(x) x == x[1]))))
#[1] 1 4 6 9


Counting which products have been bought together

I have a DF like this:
ID Product
1 1
1 2
2 2
2 3
2 4
3 1
3 2
4 3
Now I need something in R like here:
finding products that customers bought together
My output should look like this:
ProductX ProductY Times
1 2 2
2 3 1
3 4 1
A matrix output like this would also be nice:
[,1] [,2] [,3] [,4]
[1,] 1 2 0 0
[2,] 2 1 1 0
[3,] 0 0 1 1
[4,] 0 0 1 1
I have tried it with the reshape2 package, but I don't know how to get this output format.
Here's an answer using data.table and a non-equi self-join. Edit: added allow.cartesian = TRUE to allow it to work with large datasets.
library(data.table)
dt[dt
, on = .(ID = ID, Product < Product)
, .(ProductX = x.Product, ProductY = i.Product)
, nomatch = 0L
, allow.cartesian = TRUE
][, .N, by = .(ProductX, ProductY)]
ProductX ProductY N
1: 1 2 2
2: 2 3 1
3: 2 4 1
4: 3 4 1
Dplyr equivalent:
library(dplyr)
inner_join(tib, tib, by = 'ID')%>%
filter(Product.x < Product.y)%>%
count(Product.x, Product.y)
Product.x Product.y n
<dbl> <dbl> <int>
1 1 2 2
2 2 3 1
3 2 4 1
4 3 4 1
And here's a base R version as well:
aggregate(ID ~ Product.x + Product.y
, data = merge(df, df, by = 'ID')
, subset = Product.x < Product.y
, FUN = length)
# Need to change the names from ID to n
Product.x Product.y ID
1 1 2 2
2 2 3 1
3 2 4 1
4 3 4 1
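As the comment above notes, the count column from aggregate keeps the name ID. One way to rename it, as a small sketch that stores the aggregate result in a hypothetical variable res:
res <- aggregate(ID ~ Product.x + Product.y
                 , data = merge(df, df, by = 'ID')
                 , subset = Product.x < Product.y
                 , FUN = length)
names(res)[names(res) == "ID"] <- "n"  # rename the count column from ID to n
res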
Performance:
Unit: milliseconds
expr min lq mean median uq max neval
dt_way 3.9149 4.29330 4.593209 4.6597 4.80210 6.2326 100
dplyr_inner_join 1.8218 1.91510 2.058864 2.0572 2.16205 3.0157 100
dplyr_tidyr 13.8107 14.15735 16.020262 14.3571 14.78975 127.9654 100
base_agg 2.3393 2.51215 2.586652 2.5804 2.63865 3.4415 100
n_IDs <- 1E3
n_Sims <- 1E5
ID_big <- sample(1:n_IDs, n_Sims, replace = TRUE)
Product_big <- sample(1:n_Sims, n_Sims, replace = TRUE)
Unit: seconds
expr min lq mean median uq max neval
dt_way 1.633111 1.904460 1.998192 1.986452 2.110937 2.308671 10
dplyr_inner_join 5.606322 6.361026 6.574015 6.606423 6.839273 7.198770 10
dplyr_tidyr 8.385418 9.350730 10.127512 10.372830 10.675809 11.462403 10
Data:
ID <- c(1,1,2,2,2,3,3,4)
Product <- c(1,2,2,3,4,1,2,3)
dt <- data.table(ID, Product)
tib <- tibble(ID, Product)
df <- data.frame(ID, Product)
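The question also asked for a matrix layout. Here is a minimal base R sketch, assuming the counted pairs from any of the approaches above are stored in a data frame called pair_counts with columns ProductX, ProductY and N; it fills the upper triangle of a product-by-product matrix, which may differ slightly from the exact layout shown in the question:
# Hypothetical pair counts, as produced by the approaches above
pair_counts <- data.frame(ProductX = c(1, 2, 2, 3),
                          ProductY = c(2, 3, 4, 4),
                          N        = c(2, 1, 1, 1))
# All product ids occurring in either column
prods <- sort(unique(c(pair_counts$ProductX, pair_counts$ProductY)))
# Empty square matrix, then fill the upper triangle with the counts
m <- matrix(0, nrow = length(prods), ncol = length(prods),
            dimnames = list(prods, prods))
m[cbind(match(pair_counts$ProductX, prods),
        match(pair_counts$ProductY, prods))] <- pair_counts$N
m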
A different dplyr and tidyr possibility could be:
df %>%
group_by(ID) %>%
expand(Product, Product1 = Product) %>%
filter(Product < Product1) %>%
ungroup() %>%
count(Product, Product1)
Product Product1 n
<int> <int> <int>
1 1 2 2
2 2 3 1
3 2 4 1
4 3 4 1
However, I'm not sure what the expected output should be for IDs that bought the same pair of products together on multiple occasions (if such a scenario can occur).
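If such repeat purchases should only be counted once per ID, one hedged option (a sketch, assuming the same df as above) is to drop duplicate ID/Product rows before expanding:
library(dplyr)
library(tidyr)
df %>%
distinct(ID, Product) %>%   # each ID/Product pair counted at most once
group_by(ID) %>%
expand(Product, Product1 = Product) %>%
filter(Product < Product1) %>%
ungroup() %>%
count(Product, Product1)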
Here is one option using dplyr and tidyr. We group_by ID and create a list of combinations, taking a pair of Product values at a time. For IDs that have only one Product, the resulting NA rows are removed, and finally we count the combinations of Product.
library(dplyr)
library(tidyr)
df %>%
group_by(ID) %>%
summarise(new_col = if (n() == 1) list(as.character(Product)) else
list(combn(sort(Product), 2, paste0, collapse = ","))) %>%
unnest() %>%
separate(new_col, c("ProductX", "ProductY"), sep = ",", fill = "right") %>%
na.omit %>%
count(ProductX, ProductY)
# A tibble: 4 x 3
# ProductX ProductY n
# <chr> <chr> <int>
#1 1 2 2
#2 2 3 1
#3 2 4 1
#4 3 4 1

Count unique elements in columns of dataframe

For the dataframe below, there are 59 columns:
circleid name birthday 56 more...
1 1 1
2 2 10
2 5 68
2 1 10
1 1 1
Result I want
circleid distinct_name distinct_birthday 56 more...
1 1 1
2 3 2
quiz <- read.csv("https://raw.githubusercontent.com/pranavn91/PhD/master/Expt/circles-removed-na.csv", header = T)
So far
ddply(quiz,~circleid,summarise,number_of_distinct_name=length(unique(name)))
This works for one column; how do I get it for the full dataframe?
columns <- colnames(quiz)
for (i in c(1:58))
{
final <- ddply(quiz,~circleid,summarise,number_of_distinct_name=length(unique(columns[i])))
}
With data.table you can run:
library(data.table)
quiz <- fread("https://raw.githubusercontent.com/pranavn91/PhD/master/Expt/circles-removed-na.csv", header = T)
unique_vals <- quiz[, lapply(.SD, uniqueN), by = circleid]
You can use dplyr:
result<-quiz%>%
group_by(circleid)%>%
summarise_all(n_distinct)
microbenchmark for data.table and dplyr:
microbenchmark(x1=quiz[, lapply(.SD, function(x) length(unique(x))), by = circleid],
x2=quiz%>%
group_by(circleid)%>%
summarise_all(n_distinct),times=100)
Unit: milliseconds
expr min lq mean median uq max neval cld
x1 150.06392 155.02227 158.75775 156.49328 158.38887 224.22590 100 b
x2 41.07139 41.90953 42.95186 42.54135 43.97387 49.91495 100 a
With package dplyr this is simple. The original answer had length(unique(.)), but @akrun pointed me to n_distinct(.) in a comment.
library(dplyr)
quiz %>%
group_by(circleid) %>%
summarise_all(n_distinct)
## A tibble: 2 x 3
#circleid name birthday
#<int> <int> <int>
# 1 1 1
# 2 3 2
Data.
quiz <- read.table(text = "
circleid name birthday
1 1 1
2 2 10
2 5 68
2 1 10
1 1 1
", header = TRUE)

How to sum time-series data rows by group?

My dataset looks like this:
block <- c(1,1,2,2,3,3,4,4)
treatment <- c(1,1,2,2,1,1,2,2)
type <- c("adult1","adult2","adult1","adult2","adult1","adult2","adult1","adult2")
t1 <- c(1,1,2,2,3,3,4,4)
t2 <- c(1,1,2,2,3,3,4,4)
t100 <- c(1,1,2,2,3,3,4,4)
df <- data.frame(block,treatment, type,t1,t2,t100)
I wish to sum the adults for each time point (t1, t2, t100) only with respect to the block. Here is what I want the final output to look like:
block <- c(1,2,3,4)
treatment <- c(1,2,1,2)
type <- c("adult","adult","adult","adult")
t1 <- c(2,4,6,8)
t2 <- c(2,4,6,8)
t100 <- c(2,4,6,8)
df <- data.frame(block, treatment, type, t1, t2, t100)
Here is my attempt using the aggregate function:
aggregate(df[,3:5], by = list(df$block), FUN = sum)
I get an error message saying that "arguments must be of the same length".
With aggregate you can use a formula to sum up t1:t100 and group by block and treatment:
df_final = aggregate(cbind(t1, t2, t100) ~ block + treatment, data = df, sum)
df_final$type1 = "adult"
Result:
block treatment t1 t2 t100 type1
1 1 1 2 2 2 adult
2 3 1 6 6 6 adult
3 2 2 4 4 4 adult
4 4 2 8 8 8 adult
Or you can do this with dplyr:
library(dplyr)
df %>%
group_by(block, treatment) %>%
summarize_at(vars(t1:t100), sum) %>%
mutate(type1 = "adult")
or
df %>%
group_by(block, treatment) %>%
summarize_at(vars(2:4), sum) %>%
mutate(type1 = "adult")
Result:
# A tibble: 4 x 6
# Groups: block [4]
block treatment t1 t2 t100 type1
<dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 1 1 2 2 2 adult
2 2 2 4 4 4 adult
3 3 1 6 6 6 adult
4 4 2 8 8 8 adult
You can also use data.table, which supports column indexing:
library(data.table)
setDT(df)[, lapply(.SD, sum), by=.(block, treatment), .SDcols=4:6]
Result:
block treatment t1 t2 t100
1: 1 1 2 2 2
2: 2 2 4 4 4
3: 3 1 6 6 6
4: 4 2 8 8 8
Solution in base R:
df <- cbind.data.frame(
aggregate(cbind(t1, t2, t100) ~ block + treatment, data = df, FUN = sum),
type = "adult");
# block treatment t1 t2 t100 type
#1 1 1 2 2 2 adult
#2 3 1 6 6 6 adult
#3 2 2 4 4 4 adult
#4 4 2 8 8 8 adult
Note: The key here is to cbind the relevant columns.
Benchmarking comparison
Below are the results from a microbenchmark of all three solutions (base R, dplyr, data.table).
# Sample dataframe with 2000 rows (1000 blocks)
df <- cbind.data.frame(
block = rep(seq(1, 1000), each = 2),
treatment = rep(c(1, 1, 2, 2), length.out = 250),
type = rep(c("adult1", "adult2"), length.out = 500),
t1 = rep(seq(1, 1000), each = 2),
t2 = rep(seq(1, 1000), each = 2),
t100 = rep(seq(1, 1000), each = 2));
# Benchmarking results
require(microbenchmark);
require(dplyr);
require(magrittr);
require(data.table);
microbenchmark(
baseR = cbind.data.frame(
aggregate(cbind(t1, t2, t100) ~ block + treatment, data = df, FUN = sum),
type = "adult"),
dplyr = df %>%
group_by(block, treatment) %>%
summarize_at(vars(t1:t100), sum) %>%
mutate(type1 = "adult"),
datatable = setDT(df)[, lapply(.SD, sum), by=.(block, treatment), .SDcols=4:6]
)
#Unit: microseconds
# expr min lq mean median uq max neval
# baseR 13817.627 14040.4835 14931.4202 14278.8220 15026.413 42347.511 100
# dplyr 6698.983 7076.6360 8459.7861 7240.1680 7486.245 73401.747 100
# datatable 463.837 500.6555 663.5425 576.3075 597.443 9015.664 100
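As an aside on the OP's original attempt: the by = list() form of aggregate also works once only the numeric time columns are selected. A minimal sketch, assuming df is still the plain data frame defined in the question:
# Group by block and treatment, summing only the numeric time columns
aggregate(df[, c("t1", "t2", "t100")],
          by = list(block = df$block, treatment = df$treatment),
          FUN = sum)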

R - Adding a count to a dataframe based on values in current row and other rows

I have a dataframe (might not be sorted like this) that looks like this:
Group Value
A 1
A 5
A 6
A 11
B 3
B 4
B 5
B 10
And now I want a new column that counts how many rows per Group that have a value that falls within a fixed range of the value in each row (let's say for this example that it has to be between 2 less than the current row's value and the actual value, inclusive). So the result would be
Group Value New Count
A 1 1 (because there is only 1 row in Group A between -1 and 1, this row)
A 5 1 (because there is only 1 row in Group A between 3 and 5, this row)
A 6 2 (because there are 2 rows in Group A between 4 and 6)..and so on
A 11 1
B 3 1
B 4 2
B 5 3
B 10 1
I have seen some answers with respect to running total counters within a group, etc, but I haven't come across this situation in my searching on SO...
Another approach is to use a non-equi join and group on the join conditions:
library(data.table)
delta <- 2  # count rows whose Value lies within [Value - delta, Value]
setDT(DF)[, New.Count := .SD[.(Group = Group, V1 = Value, V2 = Value - delta),
on = .(Group, Value <= V1, Value >= V2), .N, by = .EACHI]$N][]
Group Value New.Count
1: A 1 1
2: A 5 1
3: A 6 2
4: A 11 1
5: B 3 1
6: B 4 2
7: B 5 3
8: B 10 1
Data
library(data.table)
DF <- fread(
" Group Value
A 1
A 5
A 6
A 11
B 3
B 4
B 5
B 10"
)
I found a way by looping; I'm not sure how to do it otherwise:
Df <- data.frame(list(Value = c(1,5,8,11,3,4,5,10), Group = c("A","A","A","A","B","B","B","B")))
for (i in 1:dim(Df)[1]) {
  Df$newcount[i] <- sum(as.numeric(Df$Value <= Df$Value[i] &
                                   Df$Value >= Df$Value[i] - 2 &
                                   Df$Group == Df$Group[i]))
}
It loops over each row and counts the conditions you described: values between the current value and value - 2, within the same group.
I was looking for a data.table way but didn't manage it.
the output :
Value Group newcount
1 1 A 1
2 5 A 1
3 8 A 1
4 11 A 1
5 3 B 1
6 4 B 2
7 5 B 3
8 10 B 1
Based on what you started (as mentioned in your comment), here is a loop to do it:
df <- data.frame(Group = c(rep("A", 4), rep("B", 4)),
Value = c(1, 5, 6, 11, 3, 4, 5, 10))
require(dplyr)
for(i in seq_along(df$Value)){
df$NewCount[i] <- nrow(df %>% filter(Group == Group[i] &
Value <= Value[i] &
Value >= Value[i]-2))
}
You can achieve this with purrr, but maybe there is a more succinct way. We first create a new variable with the range we will search. Next we take all the values for the given group. For the result we sum the count of all values which fall into the search range. We can wrap this in a function and re-use it in a convenient way.
library(tidyverse)
find_counts <- function(x, range = 2) {
search_range <- map(x, ~seq(.x-range, .x, 1))
unique_vals <- list(x)
map2_int(unique_vals, search_range, ~sum(.x %in% .y))
}
Df %>%
group_by(Group) %>%
mutate(result = find_counts(Value))
#> # A tibble: 8 x 3
#> # Groups: Group [2]
#> Group Value result
#> <fctr> <int> <dbl>
#> 1 A 1 1
#> 2 A 5 1
#> 3 A 8 1
#> 4 A 11 1
#> 5 B 3 1
#> 6 B 4 2
#> 7 B 5 3
#> 8 B 10 1
Results from microbenchmark::microbenchmark with the following data:
set.seed(928374)
DF <- data.frame(Group = sample(letters[1:15], 500, replace = T),
Value = sample(1:10, 500, replace = T))
Unit: milliseconds
expr min lq mean median uq max neval cld
ANG 1607.59370 1645.93364 1776.582546 1709.976584 1822.011283 2603.61574 30 c
ThomasK 15.30110 16.11919 19.040010 17.238959 19.550713 54.30369 30 a
denis 155.92567 165.73500 182.563020 171.147209 204.508171 253.26394 30 b
uwe 2.15669 2.46198 3.207837 2.570449 3.114574 13.28832 30 a
Data
Df <- read.table(text = " Group Value
A 1
A 5
A 8
A 11
B 3
B 4
B 5
B 10", header = T)
Only base R (the question's data is assumed here under the name dfs):
# Question's data, named dfs in this answer
dfs = data.frame(Group = c(rep("A", 4), rep("B", 4)),
                 Value = c(1, 5, 6, 11, 3, 4, 5, 10))
count_in_range = function(x){
delta = 2
vapply(x,
FUN = function(value) sum(x>=(value - delta) & x<=value, na.rm = TRUE),
FUN.VALUE = numeric(1)
)
}
dfs$newcount = ave(dfs$Value, dfs$Group, FUN = count_in_range)
dfs
# Group Value newcount
# 1 A 1 1
# 2 A 5 1
# 3 A 6 2
# 4 A 11 1
# 5 B 3 1
# 6 B 4 2
# 7 B 5 3
# 8 B 10 1
Benchmark with data.table:
set.seed(928374)
DF <- data.frame(Group = sample(letters[1:15], 500, replace = T),
Value = sample(1:10, 500, replace = T))
library(data.table)
library(microbenchmark)
DT = as.data.table(DF)
delta = 2
microbenchmark(
datatable = {
DT[, New.Count := .SD[.(Group = Group, V1 = Value, V2 = Value - delta),
on = .(Group, Value <= V1, Value >= V2), .N, by = .EACHI]$N][]
},
ave = {
DF$newcount = ave(DF$Value, DF$Group, FUN = count_in_range)
}
)
# Unit: microseconds
# expr min lq mean median uq max neval
# datatable 1424.814 1438.3355 1492.9422 1459.2175 1512.100 1914.575 100
# ave 712.708 737.1955 849.0507 756.7265 789.327 3583.369 100
all.equal(DF$newcount, DT$New.Count) # TRUE

Frequency of rows by ID

The data set contains three variables: id, sex, and grade (factor).
mydata <- data.frame(id=c(1,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,4), sex=c(1,1,1,1,1,0,0,0,0,0,0,0,0,1,1,1,1,1),
grade=c("a","b","c","d","e", "x","y","y","x", "q","q","q","q", "a", "a", "a", NA, "b"))
For each ID, I need to see how many unique grades we have and then create a new column (call it N) to record the grade frequency. For instance, for ID=1, we have five unique values for "grade", so N = 5; for ID=2, we have two unique values for "grade", so N = 2; for ID=4, we have two unique values for "grade" (ignoring NA), so N = 2.
The final data set is
mydata <- data.frame(id=c(1,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,4), sex=c(1,1,1,1,1,0,0,0,0,0,0,0,0,1,1,1,1,1),
grade=c("a","b","c","d","e", "x","y","y","x", "q","q","q","q", "a", "a", "a", NA, "b"))
mydata$N <- c(5,5,5,5,5,2,2,2,2,1,1,1,1,2,2,2,2,2)
New answer:
The uniqueN function of data.table has a na.rm argument, which we can use as follows:
library(data.table)
setDT(mydata)[, n := uniqueN(grade, na.rm = TRUE), by = id]
which gives:
> mydata
id sex grade n
1: 1 1 a 5
2: 1 1 b 5
3: 1 1 c 5
4: 1 1 d 5
5: 1 1 e 5
6: 2 0 x 2
7: 2 0 y 2
8: 2 0 y 2
9: 2 0 x 2
10: 3 0 q 1
11: 3 0 q 1
12: 3 0 q 1
13: 3 0 q 1
14: 4 1 a 2
15: 4 1 a 2
16: 4 1 a 2
17: 4 1 NA 2
18: 4 1 b 2
Old answer:
With data.table you could do this as follows:
library(data.table)
setDT(mydata)[, n := uniqueN(grade[!is.na(grade)]), by = id]
or:
setDT(mydata)[, n := uniqueN(na.omit(grade)), by = id]
You could use the package data.table:
library(data.table)
setDT(mydata)
#I have removed NA's, up to you how to count them
mydata[,N_u:=length(unique(grade[!is.na(grade)])),by=id]
Very short, readable and fast. It can also be done in base R:
#lapply(split(grade,id),...: splits data into subsets by id
#unlist: creates one vector out of multiple vectors
#rep: makes sure each ID is repeated enough times
mydata$N <- unlist(lapply(split(mydata$grade,mydata$id),function(x){
rep(length(unique(x[!is.na(x)])),length(x))
}
))
Because there was discussion about what is faster, let's do some benchmarking.
With the given dataset:
> test1
Unit: milliseconds
expr min lq mean median uq max neval cld
length_unique 3.043186 3.161732 3.422327 3.286436 3.477854 10.627030 100 b
uniqueN 2.481761 2.615190 2.763192 2.738354 2.872809 3.985393 100 a
Larger dataset: (10000 observations, 1000 id's)
> test2
Unit: milliseconds
expr min lq mean median uq max neval cld
length_unique 11.84123 24.47122 37.09234 30.34923 47.55632 97.63648 100 a
uniqueN 25.83680 50.70009 73.78757 62.33655 97.33934 210.97743 100 b
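The benchmark calls themselves are not shown above; here is a minimal sketch of how test1 might be set up (the larger dataset would be generated analogously; the object names and details are assumptions, not the original code):
library(data.table)
library(microbenchmark)

dt1 <- as.data.table(mydata)  # copy for the length(unique(...)) approach
dt2 <- as.data.table(mydata)  # copy for the uniqueN approach

test1 <- microbenchmark(
  length_unique = dt1[, n := length(unique(grade[!is.na(grade)])), by = id],
  uniqueN       = dt2[, n := uniqueN(grade, na.rm = TRUE), by = id],
  times = 100
)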
A dplyr option that makes use of dplyr::n_distinct and its na.rm argument:
library(dplyr)
mydata %>% group_by(id) %>% mutate(N = n_distinct(grade, na.rm = TRUE))
#Source: local data frame [18 x 4]
#Groups: id [4]
#
# id sex grade N
# (dbl) (dbl) (fctr) (int)
#1 1 1 a 5
#2 1 1 b 5
#3 1 1 c 5
#4 1 1 d 5
#5 1 1 e 5
#6 2 0 x 2
#7 2 0 y 2
#8 2 0 y 2
#9 2 0 x 2
#10 3 0 q 1
#11 3 0 q 1
#12 3 0 q 1
#13 3 0 q 1
#14 4 1 a 2
#15 4 1 a 2
#16 4 1 a 2
#17 4 1 NA 2
#18 4 1 b 2
Looks like we have several votes for data.table, but you could also use the base R function ave():
mydata$N <- ave(as.character(mydata$grade),mydata$id,
FUN = function(x) length(unique(x[!is.na(x)])))
Use tapply and a lookup table:
mydata <- data.frame(id=c(1,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,4),
sex=c(1,1,1,1,1,0,0,0,0,0,0,0,0,1,1,1,1,1),
grade=c("a","b","c","d","e", "x","y","y","x", "q",
"q","q","q", "a", "a", "a", NA, "b"))
uniqN <- tapply(mydata$grade, mydata$id, function(x) sum(!is.na(unique(x))))
mydata$N <- uniqN[mydata$id]
Here is a dplyr method. I kept the summary table separate for tidy reasons.
library(dplyr)
summary =
mydata %>%
distinct(id, grade) %>%
filter(grade %>% is.na %>% `!`) %>%
count(id)
mydata %>%
left_join(summary)
