Related
a b #Encounter
1 112233 1
2 334455 1
1 112233 2
3 445566 1
2 334455 2
2 334455 3
3 445566 2
3 445566 3
3 445566 4
How would I calculate #Encounter, given columns a and b, in R?
The Excel code would be: =countifs(a(Range), a, b(Range), b)
An option in base R would be to use ave
# Running encounter number: rows are numbered 1, 2, 3, ... within each
# combination of a and b, in the order they appear in df1.
df1$Encounter <- ave(seq_len(nrow(df1)), df1$a, df1$b, FUN = seq_along)
df1$Encounter
#[1] 1 1 2 1 2 3 2 3 4
Or in data.table
library(data.table)
setDT(df1)[, Encounter := rowid(a, b)]
data
df1 <- structure(list(a = c(1L, 2L, 1L, 3L, 2L, 2L, 3L, 3L, 3L), b = c(112233L,
334455L, 112233L, 445566L, 334455L, 334455L, 445566L, 445566L,
445566L)), row.names = c(NA, -9L), class = "data.frame")
This question already has answers here:
Convert data from long format to wide format with multiple measure columns
(6 answers)
Closed 4 years ago.
I want to reshape a long dataframe to a wide one. That is, I want to go from this:
file label val1 val2
1 red A 12 3
2 red B 4 2
3 red C 5 8
4 green A 3 3
5 green B 6 5
6 green C 9 6
7 blue A 3 3
8 blue B 1 2
9 blue C 4 6
to this:
file value1_A value1_B value1_C value2_A value2_B value2_C
1 red 12 4 5 3 2 8
2 green 3 6 9 3 5 6
3 blue 3 1 4 3 2 6
My best attempt thus far is as follows:
library(tidyverse)
dat <-
structure(list(file = structure(c(3L, 3L, 3L, 2L, 2L, 2L, 1L, 1L, 1L),
.Label = c("blue", "green", "red"),
class = "factor"),
label = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L),
.Label = c("A", "B", "C"),
class = "factor"),
val1 = c(12L, 4L, 5L, 3L, 6L, 9L, 3L, 1L, 4L),
val2 = c(3L, 2L, 8L, 3L, 5L, 6L, 3L, 2L, 6L)),
class = "data.frame", row.names = c(NA, -9L))
dat %>%
group_by(file) %>%
mutate(values1 = paste('value1', label, sep='_'),
values2 = paste('value2', label, sep='_')) %>%
spread(values1, val1) %>%
spread(values2, val2) %>%
select(-label)
# # A tibble: 9 x 7
# # Groups: file [3]
# file value1_A value1_B value1_C value2_A value2_B value2_C
# <fct> <int> <int> <int> <int> <int> <int>
# 1 blue 3 NA NA 3 NA NA
# 2 blue NA 1 NA NA 2 NA
# 3 blue NA NA 4 NA NA 6
# 4 green 3 NA NA 3 NA NA
# 5 green NA 6 NA NA 5 NA
# 6 green NA NA 9 NA NA 6
# 7 red 12 NA NA 3 NA NA
# 8 red NA 4 NA NA 2 NA
# 9 red NA NA 5 NA NA 8
The output is unsatisfactory since what should be on one row occupies three, with multiple 'NA'. This seems to be due to using spread twice, but I don't know how else to achieve the result I desire. I'd very much appreciate any advice on how to do this.
Many thanks in advance,
-R
Here's a way
library(tidyverse)
# Reshape to one row per file: every val* column is spread across the
# values of `label`, producing val1_A, val1_B, ..., val2_C.
# pivot_wider() names the new columns <value column>_<label> by default and
# replaces the superseded gather()/spread() pair with a single step.
dat %>%
  pivot_wider(
    names_from = label,
    values_from = contains("val")
  )
here's the data.table way. all in one line of code...
library( data.table )
dcast( setDT( dat ), file ~ label, value.var = c("val1", "val2"))
# file val1_A val1_B val1_C val2_A val2_B val2_C
# 1: blue 3 1 4 3 2 6
# 2: green 3 6 9 3 5 6
# 3: red 12 4 5 3 2 8
I have this dataset:
ID Set Type Count
1 1 1 A NA
2 2 1 R NA
3 3 1 R NA
4 4 1 U NA
5 5 1 U NA
6 6 1 U NA
7 7 2 A NA
8 8 3 R NA
9 9 3 R NA
As dputs:
mystart <- structure(list(ID = 1:9, Set = c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
3L, 3L), Type = structure(c(1L, 2L, 2L, 3L, 3L, 3L, 1L, 2L, 2L
), .Label = c("A", "R", "U"), class = "factor"), Count = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("ID", "Set", "Type",
"Count"), class = "data.frame", row.names = c(NA, -9L))
By using dplyr package how can I obtain this:
ID Set Type Count
1 1 1 A A1
2 2 1 R A1R1
3 3 1 R A1R2
4 4 1 U A1R2U1
5 5 1 U A1R2U2
6 6 1 U A1R2U3
7 7 2 A A1
8 8 3 R R1
9 9 3 R R2
Again dputs:
myend <- structure(list(ID = 1:9, Set = c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
3L, 3L), Type = structure(c(1L, 2L, 2L, 3L, 3L, 3L, 1L, 2L, 2L
), .Label = c("A", "R", "U"), class = "factor"), Count = structure(c(1L,
2L, 3L, 4L, 5L, 6L, 1L, 7L, 8L), .Label = c("A1", "A1R1", "A1R2",
"A1R2U1", "A1R2U2", "A1R2U3", "R1", "R2"), class = "factor")), .Names = c("ID",
"Set", "Type", "Count"), class = "data.frame", row.names = c(NA,
-9L))
In short, I want to count the observations of the column "type" within column "set" and print this count(text) cumulatively.
After examining similar posts, I got close with this:
myend <- structure(list(ID = 1:9, Set = c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
3L, 3L), Type = structure(c(1L, 2L, 2L, 3L, 3L, 3L, 1L, 2L, 2L
), .Label = c("A", "R", "U"), class = "factor"), Count = c(1L,
1L, 2L, 1L, 2L, 3L, 1L, 1L, 2L)), .Names = c("ID", "Set", "Type",
"Count"), class = "data.frame", row.names = c(NA, -9L))
With the code:
library(dplyr)
myend <- read.table("mydata.txt", header=TRUE, fill=TRUE)
# Number each row within its Set/Type combination (1, 2, 3, ...).
myend %>%
  group_by(Set, Type) %>%
  # seq_len() is safe for any group size
  mutate(Count = seq_len(n())) %>%
  # ungroup() already receives the data from the pipe; passing `myend` again
  # would be treated as an extra argument rather than as the input.
  ungroup()
Thank you very much for your help,
Base R version :
# Build a cumulative "label + running count" string for every element of x.
#
# x      : vector of labels (factor or character); coerced with as.character().
# return : character vector, same length as x. Element i concatenates every
#          label seen up to i (in order of first appearance) with the number
#          of times it has occurred so far, e.g. "A1", "A1R1", "A1R2", ...
aggregateGroup <- function(x) {
  labels <- as.character(x)
  out <- character(length(labels))
  # named integer vector: names are the labels seen so far, values their counts
  counts <- integer(0)
  for (i in seq_along(labels)) {
    key <- labels[i]
    seen <- counts[key]
    # a label not seen before indexes to NA, so its count starts at 1
    counts[key] <- if (is.na(seen)) 1L else seen + 1L
    out[i] <- paste0(names(counts), counts, collapse = "")
  }
  out
}
split(mystart$Count,mystart$Set) <- lapply(split(mystart$Type,mystart$Set), aggregateGroup)
> mystart
ID Set Type Count
1 1 1 A A1
2 2 1 R A1R1
3 3 1 R A1R2
4 4 1 U A1R2U1
5 5 1 U A1R2U2
6 6 1 U A1R2U3
7 7 2 A A1
8 8 3 R R1
9 9 3 R R2
A dplyr version:
mystart %>%
group_by(Set) %>%
mutate(Count = paste0('A', cumsum(Type == 'A'),
'R', cumsum(Type == 'R'),
'U', cumsum(Type == 'U'))) %>%
ungroup()
Which yields
# A tibble: 9 x 4
ID Set Type Count
<int> <int> <chr> <chr>
1 1 1 A A1R0U0
2 2 1 R A1R1U0
3 3 1 R A1R2U0
4 4 1 U A1R2U1
5 5 1 U A1R2U2
6 6 1 U A1R2U3
7 7 2 A A1R0U0
8 8 3 R A0R1U0
9 9 3 R A0R2U0
If you want to omit the variables with count zero, you'd need to wrap a function around it like so
# Paste several named count vectors into one label per position, leaving out
# any component whose count is zero.
#
# lst    : named list of equal-length count vectors, e.g.
#          list(A = cumsum(...), R = cumsum(...)).
# return : character vector; element i is the concatenation of
#          "<name><count>" for every component with a non-zero count at i.
mygroup <- function(lst) {
  labels <- names(lst)
  pieces <- vector("list", length(lst))
  for (k in seq_along(lst)) {
    counts <- lst[[k]]
    # zero counts contribute an empty string instead of e.g. "R0"
    pieces[[k]] <- ifelse(counts == 0, "", paste0(labels[k], counts))
  }
  do.call(paste0, pieces)
}
mystart %>%
group_by(Set) %>%
mutate(Count = mygroup(list(A = cumsum(Type == 'A'),
R = cumsum(Type == 'R'),
U = cumsum(Type == 'U')))) %>%
ungroup()
This yields
# A tibble: 9 x 4
ID Set Type Count
<int> <int> <chr> <chr>
1 1 1 A A1
2 2 1 R A1R1
3 3 1 R A1R2
4 4 1 U A1R2U1
5 5 1 U A1R2U2
6 6 1 U A1R2U3
7 7 2 A A1
8 8 3 R R1
9 9 3 R R2
One line solve with data.table
you gotta first do
# library() stops with an error when data.table is not installed, whereas
# require() only returns FALSE and lets the script fail later on.
library(data.table)
mystart <- as.data.table(mystart)
then just use one line
# Per Set, build a running "A<n>R<n>U<n>" label from the cumulative number of
# rows of each Type seen so far; Type is kept next to the label.
mystart[,
  .(
    Type,
    count = paste0(
      "A", cumsum(Type == "A"),
      "R", cumsum(Type == "R"),
      "U", cumsum(Type == "U")
    )
  ),
  by = "Set"
]
first you want cumsum each type and paste them together by 'set'
cumsum(Type=='A') equals the count, since when Type==A, it's 1, otherwise it's 0.
you wanted to paste them into one column also. So, paste0() is good to use.
you still wanted the Type column, so I included Type in the line.
The output:
Set Type count
1: 1 A A1R0U0
2: 1 R A1R1U0
3: 1 R A1R2U0
4: 1 U A1R2U1
5: 1 U A1R2U2
6: 1 U A1R2U3
7: 2 A A1R0U0
8: 3 R A0R1U0
9: 3 R A0R2U0
Hope this helps.
btw, if you want a count of 0 ignored, you gotta design some if-else clause yourself.
basically you want this: if cumsum(something) == 0, NULL, else paste0('something', cumsum(something)), then you paste0() them together.
It's gonna get nasty, I'm not writing it. you get the idea
Here's a base solution.
We can paste raw letters to seq_along of letter groups to get the last 2 characters, then paste the result to the last element of the previous result, using Reduce.
On top of this we use ave to compute by group.
# Reduce step: take the last label built so far (x) and append each element
# of y followed by its position within y.
# e.g. fun("A1", c("R", "R")) gives "A1R1" "A1R2".
fun <- function(x, y) {
  prefix <- x[length(x)]
  paste0(prefix, y, seq_along(y))
}
mystart$Count <- ave(as.character(mystart$Type),mystart$Set,
FUN = function(x) unlist(Reduce(fun,split(x,x),init=NULL,accumulate = TRUE)))
# ID Set Type Count
# 1 1 1 A A1
# 2 2 1 R A1R1
# 3 3 1 R A1R2
# 4 4 1 U A1R2U1
# 5 5 1 U A1R2U2
# 6 6 1 U A1R2U3
# 7 7 2 A A1
# 8 8 3 R R1
# 9 9 3 R R2
Details
split(x,x) splits letters as shown here for first Set:
with(subset(mystart,Set==1),split(Type,Type))
# $A
# [1] "A"
#
# $R
# [1] "R" "R"
#
# $U
# [1] "U" "U" "U"
Then fun does this type of operations, helped by Reduce :
fun(NULL,"A") # [1] "A1"
fun("A1",c("R","R")) # [1] "A1R1" "A1R2"
fun(c("A1R1","A1R2"),c("U","U","U")) # [1] "A1R2U1" "A1R2U2" "A1R2U3"
Bonus solution
This other base solution, using rle and avoiding split gives the same output for given example (and whenever Type values are grouped in Sets), but not with mystart2 <- rbind(mystart,mystart) for instance.
# Cumulative label built from runs of identical consecutive values.
#
# x      : character vector of labels (one Set's worth when called via ave()).
# return : character vector, same length as x. Each element is the
#          concatenated "<label><run length>" of all *previous* runs followed
#          by "<label><position within the current run>".
#
# Uses base R only. The prefix is shifted by exactly one run, so it stays
# correct when the first run has more than one element.
fun2 <- function(x) {
  if (length(x) == 0L) {
    return(character(0))
  }
  runs <- rle(x)
  # label + position inside its own run, e.g. "R1", "R2"
  suffix <- paste0(x, sequence(runs$lengths))
  # running concatenation of completed runs: "A1", "A1R2", "A1R2U3", ...
  run_totals <- unlist(
    Reduce(paste0, paste0(runs$values, runs$lengths), accumulate = TRUE)
  )
  # each run is prefixed by the total of the runs before it; the first by ""
  prev_totals <- c("", run_totals[-length(run_totals)])
  prefix <- rep(prev_totals, times = runs$lengths)
  paste0(prefix, suffix)
}
mystart$Count2 <-ave(as.character(mystart$Type), mystart$Set,FUN=fun2)
Many elegant solutions have already been provided for this problem. Still, I was looking for a dplyr approach that does not rely on cumsum over a fixed set of types. The function below is generic enough to handle additional values of Type.
A solution with help of a custom function as:
# Build the cumulative Type/count label for every row, Set by Set.
# NOTE: this pipeline calls cumCat(), whose definition follows below, so that
# definition has to be run before this pipeline is evaluated.
library(dplyr)
mystart %>% group_by(Set, Type) %>%
# position of the row within its Set/Type combination
mutate(type_count = row_number()) %>%
# label for the row itself: Type followed by that position
mutate(TypeMod = paste0(Type,type_count)) %>%
# regroup on Set alone so cumCat() receives a whole Set's rows in one call
group_by(Set) %>%
mutate(Count = cumCat(TypeMod, type_count)) %>%
# drop the two helper columns
select(-type_count, -TypeMod)
# Cumulatively concatenate per-row labels.
#
# x      : character vector of row labels (e.g. "A1", "R1", "R2").
# y      : counter parallel to x; a value of 1 marks the first row of a new
#          sub-group.
# return : character vector, same length as x. Every row gets its own label
#          appended to the full result of the row that preceded the start of
#          its sub-group.
cumCat <- function(x, y) {
  n <- length(x)
  out <- character(n)
  base <- ""
  for (i in seq_len(n)) {
    if (y[i] == 1) {
      # a new sub-group starts: freeze the previous row's result as the prefix
      base <- if (i > 1L) out[i - 1L] else ""
    }
    out[i] <- paste0(base, x[i])
  }
  out
}
# # Groups: Set [3]
# ID Set Type Count
# <int> <int> <fctr> <chr>
# 1 1 1 A A1
# 2 2 1 R A1R1
# 3 3 1 R A1R2
# 4 4 1 U A1R2U1
# 5 5 1 U A1R2U2
# 6 6 1 U A1R2U3
# 7 7 2 A A1
# 8 8 3 R R1
# 9 9 3 R R2
I have a data table where I would like to calculate the mean of the group of variables starting with "amount" for each of the ids.
The number of variable starting with amount can vary, but they are well over 100 in my real data (and some variables have NA values).
id variable amountA amountB amountC amountD
1 A 8 7 6 2
2 B 6 2 1 2
3 C 6 6 9 4
4 D 1 6 2 7
In my data I have tried unsuccessfully:
DT[,testvar := apply(DT[ ,grepl("amount",names(DT))],1,mean)]
DT[,testvar := mean(DT[ ,grepl("amount",names(DT))],na.rm=TRUE), by = idvar]
I'm trying to work it out with .EACHI but I haven't figured it out yet. Any idea or comment is hugely appreciated.
Sample table:
structure(list(id = 1:4, variable = structure(1:4, .Label = c("A",
"B", "C", "D"), class = "factor"), amountA = c(8L, 6L, 6L, 1L
), amountB = c(7L, 2L, 6L, 6L), amountC = c(6L, 1L, 9L, 2L),
amountD = c(2L, 2L, 4L, 7L)), .Names = c("id", "variable",
"amountA", "amountB", "amountC", "amountD"), class = "data.frame", row.names = c(NA,
-4L))
Here is a possible solution taking some of Arun's suggestions:
# Row-wise mean over every column whose name starts with "amount".
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
DT[, testvar := rowMeans(.SD, na.rm = TRUE),
   .SDcols = grep("^amount", names(DT), value = TRUE)]
Produces:
id variable amountA amountB amountC amountD testvar
1: 1 A 8 7 6 2 5.75
2: 2 B 6 2 1 2 2.75
3: 3 C 6 6 9 4 6.25
4: 4 D 1 6 2 7 4.00
We define what columns we want to be part of the internal .SD object with .SDcols and grep, and then we just take rowMeans of the resulting .SD.
In more recent versions of data.table you can shortcut this by using patterns in .SDcols:
# Same row-wise mean, selecting the columns by regular expression. The pattern
# is anchored so only names that start with "amount" are picked up.
DT[, testvar := rowMeans(.SD, na.rm = TRUE), .SDcols = patterns("^amount")]
I have a data.table or data.frame that looks like the following:
Name Person Date
A 1 1/1/2004
A 2 1/1/2004
A 2 1/3/2004
A 3 1/1/2004
A 3 1/3/2004
A 3 1/9/2004
B 4 1/7/2004
B 5 1/7/2004
B 5 1/10/2004
B 6 1/7/2004
B 6 1/10/2004
B 6 1/17/2004
Here, I am trying to create a new data table that has "NA's" for the dates if they are not the maximum by person. Basically, I am trying to get the data table to look like:
Name Person Date
A 1 1/1/2004
A 2 "NA"
A 2 1/3/2004
A 3 "NA"
A 3 "NA"
A 3 1/9/2004
B 4 1/7/2004
B 5 "NA"
B 5 1/10/2004
B 6 "NA"
B 6 "NA"
B 6 1/17/2004
Basically, the algorithm I thought of was to look at each grouping by person. If there is only one element for the grouping by person, then that one value is the maximum and hence we let it stay there. But, for example, in grouping by person 2, the maximum is 1/3/2004, so we let 1/1/2004 be "NA".
The only way I can think of doing this is to find the index of the data table corresponding to the maximal value by group (Person) and then to create a new vector of all NA's, then replace with the index of the maximal values.
The code would look like:
which.max(data$Date, by=data$Person)
This somehow doesn't work for me, but either way this code looks like it might be time intensive, especially if my data set is 100 million rows. Would there be a fast implementation for large data sets, especially working in the data.table package? Thanks!
Using data.table:
#dat <- as.data.table(dat)
#dat$Date <- as.Date(dat$Date,format="%m/%d/%Y")
# Blank out every Date that is not the maximum for its Person.
# .I yields row numbers of `dat` itself, so the update hits the right rows even
# when a Person's rows are not stored next to each other; which() drops NA
# comparisons instead of passing them on as row indices.
dat[dat[, .I[which(Date != max(Date))], by = Person]$V1, Date := NA]
dat
# Name Person Date
#1: A 1 2004-01-01
#2: A 2 <NA>
#3: A 2 2004-01-03
#4: A 3 <NA>
#5: A 3 <NA>
#6: A 3 2004-01-09
#7: B 4 2004-01-07
#8: B 5 <NA>
#9: B 5 2004-01-10
#10: B 6 <NA>
#11: B 6 <NA>
#12: B 6 2004-01-17
This is a good use case for ave() which allows you to apply a function to the values for each person. If this is your sample data
dd<-structure(list(Name = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor"),
Person = c(1L, 2L, 2L, 3L, 3L, 3L, 4L, 5L, 5L, 6L, 6L, 6L
), Date = structure(c(12418, 12418, 12420, 12418, 12420,
12426, 12424, 12424, 12427, 12424, 12427, 12434), class = "Date")),
.Names = c("Name", "Person", "Date"),
row.names = c(NA, -12L), class = "data.frame")
then use
with(dd, ave(Date, Person,FUN=function(x) {x[x!=max(x)]<-NA; x}))
# [1] "2004-01-01" NA "2004-01-03" NA NA
# [6] "2004-01-09" "2004-01-07" NA "2004-01-10" NA
# [11] NA "2004-01-17"
Another way using data.table (assuming that you don't have multiple max values per group)
# Keep only the latest Date per Person, blanking the rest.
# order() returns the permutation that sorts Date, not each row's rank, so it
# only flags the right rows when dates are already ascending within a Person.
# rank(..., ties.method = "first") gives exactly one row the rank .N, and .I
# maps the remaining rows back to row numbers of `dat`.
dat[
  dat[, .I[rank(Date, ties.method = "first") != .N], by = Person]$V1,
  Date := NA
]
dat
# Name Person Date
#1: A 1 2004-01-01
#2: A 2 <NA>
#3: A 2 2004-01-03
#4: A 3 <NA>
#5: A 3 <NA>
#6: A 3 2004-01-09
#7: B 4 2004-01-07
#8: B 5 <NA>
#9: B 5 2004-01-10
#10: B 6 <NA>
#11: B 6 <NA>
#12: B 6 2004-01-17
If you have multiple max values:
# Variant that keeps every row tied for the maximum Date of its Person.
# .I returns row numbers of `dat` itself, so the rows to blank are addressed
# directly rather than through a logical vector that is in group order.
dat[
  dat[, .I[rank(Date, ties.method = "max") != .N], by = Person]$V1,
  Date := NA
]
To format the date
# Blank all but the latest Date per Person, then render Date as mm/dd/yyyy
# text. rank() is used instead of order() because order() gives the sorting
# permutation rather than each row's rank; .I maps the flagged rows back to
# row numbers of `dat`.
dat[
  dat[, .I[rank(Date, ties.method = "first") != .N], by = Person]$V1,
  Date := NA
][, Date := format(Date, "%m/%d/%Y")]