Cumulative Count Paste

Cumulative Count Paste - r

I have this dataset:
ID Set Type Count
1 1 1 A NA
2 2 1 R NA
3 3 1 R NA
4 4 1 U NA
5 5 1 U NA
6 6 1 U NA
7 7 2 A NA
8 8 3 R NA
9 9 3 R NA
As dputs:
mystart <- structure(list(ID = 1:9, Set = c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
3L, 3L), Type = structure(c(1L, 2L, 2L, 3L, 3L, 3L, 1L, 2L, 2L
), .Label = c("A", "R", "U"), class = "factor"), Count = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("ID", "Set", "Type",
"Count"), class = "data.frame", row.names = c(NA, -9L))
By using dplyr package how can I obtain this:
ID Set Type Count
1 1 1 A A1
2 2 1 R A1R1
3 3 1 R A1R2
4 4 1 U A1R2U1
5 5 1 U A1R2U2
6 6 1 U A1R2U3
7 7 2 A A1
8 8 3 R R1
9 9 3 R R2
Again dputs:
myend <- structure(list(ID = 1:9, Set = c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
3L, 3L), Type = structure(c(1L, 2L, 2L, 3L, 3L, 3L, 1L, 2L, 2L
), .Label = c("A", "R", "U"), class = "factor"), Count = structure(c(1L,
2L, 3L, 4L, 5L, 6L, 1L, 7L, 8L), .Label = c("A1", "A1R1", "A1R2",
"A1R2U1", "A1R2U2", "A1R2U3", "R1", "R2"), class = "factor")), .Names = c("ID",
"Set", "Type", "Count"), class = "data.frame", row.names = c(NA,
-9L))
In short, I want to count the observations of the column "type" within column "set" and print this count(text) cumulatively.
Examining similar posts, I got closely to this:
myend <- structure(list(ID = 1:9, Set = c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
3L, 3L), Type = structure(c(1L, 2L, 2L, 3L, 3L, 3L, 1L, 2L, 2L
), .Label = c("A", "R", "U"), class = "factor"), Count = c(1L,
1L, 2L, 1L, 2L, 3L, 1L, 1L, 2L)), .Names = c("ID", "Set", "Type",
"Count"), class = "data.frame", row.names = c(NA, -9L))
With the code:
library(dplyr)
myend <- read.table("mydata.txt", header=TRUE, fill=TRUE)
myend %>%
group_by(Set, Type) %>%
mutate(Count = seq(n())) %>%
ungroup(myend)
Thank you very much for your help,

Base R version :
aggregateGroup <- function(x){
vecs <- Reduce(f=function(a,b){ a[b] <- sum(a[b],1L,na.rm=TRUE); a },
init=integer(0),
as.character(x),
accumulate = TRUE)
# vecs is a list with something like this :
# [[1]]
# integer(0)
# [[2]]
# A
# 1
# [[3]]
# A R
# 1 1
# ...
# so we simply turn those vectors into characters using vapply and paste
# (excluding the first)
vapply(vecs,function(y) paste0(names(y),y,collapse=''),FUN.VALUE='')[-1]
}
split(mystart$Count,mystart$Set) <- lapply(split(mystart$Type,mystart$Set), aggregateGroup)
> mystart
ID Set Type Count
1 1 1 A A1
2 2 1 R A1R1
3 3 1 R A1R2
4 4 1 U A1R2U1
5 5 1 U A1R2U2
6 6 1 U A1R2U3
7 7 2 A A1
8 8 3 R R1
9 9 3 R R2

A dplyr version:
mystart %>%
group_by(Set) %>%
mutate(Count = paste0('A', cumsum(Type == 'A'),
'R', cumsum(Type == 'R'),
'U', cumsum(Type == 'U'))) %>%
ungroup()
Which yields
# A tibble: 9 x 4
ID Set Type Count
<int> <int> <chr> <chr>
1 1 1 A A1R0U0
2 2 1 R A1R1U0
3 3 1 R A1R2U0
4 4 1 U A1R2U1
5 5 1 U A1R2U2
6 6 1 U A1R2U3
7 7 2 A A1R0U0
8 8 3 R A0R1U0
9 9 3 R A0R2U0
If you want to omit the variables with count zero, you'd need to wrap a function around it like so
mygroup <- function(lst) {
name <- names(lst)
vectors <- lapply(seq_along(lst), function(i) {
x <- lst[[i]]
char <- name[i]
x <- ifelse(x == 0, "", paste0(char, x))
return(x)
})
return(do.call("paste0", vectors))
}
mystart %>%
group_by(Set) %>%
mutate(Count = mygroup(list(A = cumsum(Type == 'A'),
R = cumsum(Type == 'R'),
U = cumsum(Type == 'U')))) %>%
ungroup()
This yields
# A tibble: 9 x 4
ID Set Type Count
<int> <int> <chr> <chr>
1 1 1 A A1
2 2 1 R A1R1
3 3 1 R A1R2
4 4 1 U A1R2U1
5 5 1 U A1R2U2
6 6 1 U A1R2U3
7 7 2 A A1
8 8 3 R R1
9 9 3 R R2

One line solve with data.table
you gotta first do
require(data.table)
mystart <- as.data.table(mystart)
then just use one line
mystart[, .(Type,
count = paste0(
'A',
cumsum(Type == 'A'),
'R',
countR = cumsum(Type == 'R'),
'U',
countU = cumsum(Type == 'U')
)),
by = c('Set')]
first you want cumsum each type and paste them together by 'set'
cumsum(Type=='A') equals the count, since when Type==A, it's 1, otherwise it's 0.
you wanted to paste them into one column also. So, paste0() is good to use.
you still wanted the Type column, so I included Type in the line.
The output:
Set Type count
1: 1 A A1R0U0
2: 1 R A1R1U0
3: 1 R A1R2U0
4: 1 U A1R2U1
5: 1 U A1R2U2
6: 1 U A1R2U3
7: 2 A A1R0U0
8: 3 R A0R1U0
9: 3 R A0R2U0
Hope this helps.
btw, if you want count 0 ignored, you gotta design some if-esle clause yourself.
basically you want this: if cumsum(something) ==0, NULL, esle paste0('something', cumsum(something)), then you paste0() them together.
It's gonna get nasty, I'm not writing it. you get the idea

Here's a base solution.
We can paste raw letters toseq_along of letter groups to get the last 2 characters, then paste the result to the last element of the previous result, using Reduce.
On top of this we use ave to compute by group.
fun <- function(x,y) paste0(x[length(x)],y,seq_along(y))
mystart$Count <- ave(as.character(mystart$Type),mystart$Set,
FUN = function(x) unlist(Reduce(fun,split(x,x),init=NULL,accumulate = TRUE)))
# ID Set Type Count
# 1 1 1 A A1
# 2 2 1 R A1R1
# 3 3 1 R A1R2
# 4 4 1 U A1R2U1
# 5 5 1 U A1R2U2
# 6 6 1 U A1R2U3
# 7 7 2 A A1
# 8 8 3 R R1
# 9 9 3 R R2
Details
split(x,x) splits letters as shown here for first Set:
with(subset(mystart,Set==1),split(Type,Type))
# $A
# [1] "A"
#
# $R
# [1] "R" "R"
#
# $U
# [1] "U" "U" "U"
Then fun does this type of operations, helped by Reduce :
fun(NULL,"A") # [1] "A1"
fun("A1",c("R","R")) # [1] "A1R1" "A1R2"
fun(c("A1R1","A1R2"),c("U","U","U")) # [1] "A1R2U1" "A1R2U2" "A1R2U3"
Bonus solution
This other base solution, using rle and avoiding split gives the same output for given example (and whenever Type values are grouped in Sets), but not with mystart2 <- rbind(mystart,mystart) for instance.
fun2 <- function(x){
rle_ <- rle(x)
suffix <- paste0(x,sequence(rle_$length))
prefix <- unlist(mapply(rep,
lag(unlist(
Reduce(paste0,paste0(rle_$values,rle_$lengths),accumulate=TRUE)
),rle_$lengths[1]),
each=rle_$lengths))
prefix[is.na(prefix)] <- ""
paste0(prefix,suffix)
}
mystart$Count2 <-ave(as.character(mystart$Type), mystart$Set,FUN=fun2)

Many elegant solutions have been provided for the problem. Still I was looking for something dplyr way (without-cumsum on fixed types). The function is generic enough to handle additional values of Type.
A solution with help of a custom function as:
library(dplyr)
mystart %>% group_by(Set, Type) %>%
mutate(type_count = row_number()) %>%
mutate(TypeMod = paste0(Type,type_count)) %>%
group_by(Set) %>%
mutate(Count = cumCat(TypeMod, type_count)) %>%
select(-type_count, -TypeMod)
cumCat <- function(x, y){
retVal <- character(length(x))
prevVal = ""
lastGrpVal = ""
for ( i in seq_along(x)){
if(y[i]==1){
lastGrpVal = prevVal
}
retVal[i] = paste0(lastGrpVal,x[i])
prevVal = retVal[i]
}
retVal
}
# # Groups: Set [3]
# ID Set Type Count
# <int> <int> <fctr> <chr>
# 1 1 1 A A1
# 2 2 1 R A1R1
# 3 3 1 R A1R2
# 4 4 1 U A1R2U1
# 5 5 1 U A1R2U2
# 6 6 1 U A1R2U3
# 7 7 2 A A1
# 8 8 3 R R1
# 9 9 3 R R2

Related

How to change the names of columns of a list of dataframes?

I have this list of dataframes created as follows :
df = data.frame(x = c(1,0,0,0,1,1,1,NA), y = c(2,2,2,2,3,3,2,NA),
z = c(1:7,NA), m = c(1,2,3,1,2,3,1,NA) )
df$x = factor(df$x)
df$y = factor(df$y)
df$m = factor(df$m)
l1 = list(df$x,df$y,df$m)
l2 = lapply(l1,table)
l3 = lapply(l2,as.data.frame)
l3
The output is as follows :
[[1]]
Var1 Freq
1 0 3
2 1 4
[[2]]
Var1 Freq
1 2 5
2 3 2
[[3]]
Var1 Freq
1 1 3
2 2 2
3 3 2
I wish that the names of the variables from the dataframe are assigned autmatically to the l3 list elements. For example : Var1 from list 1 becomes x. Var1 from list 2 becomes y. Var1 from list 3 becomes m. Thanks

Using Map.
l3 <- lapply(df, table) |> lapply(as.data.frame)
(l3 <- Map(\(x, y) {names(x)[1] <- y; x}, l3, names(l3)))
# $x
# x Freq
# 1 0 3
# 2 1 4
#
# $y
# y Freq
# 1 2 5
# 2 3 2
#
# $z
# z Freq
# 1 1 1
# 2 2 1
# 3 3 1
# 4 4 1
# 5 5 1
# 6 6 1
# 7 7 1
#
# $m
# m Freq
# 1 1 3
# 2 2 2
# 3 3 2
Data:
df <- structure(list(x = structure(c(2L, 1L, 1L, 1L, 2L, 2L, 2L, NA
), levels = c("0", "1"), class = "factor"), y = structure(c(1L,
1L, 1L, 1L, 2L, 2L, 1L, NA), levels = c("2", "3"), class = "factor"),
z = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, NA), m = structure(c(1L,
2L, 3L, 1L, 2L, 3L, 1L, NA), levels = c("1", "2", "3"), class = "factor")), row.names = c(NA,
-8L), class = "data.frame")

One possible solution:
Map(\(x,y) setNames(x, c(y,"Freq")), l3, c("x", "y", "z"))
[[1]]
x Freq
1 0 3
2 1 4
[[2]]
y Freq
1 2 5
2 3 2
[[3]]
z Freq
1 1 3
2 2 2
3 3 2

How can I calculate the sum of the column wise differences using dplyr

Despite using R and dplyr on a regular basis, I encountered the issue of not being able to calculate the sum of the absolute differences between all columns:
sum_diff=ABS(A-B)+ABS(B-C)+ABS(C-D)...
A
B
C
D
sum_diff
1
2
3
4
3
2
1
3
4
4
1
2
1
1
2
4
1
2
1
5
I know I could iterate using a for loop over all columns, but given the size of my data frame, I prefer a more elegant and fast solution.
Any help?
Thank you

We may remove the first and last columns, get the difference, and use rowSums on the absolute values in base R. This could be very efficient compared to a package solution
df1$sum_diff <- rowSums(abs(df1[-ncol(df1)] - df1[-1]))
-output
> df1
A B C D sum_diff
1 1 2 3 4 3
2 2 1 3 4 4
3 1 2 1 1 2
4 4 1 2 1 5
Or another option is rowDiffs from matrixStats
library(matrixStats)
rowSums(abs(rowDiffs(as.matrix(df1))))
[1] 3 4 2 5
data
df1 <- structure(list(A = c(1L, 2L, 1L, 4L), B = c(2L, 1L, 2L, 1L),
C = c(3L, 3L, 1L, 2L), D = c(4L, 4L, 1L, 1L)), row.names = c(NA,
-4L), class = "data.frame")

Daata from akrun (many thanks)!
This is complicated the idea is to generate a list of the combinations, I tried it with combn but then I get all possible combinations. So I created by hand.
With this combinations we then could use purrrs map_dfc and do some data wrangling after that:
library(tidyverse)
combinations <-list(c("A", "B"), c("B", "C"), c("C","D"))
purrr::map_dfc(combinations, ~{df <- tibble(a=data[[.[[1]]]]-data[[.[[2]]]])
names(df) <- paste0(.[[1]],"_v_",.[[2]])
df}) %>%
transmute(sum_diff = rowSums(abs(.))) %>%
bind_cols(data)
sum_diff A B C D
<dbl> <int> <int> <int> <int>
1 3 1 2 3 4
2 4 2 1 3 4
3 2 1 2 1 1
4 5 4 1 2 1
data:
data <- structure(list(A = c(1L, 2L, 1L, 4L), B = c(2L, 1L, 2L, 1L),
C = c(3L, 3L, 1L, 2L), D = c(4L, 4L, 1L, 1L)), row.names = c(NA,
-4L), class = "data.frame")

Here is a dplyrs version of #akrun's elegant aproach that calculates the diff of the dataframe with it's shifted variant:
df %>%
mutate(sum_diff = rowSums(abs(identity(.) %>% select(1:last_col(1))
- identity(.) %>% select(2:last_col()))))
And here we have the rowwise variant, which basicly follows the same idea but this time every row is used as a vector that get's substracted by it's shifted self.
df %>%
rowwise() %>%
mutate(sum_diff = map2_int(c_across(1:last_col(1)),
c_across(2:last_col()),
~ abs(.x - .y)) %>% sum())

Counting the elements in rows and map to column in r

I would like summarize my data by counting the entities and create counting_column for each entity.
let say:
df:
id class
1 A
1 B
1 A
1 A
1 B
1 c
2 A
2 B
2 B
2 D
I want to create a table like
id A B C D
1 3 2 1 0
2 1 2 0 1
How can I do this in R using apply function?

df <- structure(list(id = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L),
class = structure(c(1L, 2L, 1L, 1L, 2L, 3L, 1L, 2L, 2L, 4L
), .Label = c("A", "B", "C", "D"), class = "factor")), .Names = c("id",
"class"), class = "data.frame", row.names = c(NA, -10L))
with(df, table(id, class))
# class
#id A B C D
# 1 3 2 1 0
# 2 1 2 0 1
xtabs(~ id + class, df)
# class
#id A B C D
# 1 3 2 1 0
# 2 1 2 0 1
tapply(rep(1, nrow(df)), df, length, default = 0)
# class
#id A B C D
# 1 3 2 1 0
# 2 1 2 0 1

This seems like a very strange requirement but if you insist on using apply then the function count counts the number of rows for which id equals x and class equals y. It is applied to every combination of id and class to get a using nested apply calls. Finally we add the row and column names.
uid <- unique(DF$id)
uclass <- unique(DF$class)
count <- function(x, y, DF) sum(x == DF$id & y == DF$class)
a <- apply(matrix(uclass), 1, function(u) apply(matrix(uid), 1, count, u, DF))
dimnames(a) <- list(uid, uclass)
giving:
> a
A B c D
1 3 2 1 0
2 1 2 0 1
Note
We used this for DF
Lines <- "id class
1 A
1 B
1 A
1 A
1 B
1 c
2 A
2 B
2 B
2 D"
DF <- read.table(text = Lines, header = TRUE)

R - add column that counts sequentially within groups but repeats for duplicates

I'm looking for a solution to add the column "desired_result" preferably using dplyr and/or ave(). See the data frame here, where the group is "section" and the unique instances I want my "desired_results" column to count sequentially are in "exhibit":
structure(list(section = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), exhibit = structure(c(1L,
2L, 3L, 3L, 1L, 2L, 2L, 3L), .Label = c("a", "b", "c"), class = "factor"),
desired_result = c(1L, 2L, 3L, 3L, 1L, 2L, 2L, 3L)), .Names = c("section",
"exhibit", "desired_result"), class = "data.frame", row.names = c(NA,
-8L))

dense_rank it is
library(dplyr)
df %>%
group_by(section) %>%
mutate(desire=dense_rank(exhibit))
# section exhibit desired_result desire
#1 1 a 1 1
#2 1 b 2 2
#3 1 c 3 3
#4 1 c 3 3
#5 2 a 1 1
#6 2 b 2 2
#7 2 b 2 2
#8 2 c 3 3

I've recently pushed a function rleid() to data.table (currently available on the development version, 1.9.5), which does exactly this. If you're interested, you can install it by following this.
require(data.table) # 1.9.5, for `rleid()`
require(dplyr)
DF %>%
group_by(section) %>%
mutate(desired_results=rleid(exhibit))
# section exhibit desired_result desired_results
# 1 1 a 1 1
# 2 1 b 2 2
# 3 1 c 3 3
# 4 1 c 3 3
# 5 2 a 1 1
# 6 2 b 2 2
# 7 2 b 2 2
# 8 2 c 3 3

If exact enumeration is necessary and you need the desired result to be consistent (so that a same exhibit in a different section will always have the same number), you can try:
library(dplyr)
df <- data.frame(section = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L),
exhibit = c('a', 'b', 'c', 'c', 'a', 'b', 'b', 'c'))
if (is.null(saveLevels <- levels(df$exhibit)))
saveLevels <- sort(unique(df$exhibit)) ## or levels(factor(df$exhibit))
df %>%
group_by(section) %>%
mutate(answer = as.integer(factor(exhibit, levels = saveLevels)))
## Source: local data frame [8 x 3]
## Groups: section
## section exhibit answer
## 1 1 a 1
## 2 1 b 2
## 3 1 c 3
## 4 1 c 3
## 5 2 a 1
## 6 2 b 2
## 7 2 b 2
## 8 2 c 3
If/when a new exhibit appears in subsequent sections, they should get newly enumerated results. (Notice the last exhibit is different.)
df2 <- data.frame(section = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L),
exhibit = c('a', 'b', 'c', 'c', 'a', 'b', 'b', 'd'))
if (is.null(saveLevels2 <- levels(df2$exhibit)))
saveLevels2 <- sort(unique(df2$exhibit))
df2 %>%
group_by(section) %>%
mutate(answer = as.integer(factor(exhibit, levels = saveLevels2)))
## Source: local data frame [8 x 3]
## Groups: section
## section exhibit answer
## 1 1 a 1
## 2 1 b 2
## 3 1 c 3
## 4 1 c 3
## 5 2 a 1
## 6 2 b 2
## 7 2 b 2
## 8 2 d 4

Complex data frame transposition in R

I've tried searching for an answer for this but most data.frame/matrix transpoitions aren't as complicated as I am trying to accomplish. Basically I have a data.frame which looks like
F M A
2008_b 1 5 6
2008_r 3 3 6
2008_a 4 1 5
2009_b 1 1 2
2009_r 5 4 9
2009_a 2 2 4
I'm trying to transpose it and rename the column and row names as such:
F_b M_b A_b F_r M_r A_r F_a M_a A_a
2008 1 5 6 3 3 6 4 1 5
2009 1 1 2 5 4 9 2 2 4
Essentially every three rows are being collapsed in to a single row. I assume this can be done with some clever plyr or reshape2 commands but I'm at a total loss how to accomplish it.

You could try
library(dplyr)
library(tidyr)
lvl <- c(outer(colnames(df), unique(gsub(".*_", "", rownames(df))),
FUN=paste, sep="_"))
res <- cbind(Var1=row.names(df), df) %>%
gather(Var2, value, -Var1) %>%
separate(Var1, c('Var11', 'Var12')) %>%
unite(VarN, Var2, Var12) %>%
mutate(VarN=factor(VarN, levels=lvl)) %>%
spread(VarN, value)
row.names(res) <- res[,1]
res1 <- res[,-1]
res1
# F_b M_b A_b F_r M_r A_r F_a M_a A_a
#2008 1 5 6 3 3 6 4 1 5
#2009 1 1 2 5 4 9 2 2 4
data
df <- structure(list(F = c(1L, 3L, 4L, 1L, 5L, 2L), M = c(5L, 3L, 1L,
1L, 4L, 2L), A = c(6L, 6L, 5L, 2L, 9L, 4L)), .Names = c("F",
"M", "A"), class = "data.frame", row.names = c("2008_b", "2008_r",
"2008_a", "2009_b", "2009_r", "2009_a"))