Count strings with a certain condition - r

I have the following dataset
#mydata
Factors Transactions
a,c 2
b 0
c 0
d,a 0
a 1
a 0
b 1
I'd like to count those factors who had transactions.For example, we had two times "a" with transaction. I can write a code to give me my desirable outcome for each variable separately. The following is for "a".
nrow (subset (mydata,mydata$Transaction > 0 & length(mydata[grep("a", mydata$Factors),] )> 0))
But I have too much variables and do not want to repeat a code for all of them. I would think there should be a way to write a code to give me the results for all of the variables. I wish to have the following out put:
#Output
a 2
b 1
c 1
d 0

Equivalent data.table option:
library(data.table)
setDT(df)[, .(Factors = unlist(strsplit(as.character(Factors), ","))),
by = Transactions][,.(Transactions = sum(Transactions > 0)), by = Factors]
# Factors Transactions
#1: a 2
#2: c 1
#3: b 1
#4: d 0

You could create a table using the unique values of the Factor column as the levels. Consider df to be your data set.
s <- strsplit(as.character(df$Factors), ",", fixed = TRUE)
table(factor(unlist(s[df$Transactions > 0]), levels = unique(unlist(s))))
#
# a c b d
# 2 1 1 0
Wrap in as.data.frame() for data frame output.
with(df, {
s <- strsplit(as.character(Factors), ",", fixed = TRUE)
f <- factor(unlist(s[Transactions > 0]), levels = unique(unlist(s)))
as.data.frame(table(Factors = f))
})
# Factors Freq
# 1 a 2
# 2 c 1
# 3 b 1
# 4 d 0

With tidyverse packages, assuming your data is strings/factors and numbers,
library(tidyr)
library(dplyr)
# separate factors with two elements
df %>% separate_rows(Factors) %>%
# set grouping for aggregation
group_by(Factors) %>%
# for each group, count how many transactions are greater than 0
summarise(Transactions = sum(Transactions > 0))
## # A tibble: 4 x 2
## Factors Transactions
## <chr> <int>
## 1 a 2
## 2 b 1
## 3 c 1
## 4 d 0
You could also avoid dplyr by using xtabs, though some cleaning is necessary to get to the same arrangement:
library(tidyr)
df %>% separate_rows(Factors) %>%
xtabs(Transactions > 0 ~ Factors, data = .) %>%
as.data.frame() %>%
setNames(names(df))
## Factors Transactions
## 1 a 2
## 2 b 1
## 3 c 1
## 4 d 0
A full base R equivalent:
df2 <- do.call(rbind,
Map(function(f, t){data.frame(Factors = strsplit(as.character(f), ',')[[1]],
Transactions = t)},
df$Factors, df$Transactions))
df3 <- as.data.frame(xtabs(Transactions > 0 ~ Factors, data = df2))
names(df3) <- names(df)
df3
## Factors Transactions
## 1 a 2
## 2 b 1
## 3 c 1
## 4 d 0

We can use cSplit from splitstackshape to split the 'Factors' into 'long' format and grouped by 'Factors' we get the sum of logical column ('Transactions > 0`).
library(splitstackshape)
cSplit(df1, "Factors", ",", "long")[, .(Transactions=sum(Transactions > 0)),.(Factors)]
# Factors Transactions
#1: a 2
#2: c 1
#3: b 1
#4: d 0
Or using base R
with(df1, table(factor(unlist(strsplit(Factors[Transactions>0], ",")),
levels = letters[1:4]) ))
# a b c d
# 2 1 1 0
data
df1 <- structure(list(Factors = c("a,c", "b", "c", "d,a", "a", "a",
"b"), Transactions = c(2L, 0L, 0L, 0L, 1L, 0L, 1L)), .Names = c("Factors",
"Transactions"), class = "data.frame", row.names = c(NA, -7L))

Related

How to get sum by each factor level?

I have filtered data and one of the columns has 5 factor levels and I want to get sum for each of the factor level.
I am using the below code
levels(df_Temp$ATYPE)
[1] "a" "b" "c" "d" "Unknown"
I am using the below code
cast(df_Temp,ATYPE~AFTER_ADM, sum, value = "CHRGES")
but the output I am getting is as below
ATYPE 0 1
1 a 0 2368968.39
2 b 0 3206567.47
3 c 0 19551.19
4 e 0 2528688.12
I want to all the factor levels and sum as "0" for those missing data of factors level.
So the desired output is
ATYPE 0 1
1 a 0 2368968.39
2 b 0 3206567.47
3 c 0 19551.19
4 d 0 0
5 e 0 2528688.12
Using xtabs from base R
xtabs(CHRGES ~ ATYPE + AFTER_ADM, subset(df_Temp, ATYPE != "e"))
# AFTER_ADM
#ATYPE 0 1
# a 0.00000000 -5.92270971
# b -1.68910431 0.05222349
# c -0.26869311 0.16922669
# d 1.44764443 -1.59011411
# e 0.00000000 0.00000000
data
set.seed(24)
df_Temp <- data.frame(ATYPE = sample(letters[1:5], 20, replace = TRUE),
AFTER_ADM = sample(0:1, 20, replace = TRUE), CHRGES = rnorm(20))
If I understand your question correctly, you can use dplyr. First I created an example dataset:
set.seed(123)
x <- sample(letters[1:5], 1e3, replace = T)
x[x == "e"] <- "Unknown"
y <- sample(1:100, 1e3, replace = T)
df1 <- data.frame(ATYPE = factor(x), AFTER_ADM = y)
df1$AFTER_ADM[df1$ATYPE == "Unknown"] <- NA
head(df1, 10)
ATYPE AFTER_ADM
1 b 28
2 d 60
3 c 17
4 Unknown NA
5 Unknown NA
6 a 48
7 c 78
8 Unknown NA
9 c 7
10 c 45
And then use group_by and summarise to get the sum and the counts. I was not sure if you would want the counts for the factor levels but it is easy to take out if you are not interested:
library(dplyr)
df1 %>%
group_by(ATYPE) %>%
summarise(sum_AFTER_ADM = sum(AFTER_ADM, na.rm = T),
n_ATYPE = n())
# A tibble: 5 x 3
ATYPE sum_AFTER_ADM n_ATYPE
<fct> <int> <int>
1 a 10363 198
2 b 11226 206
3 c 9611 203
4 d 9483 195
5 Unknown 0 198
Another possible solution using dplyr and tidyr. Using count and complete from the two packages will help solve your problem.
library(dplyr)
library(tidyr)
#using iris as toy data
iris2 <- iris %>%
filter(Species != "setosa")
#count data and then fill n with 0
ir3 <- count(iris2, Species) %>%
complete(Species, fill = list(n =0))

Recursively sum data frames for matching rows

I would like to combine a set of data frames into a single data frame by summing columns that have matching variables (instead of appending columns).
For example, given
df1 <- data.frame(A = c(0,0,1,1,1,2,2), B = c(1,2,1,2,3,1,5), x = c(2,3,1,5,3,7,0))
df2 <- data.frame(A = c(0,1,1,2,2,2), B = c(1,1,3,2,4,5), x = c(4,8,4,1,0,3))
df3 <- data.frame(A = c(0,1,2), B = c(5,4,2), x = c(5,3,1))
I want to match by "A" and "B" and sum the values of "x". For this example, I can get the desired result as follows:
library(plyr)
library(dplyr)
# rename columns so that join_all preserves them all:
colnames(df1)[3] <- "x1"
colnames(df2)[3] <- "x2"
colnames(df3)[3] <- "x3"
# join the data frames by matching "A" and "B" values:
res <- join_all(list(df1, df2, df3), by = c("A", "B"), type = "full")
# get the sums and drop superfluous columns:
arrange(res, A, B) %>%
rowwise() %>%
mutate(x = sum(x1, x2, x3, na.rm = TRUE)) %>%
select(A, B, x)
Result:
A B x
<dbl> <dbl> <dbl>
1 0 1 6
2 0 2 3
3 0 5 5
4 1 1 9
5 1 2 5
6 1 3 7
7 1 4 3
8 2 1 7
9 2 2 2
10 2 4 0
11 2 5 3
A more general solution is
library(dplyr)
# function to get the desired result for two data frames:
my_merge <- function(df1, df2)
{
m1 <- merge(df1, df2, by = c("A", "B"), all = TRUE)
m1 <- rowwise(res) %>%
mutate(x = sum(x.x, x.y, na.rm = TRUE)) %>%
select(A, B, x)
return(m1)
}
l1 <- list(df2, df3) # omit the first data frame
res <- df1 # initial value of the result
for(df in l1) res <- my_merge(res, df) # call the function repeatedly
Is there a more efficient option for combining a large set of data frames? Ideally it should be recursive (i.e. it's better not to join all data frames into one massive data frame before calculating the sums).
An easier option is to bind the rows of the datasets, then group by the columns of interest and get the summarised output by getting the sum of 'x'
library(tidyverse)
bind_rows(df1, df2, df3) %>%
group_by(A, B) %>%
summarise(x = sum(x))
# A tibble: 11 x 3
# Groups: A [?]
# A B x
# <dbl> <dbl> <dbl>
# 1 0 1 6
# 2 0 2 3
# 3 0 5 5
# 4 1 1 9
# 5 1 2 5
# 6 1 3 7
# 7 1 4 3
# 8 2 1 7
# 9 2 2 2
#10 2 4 0
#11 2 5 3
If there are many objects in the global environment with the pattern "df" followed by some digits
mget(ls(pattern= "^df\\d+")) %>%
bind_rows %>%
group_by(A, B) %>%
summarise(x = sum(x))
As the OP mentioned about memory constraints, if we do the join first and then use rowSums or + with reduce, it would be more efficient
mget(ls(pattern= "^df\\d+")) %>%
reduce(full_join, by = c("A", "B")) %>%
transmute(A, B, x = rowSums(.[3:5], na.rm = TRUE)) %>%
arrange(A, B)
# A B x
#1 0 1 6
#2 0 2 3
#3 0 5 5
#4 1 1 9
#5 1 2 5
#6 1 3 7
#7 1 4 3
#8 2 1 7
#9 2 2 2
#10 2 4 0
#11 2 5 3
This could also be done with data.table
library(data.table)
rbindlist(mget(ls(pattern= "^df\\d+")))[, .(x = sum(x)), by = .(A, B)]
Ideally it should be recursive (i.e. it's better not to join all data frames into one massive data frame before calculating the sums).
If you're memory constrained and willing to sacrifice speed (vs #akrun's data.table approach), use one table at a time in a loop:
library(data.table)
tabs = c("df1", "df2", "df3")
# enumerate all combos for the results table
# initializing sum to 0
res = CJ(A = 0:2, B = 1:5, x = 0)
# loop over tabs, adding on
for (i in seq_along(tabs)){
tab = get(tabs[[i]])
res[tab, on=.(A, B), x := x + i.x][]
rm(tab)
}
If you need to read tables from disk, change tabs to file names and get to fread or whatever function.
I am skeptical that you can fit all the tables in memory, but cannot also fit an rbind-ed copy of them together.
Similarly (thanks to #akrun's comment), use his approach pairwise:
res = data.table(get(tabs[[1]]))[0L]
for (i in seq_along(tabs)){
tab = get(tabs[[i]])
res = rbind(res, tab)[, .(x = sum(x)), by=.(A,B)]
rm(tab)
}

R sum of aggregate columns found in another column

Given this data, the first 4 columns (rowid, order, line, special), I need to create a column, numSpecial as such:
rowid order line special numSpecial
1 A 01 X 1
2 B 01 0
3 B 02 X 2
4 B 03 X 2
5 C 01 X 1
6 C 02 0
Where numSpecial is determined by summing the number of times for each order that is special (value = X), given that order-line is special itself, otherwise its 0.
I first tried adding a column that simply concats 'order' with 'X', call it orderX, and would look like:
orderX
AX
BX
BX
BX
CX
CX
Then do a sum of order & special in orderx:
df$numSpecial <- sum(paste(order, special, sep = "") %in% orderx)
But that doesnt work, it returns the sum of the results for all rows for every order:
numSpecial
4
4
4
4
4
4
I then tried as.data.table, but I'm not getting the expected results using:
as.data.table(mydf)[, numSpecial := sum(paste(order, special, sep = "") %in% orderx), by = rowid]
However that is returning just 1 for each row and not sums:
numSpecial
1
0
1
1
1
0
Where am I going wrong with these? I shouldn't have to create that orderX column either I don't think, but I can't figure out the way to get this count right. It's similar to a countif in excel which is easy to do.
There's probably several ways, but you could just multiply it by a TRUE/FALSE flag of "X" being present:
dat[, numSpecial := sum(special == "X") * (special == "X"), by=order]
dat
# rowid order line special numSpecial
#1: 1 A 1 X 1
#2: 2 B 1 0
#3: 3 B 2 X 2
#4: 4 B 3 X 2
#5: 5 C 1 X 1
#6: 6 C 2 0
You could also do it a bit differently like:
dat[, numSpecial := 0L][special == "X", numSpecial := .N, by=order]
Where dat was:
library(data.table)
dat <- structure(list(rowid = 1:6, order = c("A", "B", "B", "B", "C",
"C"), line = c(1L, 1L, 2L, 3L, 1L, 2L), special = c("X", "",
"X", "X", "X", "")), .Names = c("rowid", "order", "line", "special"
), row.names = c(NA, -6L), class = "data.frame")
setDT(dat)
You could use ave with a dummy variable (just filled with 1s):
df$numSpecial <- ifelse(df$special == "X", ave(rep(1,nrow(df)), df$order, df$special, FUN = length), 0)
df
# rowid order line special numSpecial
#1 1 A 1 X 1
#2 2 B 1 0
#3 3 B 2 X 2
#4 4 B 3 X 2
#5 5 C 1 X 1
#6 6 C 2 0
Note I read in your data without the numSpecial column.
Using the dplyr package:
library(dplyr)
df %>% group_by(order) %>%
mutate(numSpecial = ifelse(special=="X", sum(special=="X"), 0))
rowid order special numSpecial
1 1 A X 1
2 2 B 0
3 3 B X 2
4 4 B X 2
5 5 C X 1
6 6 C 0
One other option using base R only would be to use aggregate:
# Your data
df <- data.frame(rowid = 1:6, order = c("A", "B", "B", "B", "C", "C"), special = c("X", "", "X", "X", "X", ""))
# Make the counts
dat <- with(df,aggregate(x=list(answer=special),by=list(order=order,special=special),FUN=function(x) sum(x=="X")))
# Merge back to original dataset:
dat.fin <- merge(df,dat,by=c('order','special'))

how to get quick summary of count in data.table

This is a part of feature engineering that summarizes each ID depending on column called Col. The same preprocess will be applied to the testing set. Since the data set is large, data.table based solution may be more preferred.
Training Input:
ID Col
A M
A M
A M
B K
B M
Expected output for above training input:
ID Col_M Col_K
A 3 0 # A has 3 M in Col and 0 K in Col
B 1 1
Above is for processing training data. For testing dataset, if requires to mapping over Col_M, Col_K, meaning, if other value like S appearing in Col, it will be ignored.
Testing Input:
ID Col
C M
C S
Expected output for above testing input:
ID Col_M Col_K
C 1 0 # A has 1 M in Col and 0 K in Col. S value is ignored
A possible data.table implementation could be first filter by c("M", "K"), then add these level (in case they aren't present like in your second case), then running dcast while specifying drop = FALSE, fill = 0L (for the cases when one of the desired levels is missing) while specifying fun = length (in order to count).
Testing on both data sets
library(data.table)
### First example
df <- fread("ID Col
A M
A M
A M
B K
B M")
dcast(df[Col %in% c("M", "K")], # Work only with c("M", "K")
ID ~ factor(Col, levels = union(unique(Col), c("M", "K"))), # Add missing levels
drop = FALSE, # Keep missing levels in output
fill = 0L, # Fill missing values with zeroes instead of NAs
fun = length) # Count. you can also specify 'value.var'
# ID M K
# 1: A 3 0
# 2: B 1 1
### Second example
df <- fread("ID Col
C M
C S")
dcast(df[Col %in% c("M", "K")],
ID ~ factor(Col, levels = union(unique(Col), c("M", "K"))),
drop = FALSE,
fill = 0L,
fun = length)
# ID M K
# 1: C 1 0
I am not sure how big is your data and how flexible the expected code should be, but I have this:
zz = '
ID Col
A M
A M
A M
B K
B M
'
df <- read.table(text = zz, header = TRUE)
col = as.data.frame(table(df))
out <- reshape(col, idvar = "ID",
timevar = "Col", direction = "wide")
out
which gives you:
> out
ID Freq.K Freq.M
1 A 0 3
2 B 1 1
And for the second data frame:
yy = '
ID Col
C M
C S
'
df1 <- read.table(text = yy, header = TRUE)
col1 = as.data.frame(table(df1))
out1 <- reshape(col1, idvar = "ID",
timevar = "Col", direction = "wide")
out1
you get:
> out1
ID Freq.M Freq.S
1 C 1 1
Then just merge them together and delete the redundant:
ss = merge(out1, out, all.y = T, all.x = T)
ss
ID Freq.M Freq.S Freq.K
1 C 1 1 NA
2 A 3 NA 0
3 B 1 NA 1
> library(data.table)
> dt=NULL
> dt$ID=c("A","A","A","B","B")
> dt$Col=c("M","M","M","K","M")
> dt=data.frame(dt)
> dt=data.table(dt)
> dt
ID Col
1: A M
2: A M
3: A M
4: B K
5: B M
> a=dt[Col=="M",sum(.N),ID]
> b=dt[Col=="K",sum(.N),ID]
> a
ID V1
1: A 3
2: B 1
> b
ID V1
1: B 1
> setkey(a,ID)
> setkey(b,ID)
> m=b[a]
> m
ID V1 i.V1
1: A NA 3
2: B 1 1
> names(m)=c("ID","Col_K","Col_M")
> m
ID Col_K Col_M
1: A NA 3
2: B 1 1

Add (not merge!) two data frames with unequal rows and columns

I want to efficiently sum the entries of two data frames, though the data frames are not guaranteed to have the same dimensions or column names. Merge isn't really what I'm after here. Instead I want to create an output object with all of the row and column names that belong to either of the added data frames. In each position of that output, I want to use the following logic for the computed value:
If a row/column pairing belongs to both input data frames I want the output to include their sum
If a row/column pairing belongs to just one input data frame I want to include that value in the output
If a row/column pairing does not belong to any input matrix I want to have 0 in that position in the output.
As an example, consider the following input data frames:
df1 = data.frame(x = c(1,2,3), y = c(4,5,6))
rownames(df1) = c("a", "b", "c")
df2 = data.frame(x = c(7,8), z = c(9,10), w = c(2, 3))
rownames(df2) = c("a", "d")
> df1
x y
a 1 4
b 2 5
c 3 6
> df2
x z w
a 7 9 2
d 8 10 3
I want the final result to be
> df2
x y z w
a 8 4 9 2
b 2 5 0 0
c 3 6 0 0
d 8 0 10 3
What I've done so far -
bind_rows / bind_cols in dplyr can throw the following:
"Error: incompatible number of rows (3, expecting 2)"
I have duplicated column names, so 'merge' isn't working for my purposes either - returns an empty df for some reason.
Seems like you could merge on the rownames, then take care of the sums and conversion of NA to zero with some additional munging:
library(dplyr)
df.new = df1 %>% add_rownames %>%
full_join(df2 %>% add_rownames, by="rowname") %>%
mutate_each(funs(replace(., which(is.na(.)), 0))) %>%
mutate(x = x.x + x.y) %>%
select(rowname,x,y,z,w)
Or, with #DavidArenburg's much more elegant and extensible solution:
df.new = df1 %>% add_rownames %>%
full_join(df2 %>% add_rownames) %>%
group_by(rowname) %>%
summarise_each(funs(sum(., na.rm = TRUE)))
df.new
rowname x y z w
1 a 8 4 9 2
2 b 2 5 0 0
3 c 3 6 0 0
4 d 8 0 10 3
This seems like some type of a simple merge on common column names (+ row names) and then a simple aggregation, this is how I would tackle this
library(data.table)
merge(setDT(df1, keep.rownames = TRUE), # Convert to data.table + keep rows
setDT(df2, keep.rownames = TRUE), # Convert to data.table + keep rows
by = intersect(names(df1), names(df2)), # merge on common column names
all = TRUE)[, lapply(.SD, sum, na.rm = TRUE), by = rn] # Sum all columns by group
# rn x y z w
# 1: a 8 4 9 2
# 2: b 2 5 0 0
# 3: c 3 6 0 0
# 4: d 8 0 10 3
Are a pretty straight forward base R solution
df1$rn <- row.names(df1)
df2$rn <- row.names(df2)
res <- merge(df1, df2, all = TRUE)
rowsum(res[setdiff(names(res), "rn")], res[, "rn"], na.rm = TRUE)
# x y z w
# a 8 4 9 2
# b 2 5 0 0
# c 3 6 0 0
# d 8 0 10 3
First, I would grab the names of all the rows and columns of the new entity:
(all.rows <- unique(c(row.names(df1), row.names(df2))))
# [1] "a" "b" "c" "d"
(all.cols <- unique(c(names(df1), names(df2))))
# [1] "x" "y" "z" "w"
Then I would construct an output matrix with those rows and column names (with matrix data initialized to all 0s), adding df1 and df2 to the relevant parts of that matrix.
out <- matrix(0, nrow=length(all.rows), ncol=length(all.cols))
rownames(out) <- all.rows
colnames(out) <- all.cols
out[row.names(df1),names(df1)] <- unlist(df1)
out[row.names(df2),names(df2)] <- out[row.names(df2),names(df2)] + unlist(df2)
out
# x y z w
# a 8 4 9 2
# b 2 5 0 0
# c 3 6 0 0
# d 8 0 10 3
Using xtabs on melted / stacked data frames:
out <- rbind(cbind(rn=rownames(df1),stack(df1)), cbind(rn=rownames(df2),stack(df2)))
as.data.frame.matrix(xtabs(values ~ rn + ind, data=out))
# x y w z
#a 8 4 2 9
#b 2 5 0 0
#c 3 6 0 0
#d 8 0 3 10
I’m not convinced the accepted (or alternative merge) method is the best. It will give incorrect results if you have common rows, they’ll get joined and not summed.
This can be shown trivialy by changing df2 to:
df2 = data.frame(x = c(1,2), y = c(4,5), z = c(9,10), w = c(2, 3))
rownames(df2) = c("a", "d")
expected results:
rn x y z w
1: a 2 8 9 2
2: b 2 5 0 0
3: c 3 6 0 0
4: d 2 5 10 3
actual results
merge(setDT(df1, keep.rownames = TRUE),
setDT(df2, keep.rownames = TRUE),
by = intersect(names(df1), names(df2)),
all = TRUE)[, lapply(.SD, sum, na.rm = TRUE), by = rn]
rn x y z w
1: a 1 4 9 2
2: b 2 5 0 0
3: c 3 6 0 0
4: d 2 5 10 3
You need to combine both the outer join with an inner join (or left/right joins, merge all=T/all=F). Or alternatively using plyr’s rbind.fill :
base R solution
res <- rbind.fill(df1,df2)
rowsum(res[setdiff(names(res), "rn")], res[, "rn"], na.rm = TRUE)
data table solution
as.data.table(rbind.fill(
setDT(df1, keep.rownames = TRUE),
setDT(df2, keep.rownames = TRUE)
))[, lapply(.SD, sum, na.rm = TRUE), by = rn]
I prefer the rbind.fill method as you can "merge" > 2 data frames using the same syntax.

Resources