R sum of aggregate columns found in another column

Given this data with the first 4 columns (rowid, order, line, special), I need to create the numSpecial column, like so:
rowid order line special numSpecial
    1     A   01       X          1
    2     B   01                  0
    3     B   02       X          2
    4     B   03       X          2
    5     C   01       X          1
    6     C   02                  0
Here numSpecial is the count of special (value = X) lines within each order, given that the order-line is special itself; otherwise it's 0.
I first tried adding a column that simply concatenates order with 'X'; call it orderX. It would look like:
orderX
AX
BX
BX
BX
CX
CX
Then sum the matches of order pasted with special against orderX:
df$numSpecial <- sum(paste(order, special, sep = "") %in% orderX)
But that doesn't work; it returns the sum of the matches across all rows for every order:
numSpecial
4
4
4
4
4
4
I then tried as.data.table, but I'm not getting the expected results using:
as.data.table(mydf)[, numSpecial := sum(paste(order, special, sep = "") %in% orderX), by = rowid]
However, that returns just the per-row 0/1 flag and not the sums:
numSpecial
1
0
1
1
1
0
Where am I going wrong with these? I don't think I should have to create that orderX column either, but I can't figure out the right way to get this count. It's similar to a COUNTIF in Excel, which is easy to do.

There are probably several ways, but you could just multiply the per-order count by a TRUE/FALSE flag of "X" being present (your by = rowid attempt fails because each rowid group holds a single row, so the sum can never exceed 1):
dat[, numSpecial := sum(special == "X") * (special == "X"), by=order]
dat
#   rowid order line special numSpecial
#1:     1     A    1       X          1
#2:     2     B    1                  0
#3:     3     B    2       X          2
#4:     4     B    3       X          2
#5:     5     C    1       X          1
#6:     6     C    2                  0
You could also initialise the column to zero and then fill in only the special rows with the group count:
dat[, numSpecial := 0L][special == "X", numSpecial := .N, by=order]
Where dat was:
library(data.table)
dat <- structure(list(rowid = 1:6, order = c("A", "B", "B", "B", "C",
"C"), line = c(1L, 1L, 2L, 3L, 1L, 2L), special = c("X", "",
"X", "X", "X", "")), .Names = c("rowid", "order", "line", "special"
), row.names = c(NA, -6L), class = "data.frame")
setDT(dat)

You could use ave with a dummy variable (just filled with 1s):
df$numSpecial <- ifelse(df$special == "X",
                        ave(rep(1, nrow(df)), df$order, df$special, FUN = length),
                        0)
df
#  rowid order line special numSpecial
#1     1     A    1       X          1
#2     2     B    1                  0
#3     3     B    2       X          2
#4     4     B    3       X          2
#5     5     C    1       X          1
#6     6     C    2                  0
Note I read in your data without the numSpecial column.

Using the dplyr package:
library(dplyr)
df %>%
  group_by(order) %>%
  mutate(numSpecial = ifelse(special == "X", sum(special == "X"), 0))
  rowid order special numSpecial
1     1     A       X          1
2     2     B                  0
3     3     B       X          2
4     4     B       X          2
5     5     C       X          1
6     6     C                  0
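One usage note: mutate returns a modified copy rather than changing df in place, so assign the result back (and ungroup) if you want to keep the new column. A minimal sketch:
library(dplyr)
df <- df %>%
  group_by(order) %>%
  mutate(numSpecial = ifelse(special == "X", sum(special == "X"), 0)) %>%
  ungroup()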

One other option using base R only would be to use aggregate:
# Your data
df <- data.frame(rowid = 1:6, order = c("A", "B", "B", "B", "C", "C"), special = c("X", "", "X", "X", "X", ""))
# Make the counts
dat <- with(df, aggregate(x = list(answer = special),
                          by = list(order = order, special = special),
                          FUN = function(x) sum(x == "X")))
# Merge back to original dataset:
dat.fin <- merge(df,dat,by=c('order','special'))
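Note that merge sorts the result by the join keys, so the rows come back ordered by order and special rather than by rowid; if you want the original ordering, re-sort afterwards:
# restore the original row order after the merge
dat.fin <- dat.fin[order(dat.fin$rowid), ]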

Related

Split df column of integers into individual digits in R

I have a df where one variable is an integer. I'd like to split this column into its individual digits. See my example below.
Group Number
    A    456
    B      3
    C     18
To
Group Number Digit1 Digit2 Digit3
    A    456      4      5      6
    B      3      3     NA     NA
    C     18      1      8     NA
We can use read.fwf from base R. Find the maximum number of characters (nchar) in the 'Number' column (mx), read the 'Number' column as fixed-width input after converting it to character (as.character), specify the widths as 1 replicated mx times, and assign the output to new 'Digit' columns in the data.
mx <- max(nchar(df1$Number))
df1[paste0("Digit", seq_len(mx))] <- read.fwf(
  textConnection(as.character(df1$Number)), widths = rep(1, mx))
Output:
df1
#  Group Number Digit1 Digit2 Digit3
#1     A    456      4      5      6
#2     B      3      3     NA     NA
#3     C     18      1      8     NA
data
df1 <- structure(list(Group = c("A", "B", "C"), Number = c(456L, 3L,
18L)), class = "data.frame", row.names = c(NA, -3L))
Another base R option (I think @akrun's approach using read.fwf is much simpler)
cbind(
  df,
  with(
    df,
    type.convert(
      `colnames<-`(
        do.call(
          rbind,
          lapply(
            strsplit(as.character(Number), ""),
            `length<-`, max(nchar(Number))
          )
        ),
        paste0("Digit", seq(max(nchar(Number))))
      ),
      as.is = TRUE
    )
  )
)
which gives
  Group Number Digit1 Digit2 Digit3
1     A    456      4      5      6
2     B      3      3     NA     NA
3     C     18      1      8     NA
Using splitstackshape::cSplit
splitstackshape::cSplit(df, 'Number', sep = '', stripWhite = FALSE, drop = FALSE)
#   Group Number Number_1 Number_2 Number_3
#1:     A    456        4        5        6
#2:     B      3        3       NA       NA
#3:     C     18        1        8       NA
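cSplit returns a data.table, and the new columns follow splitstackshape's Number_* naming; if you want the Digit* names from the question, you could rename them afterwards (a small sketch, assuming three digit columns as in the example):
library(data.table)
out <- splitstackshape::cSplit(df, "Number", sep = "", stripWhite = FALSE, drop = FALSE)
setnames(out, paste0("Number_", 1:3), paste0("Digit", 1:3))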
Updated
I realized I could use the max function to find the character-count limit across rows, so that I could include it in my map2 call and save some lines of code, thanks to an accident that led to an inspiration from dear @ThomasIsCoding.
library(dplyr)
library(tidyr)
library(purrr)
library(stringr)
df %>%
  rowwise() %>%
  mutate(map2_dfc(Number, 1:max(nchar(Number)), ~ str_sub(.x, .y, .y))) %>%
  unnest(cols = !c(Group, Number)) %>%
  rename_with(~ str_replace(., "\\.\\.\\.", "Digit"), .cols = !c(Group, Number)) %>%
  mutate(across(!c(Group, Number), as.numeric))
# A tibble: 3 x 5
  Group Number Digit1 Digit2 Digit3
  <chr>  <dbl>  <dbl>  <dbl>  <dbl>
1 A        456      4      5      6
2 B          3      3     NA     NA
3 C         18      1      8     NA
Data
df <- tribble(
  ~Group, ~Number,
  "A", 456,
  "B", 3,
  "C", 18
)
Two base R methods:
no_cols <- max(nchar(as.character(df1$Number)))
# Using `strsplit()`:
cbind(df1, setNames(data.frame(do.call(rbind,
  lapply(strsplit(as.character(df1$Number), ""),
         function(x) {
           length(x) <- no_cols
           x
         }))),
  paste0("Digit", seq_len(no_cols))))
# Using `regmatches()` and `gregexpr()`:
cbind(df1, setNames(data.frame(do.call(rbind,
  lapply(regmatches(df1$Number, gregexpr("\\d", df1$Number)),
         function(x) {
           length(x) <- no_cols
           x
         }))),
  paste0("Digit", seq_len(no_cols))))
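Both variants share the same pad-to-length step, so you could also factor it out into a helper first (a small sketch; pad is a hypothetical name):
# pad a vector with NA up to a fixed length
pad <- function(x, n) { length(x) <- n; x }
digits <- lapply(strsplit(as.character(df1$Number), ""), pad, n = no_cols)
cbind(df1, setNames(data.frame(do.call(rbind, digits)),
                    paste0("Digit", seq_len(no_cols))))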

detecting sequence by group and compute new variable for the subset

I need to detect a sequence by group in a data.frame and compute a new variable.
Consider I have this following data.frame:
df1 <- data.frame(ID = c(1,1,1,1,1,1,1,2,2,2,3,3,3,3),
                  seqs = c(1,2,3,4,5,6,7,1,2,3,1,2,3,4),
                  count = c(2,1,3,1,1,2,3,1,2,1,3,1,4,1),
                  product = c("A", "B", "C", "C", "A,B", "A,B,C", "D",
                              "A", "B", "A", "A", "A,B,C", "D", "D"),
                  stock = c("A", "A,B", "A,B,C", "A,B,C", "A,B,C", "A,B,C", "A,B,C,D",
                            "A", "A,B", "A,B", "A", "A,B,C", "A,B,C,D", "A,B,C,D"))
> df1
   ID seqs count product   stock
1   1    1     2       A       A
2   1    2     1       B     A,B
3   1    3     3       C   A,B,C
4   1    4     1       C   A,B,C
5   1    5     1     A,B   A,B,C
6   1    6     2   A,B,C   A,B,C
7   1    7     3       D A,B,C,D
8   2    1     1       A       A
9   2    2     2       B     A,B
10  2    3     1       A     A,B
11  3    1     3       A       A
12  3    2     1   A,B,C   A,B,C
13  3    3     4       D A,B,C,D
14  3    4     1       D A,B,C,D
I am interested in computing a measure for IDs that follow this sequence:
- Count == 1
- Count > 1
- Count == 1
In the example this is true for:
- rows 2, 3, 4 for `ID==1`
- rows 8, 9, 10 for `ID==2`
- rows 12, 13, 14 for `ID==3`
For these IDs and rows, I need to compute a measure called new that takes the value of the product in the last row of the sequence, if that product appears in the product of the second row of the sequence and NOT in the stock of the first row of the sequence.
The desired outcome is shown below:
> output
  ID seq1 seq2 seq3 new
1  1    2    3    4   C
2  2    1    2    3
3  3    2    3    4   D
Note:
In the sequences detected for an ID, no new products are added to the stock.
In the original data there are a lot of IDs who do not have any sequences.
Some ID have multiple qualifying sequences. All should be recorded.
Count is always 1 or greater.
The original data holds millions of ID with up to 1500 sequences.
How would you write an efficient piece of code to get this output?
Here's a data.table option:
library(data.table)
char_cols <- c("product", "stock")
setDT(df1)[,
(char_cols) := lapply(.SD, as.character),
.SDcols = char_cols] # in case they're factors
df1[, c1 := (count == 1) &
(shift(count) > 1) &
(shift(count, 2L) == 1),
by = ID] #condition1
df1[, pat := paste0("(", gsub(",", "|", product), ")")] # pattern
df1[, c2 := mapply(grepl, pat, shift(product)) &
!mapply(grepl, pat, shift(stock, 2L)),
by = ID] # condition2
df1[(c1), new := ifelse(c2, product, "")] # create new column
df1[, paste0("seq", 1:3) := shift(seqs, 2:0)] # create seq columns
df1[(c1), .(ID, seq1, seq2, seq3, new)] # result
Here's another approach using the tidyverse; however, I think lag and lead have made this solution a bit time-consuming. I included comments within the code to make it more legible.
But I spent enough time on it to post it anyway.
library(tidyverse)
df1 %>% group_by(ID) %>%
# this finds the row with count > 1 which ...
#... the counts of the row before and the one of after it equals to 1
mutate(test = (count > 1 & c(F, lag(count==1)[-1]) & c(lead(count==1)[-n()],F))) %>%
# this makes a column which has value of True for each chunk...
#that meets desired condition to later filter based on it
mutate(test2 = test | c(F,lag(test)[-1]) | c(lead(test)[-n()], F)) %>%
filter(test2) %>% ungroup() %>%
# group each three occurrences in case of having multiple ones within each ID
group_by(G=trunc(3:(n()+2)/3)) %>% group_by(ID,G) %>%
# creating new column with string extracting techniques ...
#... (assuming those columns are characters)
mutate(new=
str_remove_all(
as.character(regmatches(stock[2], gregexpr(product[3], stock[2]))),
stock[1])) %>%
# selecting desired columns and adding times for long to wide conversion
select(ID,G,seqs,new) %>% mutate(times = 1:n()) %>% ungroup() %>%
# long to wide conversion using tidyr (part of tidyverse)
gather(key, value, -ID, -G, -new, -times) %>%
unite(col, key, times) %>% spread(col, value) %>%
# making the desired order of columns
select(-G,-new,new) %>% as.data.frame()
# ID seqs_1 seqs_2 seqs_3 new
# 1 1 2 3 4 C
# 2 2 1 2 3
# 3 3 2 3 4 D

Check whether a vector element of one value is placed between vector elements of two other values in R

I did not find any method for checking whether categorical elements of a vector sit between elements of two other categorical values.
A dataframe is given:
id letter
 1      B
 2      A
 3      B
 4      B
 5      C
 6      B
 7      A
 8      B
 9      C
Everything I found relates to numerical values and to a general notion of order (rather than to the index of an element in a specific vector).
I want to add a new column of boolean values to the dataframe (1 if B is between A and C; 0 if B is between C and A):
id letter between
 1      B       0
 2      A      NA
 3      B       1
 4      B       1
 5      C      NA
 6      B       0
 7      A      NA
 8      B       1
 9      C      NA
A combination of rle (run length encoding) and zoo::rollapply is one option:
library(zoo)
d <- structure(list(id = 1:9,
                    letter = structure(c(2L, 1L, 2L, 2L, 3L, 2L, 1L, 2L, 3L),
                                       .Label = c("A", "B", "C"),
                                       class = "factor")),
               class = "data.frame", row.names = c(NA, -9L))
rl <- rle(as.numeric(d$letter))
rep(rollapply(c(NA, rl$values, NA),
              3,
              function(x) if (x[2] == 2)
                ifelse(x[1] == 1 && x[3] == 3, 1, 0)
              else NA),
    rl$lengths)
# [1] 0 NA 1 1 NA 0 NA 1 NA
Explanation
With rle you identify blocks of consecutive values.
With rollapply you "roll" a function with a given window size (here 3) over a vector.
Our vector rl$values contains one entry per run, and the function we apply to it is pretty straightforward:
if the second element is anything but a 2 (corresponding to B), return NA
if the second element is a 2, return 1 if element 1 is an A (1) and element 3 is a C (3), and 0 otherwise
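To see what the window function actually iterates over, it can help to print the run-length encoding of the example data:
rl <- rle(as.numeric(d$letter))
rl$values  # 2 1 2 3 2 1 2 3 -- one entry per run (B A B C B A B C)
rl$lengths # 1 1 2 1 1 1 1 1 -- the two consecutive Bs collapse into one run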
A different tidyverse possibility could be:
df %>%
group_by(grp = with(rle(letter), rep(seq_along(lengths), lengths))) %>%
filter(row_number() == 1) %>%
ungroup() %>%
mutate(res = ifelse(lag(letter, default = first(letter)) == "A" &
lead(letter, default = last(letter)) == "C", 1, 0)) %>%
select(-letter, -grp) %>%
full_join(df, by = c("id" = "id")) %>%
arrange(id) %>%
fill(res) %>%
mutate(res = ifelse(letter != "B", NA, res))
     id   res letter
  <int> <dbl> <chr>
1     1     0 B
2     2    NA A
3     3     1 B
4     4     1 B
5     5    NA C
6     6     0 B
7     7    NA A
8     8     1 B
9     9    NA C
In this case it first groups by a run-length type ID and keeps the first row of each group. Second, it checks the condition. Third, it performs a full join with the original df on the "id" column. Finally, it arranges by "id", fills the missing values and assigns NA to the rows where letter != "B".
It's unclear from the question whether "A" and "C" must alternate, though that's implied because there is no coding for "B" between "A" and "A" or vice versa. Supposing that they do, for the vector
x = c("B", "A", "B", "B", "C", "B", "A", "B", "C")
map to numeric values c(A=1, B=0, C=-1) and form the cumulative sum
v = cumsum(c(A=1, B=0, C=-1)[x])
(increment by 1 when encountering "A", decrement by 1 when encountering "C"). Replace positions not corresponding to "B" with NA
v[x != "B"] = NA
giving
> v
B A B B C B A B C
0 NA 1 1 NA 0 NA 1 NA
This could be captured as a function
fun = function(x, map = c(A = 1, B = 0, C = -1)) {
  x = map[x]
  v = cumsum(x)
  v[x != 0] = NA
  v
}
and used to transform a data.frame or tibble, e.g.,
tibble(x) %>% mutate(v = fun(x))
Here's one solution, which I hope is fairly easy conceptually. For 'special' cases, such as B being at the top or bottom of the list or having an A or a C on both sides, I've set the value to 0.
# Create dummy data - you use your own
df <- data.frame(id=1:100, letter=sample(c("A", "B", "C"), 100, replace=T))
# Copy down info on whether A or C is above each B
acup <- df$letter
for (i in 2:nrow(df))
  if (df$letter[i] == "B")
    acup[i] <- acup[i - 1]
# Copy up info on whether A or C is below each B
acdown <- df$letter
for (i in (nrow(df) - 1):1)
  if (df$letter[i] == "B")
    acdown[i] <- acdown[i + 1]
# Set appropriate values for column 'between'
df$between <- NA
df$between[acup == "A" & acdown == "C"] <- 1
df$between[df$letter == "B" & is.na(df$between)] <- 0 # Includes special cases
You can use the lead and lag functions to look at the letters before and after each row, and then mutate as below (note this relies on B-runs of length at most two, as in the example; in a longer run the middle Bs would see only other Bs and fall through to NA):
library(dplyr)
df %>%
  mutate(letter_lag = lag(letter, 1),
         letter_lead = lead(letter, 1)) %>%
  mutate(between = case_when(letter_lag == "A" | letter_lead == "C" ~ 1,
                             letter_lag == "C" | letter_lead == "A" ~ 0,
                             TRUE ~ NA_real_)) %>%
  select(id, letter, between)
  id letter between
1  1      B       0
2  2      A      NA
3  3      B       1
4  4      B       1
5  5      C      NA
6  6      B       0
7  7      A      NA
8  8      B       1
9  9      C      NA

How to delete all rows that contain a certain value, regardless of which column it is in

I need to delete all rows that contain a value of 2 or -2 in any column except column one.
Example dataframe:
df
a    b  c  d
zzz  2  2 -1
yyy  1  1  1
xxx  1 -1 -2
Desired output:
df
a   b c d
yyy 1 1 1
I have tried
df <- df[!grepl(-2 | 2, df),]
df <- subset(df, !df[-1] == 2 |!df[-1] == -2)
My actual dataset has over 300 rows and 70 variables.
I believe I need to use some sort of apply function but I am not sure.
Any help is appreciated please let me know if you need more info.
We can create a logical index by comparing the absolute values in the dataset with 2 and taking the row-wise sum; for rows with no matches the sum is 0 (negating with ! then returns TRUE for those 0 rows and FALSE for the others), and we subset based on that logical index:
df[!rowSums(abs(df[-1])==2),]
# a b c d
#2 yyy 1 1 1
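To see how the index is built, you can inspect the intermediate steps (using the df from the data block below):
abs(df[-1]) == 2           # logical matrix: which cells are +/- 2
#       b     c     d
# 1  TRUE  TRUE FALSE
# 2 FALSE FALSE FALSE
# 3 FALSE FALSE  TRUE
rowSums(abs(df[-1]) == 2)  # 2 0 1 -- rows where this is 0 are kept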
Another option is to do the comparison within each column using lapply, collapse the results into a single logical vector with |, and use that to subset the rows:
df[!Reduce(`|`,lapply(abs(df[-1]), `==`, 2)),]
# a b c d
#2 yyy 1 1 1
We could also do this with tidyverse
library(tidyverse)
df %>%
  select(-1) %>%           # remove the first column
  map(~ abs(.) == 2) %>%   # do the columnwise comparison
  reduce(`|`) %>%          # reduce it to a logical vector
  `!` %>%                  # negate TRUE/FALSE to FALSE/TRUE
  df[., ]                  # subset the rows of the original dataset
# a b c d
# 2 yyy 1 1 1
data
df <- structure(list(a = c("zzz", "yyy", "xxx"), b = c(2L, 1L, 1L),
c = c(2L, 1L, -1L), d = c(-1L, 1L, -2L)), .Names = c("a",
"b", "c", "d"), class = "data.frame", row.names = c(NA, -3L))
Option with dplyr:
library(dplyr)
a <- c("zzz","yyy","xxx")
b <- c(2,1,1)
c <- c(2,1,-1)
d <- c(-1,1,-2)
df <- data.frame(a,b,c,d)
filter(df,((abs(b) != 2) & (abs(c) != 2) & (abs(d) != 2)))
a b c d
1 yyy 1 1 1
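With 70 variables, spelling out every column by hand doesn't scale; if_all() (available in dplyr >= 1.0.4) applies the same condition across all columns except the first (a sketch under that assumption):
library(dplyr)
df %>% filter(if_all(-a, ~ abs(.x) != 2))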

Count strings with a certain condition

I have the following dataset
#mydata
Factors Transactions
a,c                2
b                  0
c                  0
d,a                0
a                  1
a                  0
b                  1
I'd like to count the factors that had transactions. For example, "a" occurred twice with a transaction. I can write code that gives my desired outcome for each variable separately. The following is for "a":
nrow (subset (mydata,mydata$Transaction > 0 & length(mydata[grep("a", mydata$Factors),] )> 0))
But I have too many variables and do not want to repeat the code for all of them. There should be a way to write code that gives the results for all of the variables at once. I wish to have the following output:
#Output
a 2
b 1
c 1
d 0
Equivalent data.table option:
library(data.table)
setDT(df)[, .(Factors = unlist(strsplit(as.character(Factors), ","))),
          by = Transactions][, .(Transactions = sum(Transactions > 0)), by = Factors]
# Factors Transactions
#1: a 2
#2: c 1
#3: b 1
#4: d 0
You could create a table using the unique values of the Factor column as the levels. Consider df to be your data set.
s <- strsplit(as.character(df$Factors), ",", fixed = TRUE)
table(factor(unlist(s[df$Transactions > 0]), levels = unique(unlist(s))))
#
# a c b d
# 2 1 1 0
Wrap in as.data.frame() for data frame output.
with(df, {
s <- strsplit(as.character(Factors), ",", fixed = TRUE)
f <- factor(unlist(s[Transactions > 0]), levels = unique(unlist(s)))
as.data.frame(table(Factors = f))
})
# Factors Freq
# 1 a 2
# 2 c 1
# 3 b 1
# 4 d 0
With tidyverse packages, assuming your data is strings/factors and numbers,
library(tidyr)
library(dplyr)
# separate factors with two elements
df %>% separate_rows(Factors) %>%
# set grouping for aggregation
group_by(Factors) %>%
# for each group, count how many transactions are greater than 0
summarise(Transactions = sum(Transactions > 0))
## # A tibble: 4 x 2
## Factors Transactions
## <chr> <int>
## 1 a 2
## 2 b 1
## 3 c 1
## 4 d 0
You could also avoid dplyr by using xtabs, though some cleaning is necessary to get to the same arrangement:
library(tidyr)
df %>% separate_rows(Factors) %>%
xtabs(Transactions > 0 ~ Factors, data = .) %>%
as.data.frame() %>%
setNames(names(df))
## Factors Transactions
## 1 a 2
## 2 b 1
## 3 c 1
## 4 d 0
A full base R equivalent:
df2 <- do.call(rbind,
Map(function(f, t){data.frame(Factors = strsplit(as.character(f), ',')[[1]],
Transactions = t)},
df$Factors, df$Transactions))
df3 <- as.data.frame(xtabs(Transactions > 0 ~ Factors, data = df2))
names(df3) <- names(df)
df3
## Factors Transactions
## 1 a 2
## 2 b 1
## 3 c 1
## 4 d 0
We can use cSplit from splitstackshape to split 'Factors' into 'long' format and, grouped by 'Factors', take the sum of the logical column (Transactions > 0).
library(splitstackshape)
cSplit(df1, "Factors", ",", "long")[, .(Transactions=sum(Transactions > 0)),.(Factors)]
# Factors Transactions
#1: a 2
#2: c 1
#3: b 1
#4: d 0
Or using base R
with(df1, table(factor(unlist(strsplit(Factors[Transactions > 0], ",")),
                       levels = letters[1:4])))
# a b c d
# 2 1 1 0
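Note that levels = letters[1:4] hardcodes the factor levels; if the real data uses other labels, you can derive the levels from the column itself:
levs <- sort(unique(unlist(strsplit(df1$Factors, ","))))
with(df1, table(factor(unlist(strsplit(Factors[Transactions > 0], ",")),
                       levels = levs)))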
data
df1 <- structure(list(Factors = c("a,c", "b", "c", "d,a", "a", "a",
"b"), Transactions = c(2L, 0L, 0L, 0L, 1L, 0L, 1L)), .Names = c("Factors",
"Transactions"), class = "data.frame", row.names = c(NA, -7L))
