How can I compare each row's value with the previous row's value, dynamically for every column of a data frame, within groups defined by a specific ID?
My data frame:
ID ITEM1 ITEM2 ITEM3
1 A A A
2 C B C
1 A B C
1 B A C
2 NA B F
3 A A D
4 R G J
4 H T J
For example, the intermediate result with per-column change indicators should be:
ID ITEM1 ITEM2 ITEM3 ITEM1change ITEM2change ITEM3change
1 A A A 0 0 0
1 A B C 0 1 1
1 B A C 1 1 0
2 C B C 0 0 0
2 NA B F 1 0 1
3 A A D 0 0 0
4 R G J 0 0 0
4 H T J 1 1 0
My final output will be:
Field modifiedcount unmodifiedcount Total
ITEM1change 3 5 8
ITEM2change 3 5 8
ITEM3change 2 6 8
My data:
structure(list(ID = c(1, 2, 1, 1, 2, 3, 4, 4), ITEM1 = structure(c(1L,
3L, 1L, 2L, NA, 1L, 5L, 4L), .Label = c("A", "B", "C", "H", "R"
), class = "factor"), ITEM2 = structure(c(1L, 2L, 2L, 1L, 2L,
1L, 3L, 4L), .Label = c("A", "B", "G", "T"), class = "factor"),
ITEM3 = structure(c(1L, 2L, 2L, 2L, 4L, 3L, 5L, 5L), .Label = c("A",
"C", "D", "F", "J"), class = "factor")), .Names = c("ID",
"ITEM1", "ITEM2", "ITEM3"), row.names = c(NA, -8L), class = "data.frame")
A possible solution:
library(dplyr)
library(tidyr)
df %>%
  gather(item, value, -1) %>%
  group_by(ID, item) %>%
  mutate(change = lag(value, default = first(value)) != value,
         change = replace(change, is.na(change), TRUE)) %>%
  group_by(item) %>%
  summarise(modified = sum(change, na.rm = TRUE),
            unmodified = sum(!change, na.rm = TRUE)) %>%
  mutate(total = modified + unmodified)
which gives:
# A tibble: 3 x 4
item modified unmodified total
<chr> <int> <int> <int>
1 ITEM1 3 5 8
2 ITEM2 3 5 8
3 ITEM3 2 6 8
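To see what the lag() comparison does within a single group, here is a minimal illustration on a toy vector of my own (not part of the original answer):
library(dplyr)
v <- c("A", "A", "B")
lag(v, default = first(v))       # "A" "A" "A": the first row is compared with itself
lag(v, default = first(v)) != v  # FALSE FALSE TRUE: only the third row changed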
Here is another idea using rollapply from zoo. By using rollapply with width = 2, we test whether each element differs from the previous one. Wrapping it in as.integer gives 1s (TRUE) and 0s (FALSE). We then replace all NAs with 1, since you consider them as modified, and use colSums to sum the modified/unmodified elements. The total is just the number of rows of the original data frame.
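As a quick sanity check of the rolling comparison on a toy numeric vector (my own illustration, not from the answer):
library(zoo)
rollapply(c(1, 1, 2), 2, function(k) k[1] != k[2])
# [1] FALSE  TRUE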
library(zoo)
m1 <- do.call(rbind, lapply(split(df, df$ID), function(i)
  sapply(i[-1], function(j)
    as.integer(c(FALSE, rollapply(j, 2, function(k) k[1] != k[2]))))))
m1 <- replace(m1, is.na(m1), 1)
#giving
# ITEM1 ITEM2 ITEM3
# 0 0 0
# 0 1 1
# 1 1 0
# 0 0 0
# 1 0 1
#3 0 0 0
# 0 0 0
# 1 1 0
To get your expected data frame,
final_df <- data.frame(modified = colSums(m1 == 1),
                       unmodified = colSums(m1 != 1),
                       Total = nrow(df), stringsAsFactors = FALSE)
which gives,
modified unmodified Total
ITEM1 3 5 8
ITEM2 3 5 8
ITEM3 2 6 8
The given data has many columns of the same type. This strongly suggests that the data would be better stored in long format rather than in wide format.
Jaap's solution is reshaping the data using tidyr / dplyr.
However, I would like to suggest a data.table solution which does not reshape the data. In addition, it avoids having to handle NA values separately.
library(data.table)
# coerce to data.table, loop over columns and determine changes to previous row by ID
tmp <- setDT(DF)[, lapply(.SD, function(x) x == shift(x, fill = x[1])), by = ID]
tmp
ID ITEM1 ITEM2 ITEM3
1: 1 TRUE TRUE TRUE
2: 1 TRUE FALSE FALSE
3: 1 FALSE FALSE TRUE
4: 2 TRUE TRUE TRUE
5: 2 NA TRUE FALSE
6: 3 TRUE TRUE TRUE
7: 4 TRUE TRUE TRUE
8: 4 FALSE FALSE TRUE
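The key piece is shift(), data.table's lag function; fill = x[1] makes the first row of each group compare equal to itself. A minimal illustration with toy values of my own:
library(data.table)
x <- c("A", "B", "B")
x == shift(x, fill = x[1])
# [1]  TRUE FALSE  TRUE   (first element TRUE by construction)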
Now, we can count the unchanged rows:
tmp[, lapply(.SD, sum, na.rm = TRUE), .SDcols = -"ID"]
ITEM1 ITEM2 ITEM3
1: 5 5 6
From here, OP's expected result can be achieved in two different ways:
using melt()
melt(tmp[, lapply(.SD, sum, na.rm = TRUE), .SDcols = -"ID"],
     measure.vars = patterns("^ITEM"),
     variable.name = "item",
     value.name = "unmodified")[
  , c("modified", "Total") := .(nrow(DF) - unmodified, nrow(DF))][]
or by transposing:
as.data.table(
  t(tmp[, lapply(.SD, sum, na.rm = TRUE), .SDcols = -"ID"]),
  keep.rownames = "item")[, setnames(.SD, "V1", "unmodified")][
  , c("modified", "Total") := .(nrow(DF) - unmodified, nrow(DF))][]
Both return the same result:
item unmodified modified Total
1: ITEM1 5 3 8
2: ITEM2 5 3 8
3: ITEM3 6 2 8
For the sake of completeness, here is also a data.table implementation of the reshape approach. As above, NAs are handled by counting the unmodified rows first, excluding any NA.
melt(setDT(DF), id.vars = "ID", variable.name = "item")[
  , value == shift(value, fill = value[1L]), by = .(ID, item)][
  , .(unmodified = sum(V1, na.rm = TRUE)), by = item][
  , c("modified", "Total") := .(nrow(DF) - unmodified, nrow(DF))][]
If dat is your data, then try:
Create the ITEMChange variables:
dat["ITEM1Change"] <- c(NA, head(dat["ITEM1"], dim(dat)[1] - 1)[[1]])
dat["ITEM2Change"] <- c(NA, head(dat["ITEM2"], dim(dat)[1] - 1)[[1]])
dat["ITEM3Change"] <- c(NA, head(dat["ITEM3"], dim(dat)[1] - 1)[[1]])
Then compare to see whether there are changes:
dat$ITEM1Change <- ifelse(dat$ITEM1Change == dat$ITEM1, 0, 1)
dat$ITEM2Change <- ifelse(dat$ITEM2Change == dat$ITEM2, 0, 1)
dat$ITEM3Change <- ifelse(dat$ITEM3Change == dat$ITEM3, 0, 1)
Then group and summarize
library(dplyr)
dat %>%
  group_by("ITEM1") %>%
  summarise_at(.funs = sum, .vars = "ITEM1Change") -> ITEM1Change
etc.
Is this what you need?
Related
I have two data frames that I want to merge, adding the values of one to the existing values of the other where the columns are the same.
For example:
df1
No A B C D
1  1 0 1 0
2  0 1 2 1
3  0 0 1 0
df2
No A B E F
1  1 0 1 1
2  0 1 2 1
3  2 1 1 0
Finally, I want the output table like this:
df
No A B C D E F
1  2 0 1 0 1 1
2  0 2 2 1 2 1
3  2 1 1 0 1 0
Note: I did try merge(), but in this case, it did not work.
Any help/suggestion would be appreciated.
Reproducible sample data
df1 <-
structure(list(No = 1:3, A = c(1L, 0L, 0L), B = c(0L, 1L, 0L),
C = c(1L, 2L, 1L), D = c(0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-3L))
df2 <-
structure(list(No = 1:3, A = c(1L, 0L, 2L), B = c(0L, 1L, 1L),
E = c(1L, 2L, 1L), F = c(1L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-3L))
You can also carry out this operation by left_joining these two data frames:
library(dplyr)
library(stringr)
df1 %>%
  left_join(df2, by = "No") %>%
  mutate(across(ends_with(".x"), ~ .x + get(str_replace(cur_column(), "\\.x", "\\.y")))) %>%
  rename_with(~ str_replace(., "\\.x", ""), ends_with(".x")) %>%
  select(!ends_with(".y"))
No A B C D E F
1 1 2 0 1 0 1 1
2 2 0 2 2 1 2 1
3 3 2 1 1 0 1 0
You can first row-bind the two dataframes and then compute the sum of each column while 'grouping' by the No column. This can be done like so:
library(dplyr)
bind_rows(df1, df2) %>%
  group_by(No) %>%
  summarise(across(c(A, B, C, D, E, `F`), sum, na.rm = TRUE),
            .groups = "drop")
If a particular column doesn't exist in one dataframe (i.e. columns E and F), values will be padded with NA. Adding the na.rm = TRUE argument (to be passed to sum()) means that these values will get treated like zeros.
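A minimal sketch of that NA padding (toy one-row data frames, assumed purely for illustration):
library(dplyr)
bind_rows(data.frame(No = 1, A = 1), data.frame(No = 1, E = 2))
#   No  A  E
# 1  1  1 NA
# 2  1 NA  2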
Using data.table :
library(data.table)
rbindlist(list(df1, df2), fill = TRUE)[, lapply(.SD, sum, na.rm = TRUE), No]
# No A B C D E F
#1: 1 2 0 1 0 1 1
#2: 2 0 2 2 1 2 1
#3: 3 2 1 1 0 1 0
We can use base R (with R 4.1.0). Get the objects into a list ('lst1'), then find the union of the column names ('nm1'). Loop over the list, assigning zero-valued columns (found with setdiff) for the names missing from each list element, rbind them, and use aggregate to get the sum grouped by 'No':
lst1 <- mget(ls(pattern = '^df\\d+$'))
nm1 <- lapply(lst1, names) |>
  {\(x) Reduce(union, x)}()
lapply(lst1, \(x) {x[setdiff(nm1, names(x))] <- 0; x}) |>
  {\(x) do.call(rbind, x)}() |>
  {\(dat) aggregate(. ~ No, data = dat, FUN = sum, na.rm = TRUE,
                    na.action = na.pass)}()
# No A B C D E F
#1 1 2 0 1 0 1 1
#2 2 0 2 2 1 2 1
#3 3 2 1 1 0 1 0
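The zero-padding step can be seen in isolation; assigning 0 to the missing names adds zero-filled columns (a toy illustration of my own):
x <- data.frame(No = 1, A = 1)
x[setdiff(c("No", "A", "E"), names(x))] <- 0  # adds the missing column E
x
#   No A E
# 1  1 1 0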
I am interested in identifying individuals that go from a 0 to a .5 or a 1. Here is example data:
id x
1 0
1 1
1 1
2 0
2 .5
2 .5
3 0
3 0
I want a new df that indicates whether or not each person moved from a 0 to a .5 or 1. Something like this:
id endorsed
1 TRUE
2 TRUE
3 FALSE
I tried using ifelse for this, but I just can't get the code right. Does anyone have any suggestions?
One option is to group by 'id' and check for any value in 'x' that is 0 where the next value (obtained with lead) is either 0.5 or 1 (assuming the floating-point precision is exact):
library(dplyr)
df1 %>%
  group_by(id) %>%
  summarise(endorsed = any(x == 0 & lead(x) %in% c(0.5, 1)))
-output
# A tibble: 3 x 2
# id endorsed
#* <int> <lgl>
#1 1 TRUE
#2 2 TRUE
#3 3 FALSE
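To see how the lead() comparison behaves at the end of a group, here is a toy run on id 2's values (the trailing NA from lead() drops out because NA %in% c(0.5, 1) is FALSE):
library(dplyr)
x <- c(0, 0.5, 0.5)
lead(x)                          # 0.5 0.5  NA
x == 0 & lead(x) %in% c(0.5, 1)  # TRUE FALSE FALSE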
data
df1 <- structure(list(id = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L), x = c(0,
1, 1, 0, 0.5, 0.5, 0, 0)), class = "data.frame", row.names = c(NA,
-8L))
Since they always start with zero, you can just group and ask whether they ever hit 0.5 or 1:
df1 %>% group_by(id) %>% summarize(endorsed = any(x %in% c(0.5, 1)))
A base R option using aggregate
aggregate(
  cbind(endorsed = x) ~ id,
  df1,
  function(v) head(v, 1) == 0 & tail(cummax(v), 1) %in% c(.5, 1)
)
gives
id endorsed
1 1 TRUE
2 2 TRUE
3 3 FALSE
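The cummax() trick works because the running maximum only reaches 0.5 or 1 if such a value actually occurred after the starting 0; a quick check on toy vectors:
cummax(c(0, 0.5, 0.5))  # 0.0 0.5 0.5
cummax(c(0, 0))         # 0 0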
I have a column of numbers (index) in a dataframe like the below. I am attempting to check whether these numbers ascend in increments of 1 within each group. For example, groups B and C do not ascend by 1. While I can check by sight, my dataframe is thousands of rows long, so I'd prefer to automate this. Does anyone have advice? Thank you!
group index
A 0
A 1
A 2
A 3
A 4
B 0
B 1
B 2
B 2
C 0
C 3
C 1
C 2
...
I think this works. diff calculates the difference between subsequent numbers, and then we can use all to see whether all the differences are 1. dat2 is the final output.
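For instance, on group B's index values the last difference is 0, so all() returns FALSE:
diff(c(0, 1, 2, 2))
# [1] 1 1 0
all(diff(c(0, 1, 2, 2)) == 1)
# [1] FALSE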
library(dplyr)
dat2 <- dat %>%
  group_by(group) %>%
  summarize(Result = all(diff(index) == 1)) %>%
  ungroup()
dat2
# # A tibble: 3 x 2
# group Result
# <chr> <lgl>
# 1 A TRUE
# 2 B FALSE
# 3 C FALSE
DATA
dat <- read.table(text = "group index
A 0
A 1
A 2
A 3
A 4
B 0
B 1
B 2
B 2
C 0
C 3
C 1
C 2",
header = TRUE, stringsAsFactors = FALSE)
Maybe aggregate could help
> aggregate(. ~ group, df1, function(v) all(diff(v) == 1))
group index
1 A TRUE
2 B FALSE
3 C FALSE
We can group by 'group', get the difference between the current and the previous value (shift), and check if all the differences are equal to 1.
library(data.table)
setDT(df1)[, .(Result = all((index - shift(index))[-1] == 1)), group]
# group Result
#1: A TRUE
#2: B FALSE
#3: C FALSE
data
df1 <- structure(list(group = c("A", "A", "A", "A", "A", "B", "B", "B",
"B", "C", "C", "C", "C"), index = c(0L, 1L, 2L, 3L, 4L, 0L, 1L,
2L, 2L, 0L, 3L, 1L, 2L)), class = "data.frame", row.names = c(NA,
-13L))
I want to loop through a large data frame: count how many values in the first column are >0 and remove the rows that were counted, then move on to column 2, count the number of values >0 and remove those rows, and so on...
the data frame
taxonomy A B C
1 cat 0 2 0
2 dog 5 1 0
3 horse 3 0 0
4 mouse 0 0 4
5 frog 0 2 4
6 lion 0 0 2
can be generated with
DF1 = structure(list(taxonomy = c("cat", "dog", "horse", "mouse", "frog", "lion"),
                A = c(0L, 5L, 3L, 0L, 0L, 0L), B = c(2L, 1L, 0L, 0L, 2L, 0L), C = c(0L, 0L, 0L, 4L, 4L, 2L)),
                .Names = c("taxonomy", "A", "B", "C"),
                row.names = c(NA, -6L), class = "data.frame")
and I expect the outcome to be
A B C
count 2 2 2
I wrote this loop, but it does not remove the rows as it goes:
res <- data.frame(DF1[1,], row.names = c('count'))
for(n in 1:ncol(DF1)) {
  res[colnames(DF1)[n]] <- sum(DF1[n])
  DF1[!DF1[n]==1]
}
it gives this incorrect result
A B C
count 2 3 3
You could do ...
DF = DF1[, -1]
cond = DF != 0
p = max.col(cond, ties="first")
fp = factor(p, levels = seq_along(DF), labels = names(DF))
table(fp)
# A B C
# 2 2 2
To account for rows that are all zeros, I think this works:
fp[rowSums(cond) == 0] <- NA
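max.col() with ties.method = "first" returns the column index of the first maximum in each row; on a logical matrix that is the first TRUE, and an all-FALSE row falls back to column 1, which is why the fix above is needed. A small illustration with made-up rows:
m <- rbind(c(FALSE, TRUE, TRUE),
           c(FALSE, FALSE, FALSE))
max.col(m, ties.method = "first")
# [1] 2 1   (the all-FALSE row misleadingly reports column 1)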
We can update the dataset on each run. Create a temporary dataset without the 'taxonomy' column ('tmp') and initiate a named vector ('n'). Loop through the columns of 'tmp': get a logical index based on whether the column is greater than 0 ('i1'), store the sum of TRUE values in the corresponding element of 'n', then update 'tmp' by removing those rows, using 'i1' as the row index.
tmp <- DF1[-1]
n <- setNames(numeric(ncol(tmp)), names(tmp))
for(i in seq_len(ncol(tmp))) {
  i1 <- tmp[[i]] > 0
  n[i] <- sum(i1)
  tmp <- tmp[!i1, ]
}
n
# A B C
# 2 2 2
It can also be done with Reduce
sapply(Reduce(function(x, y) y[!x] > 0, DF1[3:4],
              init = DF1[, 2] > 0, accumulate = TRUE), sum)
#[1] 2 2 2
Or using accumulate from purrr
library(purrr)
accumulate(DF1[3:4], ~ .y[!.x] > 0, .init = DF1[[2]] > 0) %>%
  map_int(sum)
#[1] 2 2 2
This is easy with Reduce and sapply:
> first <- Reduce(function(a, b) b[a == 0], DF1[-1], accumulate = TRUE)
> first
[[1]]
[1] 0 5 3 0 0 0
[[2]]
[1] 2 0 2 0
[[3]]
[1] 0 4 2
> then <- sapply(setNames(first, names(DF1[-1])), function(x) length(x[x > 0]))
> then
A B C
2 2 2
I've been using the dplyr package to create aggregated data tables, for example using the following code:
agg_data <- df %>%
  select(calc.method, price1, price2) %>%
  group_by(calc.method) %>%
  summarize(
    count = n(),
    mean_price1 = round(mean(price1, na.rm = TRUE), 2),
    mean_price2 = round(mean(price2, na.rm = TRUE), 2))
However, I would like to calculate the mean over only the distinct values of price1 and price2 within groups, e.g.:
Price1: 1 1 2 1 2 2 1
Goes to (before aggregation):
Price1: 1 2 1 2 1
(and in general price1 and price2 don't have the same number of elements after removal). I would also like to calculate a count for each (price1 and price2), counting only distinct values within groups. (Groups are defined as two or more identical values adjacent to each other.)
I have tried:
agg_data <- df %>%
  select(calc.method, price1, price2) %>%
  group_by(calc.method) %>%
  summarize(
    count = n(),
    mean_price1 = round(mean(distinct(price1), na.rm = TRUE), 2),
    mean_price2 = round(mean(distinct(price2), na.rm = TRUE), 2))
I also tried wrapping the columns within the select function with distinct(), but both of these throw errors.
Is there a way to do this using dplyr or another similar package without having to write something from scratch?
To satisfy your requirement for distinct, we need to remove successive values that are the same. For numeric vectors, this can be accomplished by:
x <- x[c(1, which(diff(x) != 0)+1)]
The default use of diff computes the difference between adjoining elements in the vector. We use this to detect successive values that are different, for which diff(x) != 0. Since the output differences are lagged by 1, we add 1 to the indices of these distinct elements, and we also want the first element as distinct. For example:
x <- c(1,1,2,1,2,2,1)
x <- x[c(1, which(diff(x) != 0)+1)]
##[1] 1 2 1 2 1
We can then use this with dplyr:
agg_data <- df %>% group_by(calc.method) %>%
  summarize(count = n(),
            count_non_rep_1 = length(price1[c(1, which(diff(price1) != 0) + 1)]),
            mean_price1 = round(mean(price1[c(1, which(diff(price1) != 0) + 1)], na.rm = TRUE), 2),
            count_non_rep_2 = length(price2[c(1, which(diff(price2) != 0) + 1)]),
            mean_price2 = round(mean(price2[c(1, which(diff(price2) != 0) + 1)], na.rm = TRUE), 2))
or, better yet, define the function:
remove.repeats <- function(x) {
  x[c(1, which(diff(x) != 0) + 1)]
}
and use it with dplyr:
agg_data <- df %>% group_by(calc.method) %>%
  summarize(count = n(),
            count_non_rep_1 = length(remove.repeats(price1)),
            mean_price1 = round(mean(remove.repeats(price1), na.rm = TRUE), 2),
            count_non_rep_2 = length(remove.repeats(price2)),
            mean_price2 = round(mean(remove.repeats(price2), na.rm = TRUE), 2))
Using this on some example data that is hopefully similar to yours:
df <- structure(list(calc.method = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor"),
price1 = c(1, 1, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 3),
price2 = c(1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1)),
.Names = c("calc.method", "price1", "price2"), row.names = c(NA, -15L), class = "data.frame")
## calc.method price1 price2
##1 A 1 1
##2 A 1 1
##3 A 2 1
##4 A 1 1
##5 A 2 1
##6 A 2 1
##7 A 1 1
##8 B 1 2
##9 B 1 1
##10 B 2 2
##11 B 2 1
##12 B 2 2
##13 B 2 1
##14 B 1 2
##15 B 3 1
We get:
print(agg_data)
## # A tibble: 2 x 6
## calc.method count count_non_rep_1 mean_price1 count_non_rep_2 mean_price2
## <fctr> <int> <int> <dbl> <int> <dbl>
##1 A 7 5 1.40 1 1.0
##2 B 8 4 1.75 8 1.5