Let's say I have the following simple data.frame:
ID value
1 1 3
2 2 4
3 1 5
4 3 3
My desired output is below: each new row either adds its value to the running total or, if its ID has appeared before, replaces that ID's previous contribution with the new value.
ID value cumsum
1 1 3 3
2 2 4 7
3 1 5 9
4 3 3 12
In row 3, ID 1 reappears, so the cumsum is updated by swapping the old value for the new one (7 - 3 + 5 = 9). Row 4 simply adds its value because ID 3 has not appeared before (9 + 3 = 12).
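To make the rule concrete, here is a minimal base R sketch of the same bookkeeping (assuming the data frame is named df, as in the answers below):
# running total that swaps out an ID's old contribution when it reappears
latest <- c()                  # latest value seen so far, one entry per ID
cs <- numeric(nrow(df))
total <- 0
for (i in seq_len(nrow(df))) {
  id  <- as.character(df$ID[i])
  old <- if (id %in% names(latest)) latest[[id]] else 0
  total <- total + df$value[i] - old  # replace old contribution with new value
  latest[id] <- df$value[i]
  cs[i] <- total
}
cs
# [1] 3 7 9 12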
This produces the desired outcome for your example:
df <- read.table(header = TRUE, text = "ID value
1 1 3
2 2 4
3 1 5
4 3 3")

library(tidyverse)
df %>%
  group_by(ID) %>%
  mutate(value = value - lag(value, default = 0L)) %>%
  ungroup() %>%
  mutate(cumsum = cumsum(value))
# # A tibble: 4 x 3
# ID value cumsum
# <int> <int> <int>
# 1 1 3 3
# 2 2 4 7
# 3 1 2 9
# 4 3 3 12
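Note that this overwrites the value column (row 3 shows the adjusted 2 rather than the original 5). If you want to keep value intact, the same adjustment can go into a temporary column instead — a sketch:
df %>%
  group_by(ID) %>%
  mutate(tmp = value - lag(value, default = 0L)) %>%
  ungroup() %>%
  mutate(cumsum = cumsum(tmp)) %>%
  select(-tmp)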
I used data.table for the cumsum. Calculating the cumulative mean is a bit trickier, because the number of observations is not adjusted when you simply use cummean.
library(data.table)
library(zoo)  # for na.locf

dt = data.table(id = c(1, 2, 1, 3), value = c(3, 4, 5, 3))
# difference to the previous value of the same id (0 if the id is new)
dt[, tmp := value - shift(value, n = 1L, type = "lag", fill = 0), by = c("id")]

# CUMSUM
dt[, cumsum := cumsum(tmp)]

# CUMMEAN WITH UPDATED N
# number the rows where the value was adjusted (i.e. the id was seen before),
# carry that count forward, ...
dt[value != tmp, skip := 1:.N]
dt[, skip := na.locf(skip, na.rm = FALSE)]
dt[is.na(skip), skip := 0]
# ... and subtract it from the denominator
dt[, cummean := cumsum(tmp)/(seq_along(tmp) - skip)]
Output is:
id value tmp cumsum skip cummean
1: 1 3 3 3 0 3.0
2: 2 4 4 7 0 3.5
3: 1 5 2 9 1 4.5
4: 3 3 3 12 1 4.0
Edit: Changed lag function to data.table's shift function.
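Side note: na.locf comes from the zoo package. If you would rather stay entirely within data.table, its nafill function should do the same forward fill (a sketch, assuming a data.table version that ships nafill):
dt[, skip := nafill(skip, type = "locf")]
dt[is.na(skip), skip := 0]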
I am scoring a survey (23 items) with multiple options for each item (it is "select all answers that apply", not one choice per item), and I am trying to find the minimum, maximum and average value for each item. I have written code to do this (below), but I am wondering if there is a more efficient way than cutting and pasting the three lines that create the min, max and avg columns for every item.
Here is a reproducible example with 2 items and 3 answer options per item:
#Establish dataframe
dt <- data.frame(matrix(sample(0:1,30,replace=TRUE), ncol = 6))
colnames(dt) <- c("OptA_1", "OptB_1", "OptC_1", "OptA_2", "OptB_2", "OptC_2")
#Rescore incorrect value
dt[ ,grepl("OptB_", colnames(dt))] <- ifelse(dt[ ,grepl("OptB_", colnames(dt))]==1, 2, NA)
dt[ ,grepl("OptC_", colnames(dt))] <- ifelse(dt[ ,grepl("OptC_", colnames(dt))]==1, 3, NA)
dt[ ,grepl("OptA_", colnames(dt))] <- ifelse(dt[ ,grepl("OptA_", colnames(dt))]==1, 1, NA)
This is the code to calculate the values (note that Option A, B and C are the answer choices, while the _1 suffix denotes item 1):
##Calculate Values
dt$it1_min <- apply(dt[ ,c("OptA_1", "OptB_1", "OptC_1")], 1, min, na.rm=T)
dt$it1_max <- apply(dt[ ,c("OptA_1", "OptB_1", "OptC_1")], 1, max, na.rm=T)
dt$it1_avg <- rowMeans(dt[ ,c("OptA_1", "OptB_1", "OptC_1")], na.rm=T)
I am wondering if I need to do the above ^ for every single item, or if it's possible to write a function so that I can score all items (OptA_1 through OptA_23) more efficiently.
###potentially repetitive code?
dt$it2_min <- apply(dt[ ,c("OptA_2", "OptB_2", "OptC_2")], 1, min, na.rm=T)
dt$it2_max <- apply(dt[ ,c("OptA_2", "OptB_2", "OptC_2")], 1, max, na.rm=T)
dt$it2_avg <- rowMeans(dt[ ,c("OptA_2", "OptB_2", "OptC_2")], na.rm=T)
Here is what the eventual scoring will look like:
##Eventual scoring
dt$tot_min <- rowSums(dt[ ,c("it1_min", "it2_min")], na.rm=T)
dt$tot_max <- rowSums(dt[ ,c("it1_max", "it2_max")], na.rm=T)
dt$tot_avg <- rowSums(dt[ ,c("it1_avg", "it2_avg")], na.rm=T)
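For what it's worth, the three lines per item can also be generated in a loop while staying in wide format. A minimal sketch, assuming the items are numbered consecutively as above:
for (i in 1:2) {   # 1:23 for the full survey
  cols <- paste0(c("OptA_", "OptB_", "OptC_"), i)
  dt[[paste0("it", i, "_min")]] <- apply(dt[, cols], 1, min, na.rm = TRUE)
  dt[[paste0("it", i, "_max")]] <- apply(dt[, cols], 1, max, na.rm = TRUE)
  dt[[paste0("it", i, "_avg")]] <- rowMeans(dt[, cols], na.rm = TRUE)
}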
You will need to convert the data to long form first (tidyr::pivot_longer).
library(dplyr)
library(tidyr)
dt_long <- dt %>%
  # add identifier for participant
  mutate(participant = row_number()) %>%
  # convert to long form using pattern
  pivot_longer(cols = -participant,
               names_pattern = "Opt(.*)_(\\d+)",
               names_to = c("answer_choice", "item"),
               values_to = "selected")
dt_long
# long form data looks like this
# A tibble: 30 x 4
# participant answer_choice item selected
# <int> <chr> <chr> <dbl>
# 1 1 A 1 NA
# 2 1 B 1 2
# 3 1 C 1 NA
# 4 1 A 2 1
# 5 1 B 2 2
# 6 1 C 2 3
# now group by each participant and item and compute the required fields
dt_long %>%
  group_by(item, participant) %>%
  summarise(it_min = min(selected, na.rm = TRUE),
            it_max = max(selected, na.rm = TRUE),
            it_avg = mean(selected, na.rm = TRUE))
#> # A tibble: 10 x 5
#> # Groups: item [2]
#> item participant it_min it_max it_avg
#> <chr> <int> <dbl> <dbl> <dbl>
#> 1 1 1 2 2 2
#> 2 1 2 2 2 2
#> 3 1 3 1 3 2
#> 4 1 4 2 3 2.5
#> 5 1 5 3 3 3
#> 6 2 1 1 1 1
#> 7 2 2 1 3 2
#> 8 2 3 1 3 2
#> 9 2 4 1 3 2
#> 10 2 5 1 2 1.5
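If you then need the wide layout from the eventual-scoring step, the summary can be reshaped back; a sketch using tidyr::pivot_wider (the columns come out as it_min_1, it_min_2, and so on):
dt_long %>%
  group_by(item, participant) %>%
  summarise(it_min = min(selected, na.rm = TRUE),
            it_max = max(selected, na.rm = TRUE),
            it_avg = mean(selected, na.rm = TRUE)) %>%
  pivot_wider(names_from = item,
              values_from = c(it_min, it_max, it_avg))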
You can use data.table to melt your dt long, estimate your indicators by group, and then dcast back to wide format:
library(data.table)
dt = melt(setDT(dt)[, row := .I], id.vars = "row")[
  , c("variable", "grp") := tstrsplit(variable, "_")][]
dcast(dt[, .(it_min = min(value, na.rm = TRUE),
             it_max = max(value, na.rm = TRUE),
             it_avg = mean(value, na.rm = TRUE)),
         by = .(row, grp)],
      row ~ grp,
      value.var = c("it_min", "it_max", "it_avg"))
Output (note that you used sample() above without setting a seed; see my reproducible input data below):
row it_min_1 it_min_2 it_max_1 it_max_2 it_avg_1 it_avg_2
<int> <num> <num> <num> <num> <num> <num>
1: 1 2 NA 3 NA 2.5 NaN
2: 2 2 1 3 3 2.5 2
3: 3 2 3 3 3 2.5 3
4: 4 1 NA 1 NA 1.0 NaN
5: 5 3 3 3 3 3.0 3
Input Data:
set.seed(123)
dt <- data.frame(matrix(sample(0:1,30,replace=TRUE), ncol = 6))
colnames(dt) <- c("OptA_1", "OptB_1", "OptC_1", "OptA_2", "OptB_2", "OptC_2")
dt[ ,grepl("OptB_", colnames(dt))] <- ifelse(dt[ ,grepl("OptB_", colnames(dt))]==1, 2, NA)
dt[ ,grepl("OptC_", colnames(dt))] <- ifelse(dt[ ,grepl("OptC_", colnames(dt))]==1, 3, NA)
dt[ ,grepl("OptA_", colnames(dt))] <- ifelse(dt[ ,grepl("OptA_", colnames(dt))]==1, 1, NA)
I have a larger data frame with multiple columns and thousands of rows. Within every block of five rows, I want to replace each row's value with its difference from the previous row: the first row of a block keeps its value, the second row becomes second row - first row, and so on. The sixth row starts a new block, so it keeps its value, and the seventh row becomes seventh row - sixth row. Here is an example data frame
DF = data.frame(A= c(1:11), B = c(11:21))
The output should look like this:
> Output
A B
1 1 11
2 1 1
3 1 1
4 1 1
5 1 1
6 6 16
7 1 1
8 1 1
9 1 1
10 1 1
11 11 21
One option would be to create a grouping variable and then do the transformation with diff, which takes differences of adjacent elements in the columns selected by mutate_all (if only a subset of columns is needed, use mutate_if or mutate_at instead).
library(dplyr) #v_0.8.3
DF %>%
  group_by(grp = as.integer(gl(n(), 5, n()))) %>%
  mutate_all(~ c(first(.), diff(.))) %>%
  ungroup %>%
  select(-grp)
# A tibble: 11 x 2
# A B
# <int> <int>
# 1 1 11
# 2 1 1
# 3 1 1
# 4 1 1
# 5 1 1
# 6 6 16
# 7 1 1
# 8 1 1
# 9 1 1
#10 1 1
#11 11 21
The above also gives a warning when mutate_all is used after group_by (it previously worked silently); in newer versions the correct syntax is mutate_at:
DF %>%
  group_by(grp = as.integer(gl(n(), 5, n()))) %>%
  mutate_at(vars(-group_cols()), ~ c(first(.), diff(.))) %>%
  ungroup %>%
  select(-grp)
# base R: within each block of n rows, keep the first value and
# take differences of adjacent values
f = function(d, n = 5) ave(d, ceiling(seq_along(d)/n), FUN = function(x) c(x[1], diff(x)))
data.frame(lapply(DF, f))
# A B
#1 1 11
#2 1 1
#3 1 1
#4 1 1
#5 1 1
#6 6 16
#7 1 1
#8 1 1
#9 1 1
#10 1 1
#11 11 21
Another option would be to create a second data.frame from shifted rows and subtract directly:
# for each row, the index of the previous row within its block of five
# (block-starting rows point at themselves)
ind = ave(1:nrow(DF), ceiling(1:nrow(DF)/5), FUN = function(x) c(x[1], x[-length(x)]))
# zero out the self-pointing rows so block starts keep their original values
DF2 = DF[ind,] * replace(rep(1, nrow(DF)), diff(ind) == 0, 0)
DF - DF2
You can %/% the row number minus 1 by 5 to get the groups, then use diff to get, for each column in each group, the difference of each value from the previous one (or from 0 if there is no previous value).
library(data.table)
setDT(DF)
DF[, lapply(.SD, function(x) diff(c(0, x)))
, (1:nrow(DF) - 1) %/% 5][, -1]
# A B
# 1: 1 11
# 2: 1 1
# 3: 1 1
# 4: 1 1
# 5: 1 1
# 6: 6 16
# 7: 1 1
# 8: 1 1
# 9: 1 1
# 10: 1 1
# 11: 11 21
Or, as mentioned by @akrun, you could avoid lapply by replacing
lapply(.SD, function(x) diff(c(0, x)))
with
.SD - shift(.SD, fill = 0)
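Putting that together, the whole call becomes:
DF[, .SD - shift(.SD, fill = 0), (1:nrow(DF) - 1) %/% 5][, -1]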
Another less serious option:
# TRUE at rows 1, 6, 11, ... (the first row of each block of five)
x <- DF[, !(.I - 1) %% 5]
# double the block-starting rows, then subtract the previous row
# (or, at block starts, the row itself)
DF*(1 + x) - DF[DF[, .I - !x]]
# A B
# 1: 1 11
# 2: 1 1
# 3: 1 1
# 4: 1 1
# 5: 1 1
# 6: 6 16
# 7: 1 1
# 8: 1 1
# 9: 1 1
# 10: 1 1
# 11: 11 21
I have data with about 1000 groups, where each group is ordered from 1-100 (it can stop at any number up to 100).
As I was looking through the data, I found that some groups had bad orders, i.e., the order would run up to 100 and then suddenly a 24 would show up.
How can I delete all of this erroneous data?
I would like to find all rows that don't follow the increasing order within their group and just delete them.
Any help would be great!
lag computes the difference between the current value and the previous value, and the filter keeps only non-negative differences, i.e. rows where the current value is at least the previous value. The order == min(order) condition is needed because lag leaves the first value of each group NA. I keep the helper column diff so you can check it, but you can drop it with %>% select(-diff).
library(dplyr)
df1 %>%
  group_by(group) %>%
  mutate(diff = order - lag(order)) %>%
  filter(diff >= 0 | order == min(order))
# A tibble: 8 x 3
# Groups:   group [2]
  group order  diff
  <int> <int> <int>
1     1     1    NA
2     1     3     2
3     1     5     2
4     1    10     5
5     2     1    NA
6     2     4     3
7     2     4     0
8     2     8     4
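One caveat: this is a single pass, so each row is only compared with the row immediately before it, not with the running maximum seen so far. If a row should also be dropped when it is below any earlier value in its group, filtering against cummax is the stricter check — a sketch:
df1 %>%
  group_by(group) %>%
  filter(order >= cummax(lag(order, default = first(order))))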
Data
df1 <- read.table(text="
group order
1 1
1 3
1 5
1 10
1 2
2 1
2 4
2 4
2 8
2 3
",header=T, stringsAsFactors = F)
Assuming the order column increments by 1 every time, we can use ave and keep only the rows whose difference from the previous row within the group is 1.
df[ave(df$order, df$group, FUN = function(x) c(1, diff(x))) == 1, ]
# group order
#1 1 1
#2 1 2
#3 1 3
#4 1 4
#6 2 1
#7 2 2
#8 2 3
#9 2 4
EDIT
For the updated example, we can just change the comparison
df[ave(df$order, df$group, FUN = function(x) c(1, diff(x))) >= 0, ]
Playing with data.table:
library(data.table)
setDT(df1)[, diffo := c(1, diff(order)), group][diffo == 1, .(group, order)]
group order
1: 1 1
2: 1 2
3: 1 3
4: 1 4
5: 2 1
6: 2 2
7: 2 3
8: 2 4
Where df1 is:
df1 <- data.frame(
group = rep(1:2, each = 5),
order = c(1:4, 2, 1:4, 3)
)
EDIT
If you only need increasing order, and not steps of one, you can do:
df3 <- transform(df1, order = c(1,3,5,10,2,1,4,7,9,3))
setDT(df3)[, diffo := c(1, diff(order)), group][diffo >= 1, .(group, order)]
group order
1: 1 1
2: 1 3
3: 1 5
4: 1 10
5: 2 1
6: 2 4
7: 2 7
8: 2 9
I have data, stored as a data.table dt or a data.frame df, with multiple observations per id-month combination. I want to store the row number in a variable, let's call it row.
I know how to do this in dplyr but want to learn how to do it in (pure) data.table. I assume it is a trivially easy operation, but I can't seem to find a solution that works.
Reprex:
library(dplyr)
library(data.table)
df <- data_frame(id = c(1, 1, 1, 2, 2, 2), month = c(1, 1, 2, 1, 1, 2))
dt <- data.table(df)
My dplyr solution gives the expected output:
df %>%
group_by(id, month) %>%
mutate(row = row_number(id))
# A tibble: 6 x 3
# Groups: id, month [4]
id month row
<dbl> <dbl> <int>
1 1 1 1
2 1 1 2
3 1 2 1
4 2 1 1
5 2 1 2
6 2 2 1
Doing similar operations on a data.table yields something different:
dt[, row := row_number(id), by = c("id", "month")]
id month row
1: 1 1 1
2: 1 1 1
3: 1 2 1
4: 2 1 1
5: 2 1 1
6: 2 2 1
Or:
dt[, row := .I, by = c("id", "month")]
id month row
1: 1 1 1
2: 1 1 2
3: 1 2 3
4: 2 1 4
5: 2 1 5
6: 2 2 6
I assume I understand why this happens (row_number(id) simply refers to the first row number of the first row of each group), but do not know how to get the expected result in pure data.table.
# seq_len(.N) numbers the rows within each id-month group
dt[, row := seq_len(.N), by = c("id", "month")]
dt
id month row
1: 1 1 1
2: 1 1 2
3: 1 2 1
4: 2 1 1
5: 2 1 2
6: 2 2 1
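For completeness, data.table also has a dedicated helper for within-group row numbers, rowid, which should give the same result in recent versions:
dt[, row := rowid(id, month)]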
I have a dataframe as follows. It is ordered by column time.
Input -
df = data.frame(time = 1:20,
grp = sort(rep(1:5,4)),
var1 = rep(c('A','B'),10)
)
head(df,10)
time grp var1
1 1 1 A
2 2 1 B
3 3 1 A
4 4 1 B
5 5 2 A
6 6 2 B
7 7 2 A
8 8 2 B
9 9 3 A
10 10 3 B
I want to create another variable var2 which holds the number of distinct var1 values seen so far, i.e. up to that point in time, within each group grp. This is a little different from what I'd get if I simply used n_distinct.
Expected output -
time grp var1 var2
1 1 1 A 1
2 2 1 B 2
3 3 1 A 2
4 4 1 B 2
5 5 2 A 1
6 6 2 B 2
7 7 2 A 2
8 8 2 B 2
9 9 3 A 1
10 10 3 B 2
I want to create a function say cum_n_distinct for this and use it as -
d_out = df %>%
arrange(time) %>%
group_by(grp) %>%
mutate(var2 = cum_n_distinct(var1))
A dplyr solution inspired by @akrun's answer. The logic is to set the first occurrence of each unique value of var1 to 1 and the rest to 0 within each group grp, and then apply cumsum:
df = df %>%
  arrange(time) %>%
  group_by(grp, var1) %>%
  mutate(var_temp = ifelse(row_number() == 1, 1, 0)) %>%
  group_by(grp) %>%
  mutate(var2 = cumsum(var_temp)) %>%
  select(-var_temp)
head(df,10)
Source: local data frame [10 x 4]
Groups: grp
time grp var1 var2
1 1 1 A 1
2 2 1 B 2
3 3 1 A 2
4 4 1 B 2
5 5 2 A 1
6 6 2 B 2
7 7 2 A 2
8 8 2 B 2
9 9 3 A 1
10 10 3 B 2
Assuming stuff is ordered by time already, first define a cumulative distinct function:
dist_cum <- function(var)
sapply(seq_along(var), function(x) length(unique(head(var, x))))
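A quick sanity check of the helper on a small vector:
dist_cum(c("A", "B", "A", "B"))
# [1] 1 2 2 2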
Then a base solution that uses ave to create groups (note: this assumes var1 is a factor) and applies our function to each group:
transform(df, var2=ave(as.integer(var1), grp, FUN=dist_cum))
A data.table solution, basically doing the same thing:
library(data.table)
(data.table(df)[, var2:=dist_cum(var1), by=grp])
And dplyr, again, same thing:
library(dplyr)
df %>% group_by(grp) %>% mutate(var2=dist_cum(var1))
Update
With your new dataset, here is an approach in base R:
df$var2 <- unlist(lapply(split(df, df$grp), function(x) {
  x$var2 <- 0
  indx <- match(unique(x$var1), x$var1)
  x$var2[indx] <- 1
  cumsum(x$var2)
}))
head(df,7)
# time grp var1 var2
# 1 1 1 A 1
# 2 2 1 B 2
# 3 3 1 A 2
# 4 4 1 B 2
# 5 5 2 A 1
# 6 6 2 B 2
# 7 7 2 A 2
Here's another solution using data.table that's pretty quick.
Generic Function
cum_n_distinct <- function(x, na.include = TRUE){
# Given a vector x, returns a corresponding vector y
# where the ith element of y gives the number of unique
# elements observed up to and including index i
# if na.include = TRUE (default) NA is counted as an
# additional unique element, otherwise it's essentially ignored
temp <- data.table(x, idx = seq_along(x))
firsts <- temp[temp[, .I[1L], by = x]$V1]
if(na.include == FALSE) firsts <- firsts[!is.na(x)]
y <- rep(0, times = length(x))
y[firsts$idx] <- 1
y <- cumsum(y)
return(y)
}
Example Use
cum_n_distinct(c(5,10,10,15,5)) # 1 2 2 3 3
cum_n_distinct(c(5,NA,10,15,5)) # 1 2 3 4 4
cum_n_distinct(c(5,NA,10,15,5), na.include = FALSE) # 1 1 2 3 3
Solution To Your Question
d_out = df %>%
arrange(time) %>%
group_by(grp) %>%
mutate(var2 = cum_n_distinct(var1))