create new column using differences of rows - r

I have a dataset as below.
How can I create a new column B from the difference of the values in A that share the same ID? Apologies if this has been asked before. Thanks.

Using dplyr, we can group_by ID and subtract the first and last values of A:
library(dplyr)
df %>%
  group_by(ID) %>%
  summarise(B = first(A) - last(A), A = first(A)) %>%
  select(names(df), B)
# A tibble: 4 x 3
# ID A B
# <fct> <dbl> <dbl>
#1 aa 2 -1
#2 bb 4 0
#3 cc 3 1
#4 dd 1 0
data
df <- structure(list(ID = structure(c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L
), .Label = c("aa", "bb", "cc", "dd"), class = "factor"), A = c(2,
4, 3, 1, 3, 4, 2, 1)), class = "data.frame", row.names = c(NA, -8L))

We can use data.table methods
library(data.table)
setDT(df)[, .(B = first(A) - last(A), A = first(A)), .(ID)]

Another approach could be to pivot the table so that the two 'A' values are in separate columns.
library(tidyverse)
df %>%
  mutate(name = if_else(duplicated(ID), "A_additional", "A")) %>%
  pivot_wider(id_cols = ID, values_from = A, names_from = name) %>%
  mutate(B = A - A_additional)
# # A tibble: 4 x 4
# ID A A_additional B
# <fct> <dbl> <dbl> <dbl>
# 1 aa 2 3 -1
# 2 bb 4 4 0
# 3 cc 3 2 1
# 4 dd 1 1 0
This solution doesn't require grouping, so it should scale well to larger data sets.
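If you want to check that claim on your own data, here is a quick benchmark sketch (assuming the microbenchmark package is installed; the scaled-up data construction is hypothetical, just to create a larger input):
library(dplyr)
library(tidyr)
library(microbenchmark)
# Build a larger df: 12,500 copies of the example, with IDs made unique per copy
big <- df[rep(seq_len(nrow(df)), 12500), ]
big$ID <- paste0(big$ID, rep(seq_len(12500), each = nrow(df)))
microbenchmark(
  grouped = big %>%
    group_by(ID) %>%
    summarise(B = first(A) - last(A), A = first(A)),
  pivoted = big %>%
    mutate(name = if_else(duplicated(ID), "A_additional", "A")) %>%
    pivot_wider(id_cols = ID, values_from = A, names_from = name) %>%
    mutate(B = A - A_additional),
  times = 10
)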

How to do a binary if statement in R?

I have a data set with a column of letters, followed by another column of ones and zeroes. I want to total the number of "ones" for each letter, but am unsure how to do so effectively.
I appreciate the help.
We can group by the first column ('col1') and then get the sum of 'col2':
library(dplyr)
df1 %>%
  group_by(col1) %>%
  summarise(Total = sum(col2))
Or in data.table
library(data.table)
setDT(df1)[, .(Total = sum(col2)), col1]
Or with base R
rowsum(df1$col2, df1$col1)
Here are some other base R solutions
> tapply(df$col2, df$col1, sum)
a b c
1 1 2
> xtabs(col2~col1,df)
col1
a b c
1 1 2
Dummy Data
df <- structure(list(col1 = structure(c(1L, 3L, 1L, 2L, 1L, 3L, 3L,
2L, 2L, 3L), .Label = c("a", "b", "c"), class = "factor"), col2 = c(0,
0, 0, 0, 1, 1, 1, 1, 0, 0)), class = "data.frame", row.names = c(NA,
-10L))
> df
col1 col2
1 a 0
2 c 0
3 a 0
4 b 0
5 a 1
6 c 1
7 c 1
8 b 1
9 b 0
10 c 0

Using filter and sample in a grouped dataframe

I would like to get two IDs randomly sampled from a predefined set of IDs.
However, using sample with dplyr::filter on a grouped data frame returns samples of unexpected size: if I do sample(x, 2), sometimes I get 2 IDs back and sometimes a different number.
df <- structure(list(ID = c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 4L,
5L, 5L, 6L, 6L), Sub = structure(c(1L, 1L, 1L, 2L, 2L, 3L, 3L,
4L, 4L, 4L, 5L, 5L, 6L, 6L), .Label = c("a", "b", "c", "d", "f",
"g"), class = "factor")), class = "data.frame", row.names = c(NA,
-14L))
samp.vec <- c(1,2,3,4,5)
library(dplyr)
set.seed(123)
# Returns a varying number of IDs; not working
df %>% group_by(ID) %>% filter(ID %in% sample(samp.vec, 2)) %>% count(ID)
df %>% group_by(ID) %>% filter(ID %in% sample(samp.vec, 2)) %>% count(ID)
set.seed(123)
# Returns a consistent set of IDs; working
df %>% group_by(ID) %>% ungroup() %>% filter(ID %in% sample(samp.vec, 2)) %>% count(ID)
df %>% group_by(ID) %>% ungroup() %>% filter(ID %in% sample(samp.vec, 2)) %>% count(ID)
One solution is to use ungroup() before filter. Does anyone know why this is happening?
When you are grouping, you are doing the operation once for each group, so you don't have just one pair of IDs, as you would with a fixed condition like ID %in% c(2, 3). To make this clearer, let's omit the filter and look at the results of sample(samp.vec, 2):
df %>%
  group_by(ID) %>%
  mutate(v1 = toString(sample(samp.vec, 2)))
# A tibble: 14 x 3
# Groups: ID [6]
# ID Sub v1
# <int> <fct> <chr>
# 1 1 a 2, 3
# 2 1 a 2, 3
# 3 1 a 2, 3
# 4 2 b 1, 4
# 5 2 b 1, 4
# 6 3 c 3, 1
# 7 3 c 3, 1
# 8 4 d 4, 5
# 9 4 d 4, 5
#10 4 d 4, 5
#11 5 f 4, 2
#12 5 f 4, 2
#13 6 g 2, 4
#14 6 g 2, 4
So the filter keeps a group's rows only if that group's own ID happens to fall in its draw of 2 IDs. Thus, sometimes you will get 2 groups back, sometimes 3, and sometimes all of them.
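If the intent is one random pair of IDs applied to the whole data frame, an alternative to the ungroup() workaround is to draw the sample once, outside the pipeline, and filter against that fixed vector (a minimal sketch; keep_ids is just an illustrative name):
library(dplyr)
set.seed(123)
# Draw the pair once, so every row is compared against the same two IDs
keep_ids <- sample(samp.vec, 2)
df %>%
  filter(ID %in% keep_ids) %>%
  count(ID)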

how to find the row number of similar strings

My data is huge, but I want to know the row numbers of identical strings.
df<- structure(list(x = structure(c(5L, 5L, 5L, 5L, 1L, 1L, 3L, 5L,
5L, 6L, 6L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 3L), .Label = c("AJ5ter2",
"al-1Tter2", "AY9ter2", "CY-Yter2", "LK2ter2", "YY49ter2"), class = "factor")), class = "data.frame", row.names = c(NA,
-19L))
The desired output is shown below:
LK2ter2 1:4, 8:9
AJ5ter2 5:6
AY9ter2 7, 19
YY49ter2 10:11
al-1Tter2 12:15
CY-Yter2 16:18
Another option using data.table
library(data.table)
DT <- as.data.table(df)
DT[, .(index = paste(unique(range(.I)), collapse = ":")), by = .(x, rleid(x))
   ][, .(index = toString(index)), by = x]
# x index
#1: LK2ter2 1:4, 8:9
#2: AJ5ter2 5:6
#3: AY9ter2 7, 19
#4: YY49ter2 10:11
#5: al-1Tter2 12:15
#6: CY-Yter2 16:18
Using tidyverse and data.table you can do:
df %>%
  rowid_to_column() %>%
  group_by(x, rleid(x)) %>%
  summarise(res = ifelse(min(rowid) != max(rowid),
                         paste(min(rowid), max(rowid), sep = ":"),
                         paste(rowid))) %>%
  group_by(x) %>%
  summarise(res = paste(res, collapse = ", "))
x res
<fct> <chr>
1 AJ5ter2 5:6
2 al-1Tter2 12:15
3 AY9ter2 7, 19
4 CY-Yter2 16:18
5 LK2ter2 1:4, 8:9
6 YY49ter2 10:11
Or the same with just tidyverse:
df %>%
  rowid_to_column() %>%
  group_by(x, x_rleid = {x_rleid <- rle(as.numeric(x)); rep(seq_along(x_rleid$lengths), x_rleid$lengths)}) %>%
  summarise(res = ifelse(min(rowid) != max(rowid),
                         paste(min(rowid), max(rowid), sep = ":"),
                         paste(rowid))) %>%
  group_by(x) %>%
  summarise(res = paste(res, collapse = ", "))
Both pipelines work the same way. First, they add a column with the row ID. Second, they group by "x" and the run-length group ID of "x". Third, they check whether the minimum row ID equals the maximum row ID; if not, they combine the minimum and maximum row IDs, separated by ":", otherwise they use the single row ID. Finally, they group by just "x" and combine the different elements with ", ".
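To see what the run-length group ID looks like here, you can inspect it directly on the factor column (assuming data.table is loaded as above; the result is shown as a comment):
# Each maximal run of identical consecutive values gets its own integer ID
rleid(df$x)
# [1] 1 1 1 1 2 2 3 4 4 5 5 6 6 6 6 7 7 7 8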
Or if you want all the values, not just the ranges:
df %>%
  rowid_to_column() %>%
  group_by(x, x_rleid = {x_rleid <- rle(as.numeric(x)); rep(seq_along(x_rleid$lengths), x_rleid$lengths)}) %>%
  summarise(res = paste(rowid, collapse = ",")) %>%
  group_by(x) %>%
  summarise(res = paste(res, collapse = ","))
x res
<fct> <chr>
1 AJ5ter2 5,6
2 al-1Tter2 12,13,14,15
3 AY9ter2 7,19
4 CY-Yter2 16,17,18
5 LK2ter2 1,2,3,4,8,9
6 YY49ter2 10,11
Here's one way with dplyr methods. I'm not sure if you want text output or a numeric vector; this produces text.
library(tidyverse)
df <- structure(list(x = structure(c(5L, 5L, 5L, 5L, 1L, 1L, 3L, 5L, 5L, 6L, 6L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 3L), .Label = c("AJ5ter2", "al-1Tter2", "AY9ter2", "CY-Yter2", "LK2ter2", "YY49ter2"), class = "factor")), class = "data.frame", row.names = c(NA, -19L))
df %>%
  mutate(row_number = row_number()) %>%
  group_by(x) %>%
  summarise(row_nums = str_c(row_number, collapse = ","))
#> # A tibble: 6 x 2
#> x row_nums
#> <fct> <chr>
#> 1 AJ5ter2 5,6
#> 2 al-1Tter2 12,13,14,15
#> 3 AY9ter2 7,19
#> 4 CY-Yter2 16,17,18
#> 5 LK2ter2 1,2,3,4,8,9
#> 6 YY49ter2 10,11
Created on 2019-02-19 by the reprex package (v0.2.1)
You could try something like:
z <- sapply(levels(df$x), function(x) which(x == df$x))
data.frame(key = names(z), index = sapply(z, paste, collapse = ", "), row.names = NULL)
key index
1 AJ5ter2 5, 6
2 al-1Tter2 12, 13, 14, 15
3 AY9ter2 7, 19
4 CY-Yter2 16, 17, 18
5 LK2ter2 1, 2, 3, 4, 8, 9
6 YY49ter2 10, 11

Spread every other row then unite to append row names in dplyr

I am in the process of trying to tidy some untidy data. I have data in the following format:
name x
a NA
value 1
b NA
value 2
c NA
value 3
I would like it to be in the following format
name x
a_value 1
b_value 2
c_value 3
How can I do this in dplyr?
My first thought is to come up with a way to spread so that
name name2 x x2
a value NA 1
b value NA 2
c value NA 3
From there I know I can use unite for name and name2 and delete column x, but I am not sure if spread can produce the above.
You can group on the cumulative count of NA rows and summarise, i.e.
library(dplyr)
df %>%
  group_by(grp = cumsum(is.na(x))) %>%
  summarise(name = paste(name, collapse = '_'))
which gives,
# A tibble: 3 x 2
grp name
<int> <chr>
1 1 a_value
2 2 b_value
3 3 c_value
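If you also want to keep the x values, as in the desired output, the same grouping idea extends with one more summary column; a sketch, assuming each group contains exactly one non-NA x:
df %>%
  group_by(grp = cumsum(is.na(x))) %>%
  summarise(name = paste(name, collapse = '_'),
            x = x[!is.na(x)]) %>%
  select(name, x)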
DATA
dput(df)
structure(list(name = c("a", "value", "b", "value", "c", "value"
), x = c(NA, 1L, NA, 2L, NA, 3L)), .Names = c("name", "x"), row.names = c(NA,
-6L), class = "data.frame")
Use na.locf and then remove the unwanted rows:
library(dplyr)
library(zoo)
DF %>%
  mutate(x = na.locf(x, fromLast = TRUE)) %>%
  filter(name != "value")
giving:
name x
1 a 1
2 b 2
3 c 3
Note
DF <-
structure(list(name = structure(c(1L, 4L, 2L, 4L, 3L, 4L), .Label = c("a",
"b", "c", "value"), class = "factor"), x = c(NA, 1L, NA, 2L,
NA, 3L)), .Names = c("name", "x"), class = "data.frame", row.names = c(NA,
-6L))

R: Unique count by first occurrence of grouping variable

I would like to create a new variable "Count" that is a count of the unique values of a factor "Period", by grouping variable "ID". The following data includes a column with the values I would want in "Count":
structure(list(ID = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L
), .Label = c("a", "b"), class = "factor"), Period = c(1.1, 1.1,
1.2, 1.3, 1.2, 1.3, 1.5, 1.5), Count = c(1L, 1L, 2L, 3L, 1L,
2L, 3L, 3L)), .Names = c("ID", "Period", "Count"), class = "data.frame", row.names = c(NA,
-8L))
I tried to use mutate with Count = 1:length(Period) but it creates a cumulative count of each value of "Period", whereas I want a cumulative count of only unique values. This is what I tried:
library(plyr)
samp1 <- ddply(samp, .(ID, Period), mutate, Count = 1:length(Period))
Could anyone provide the correct function to use?
Edit: New answer
Now that I come to think of it some more, my initial approach won't return correct results if each group's elements aren't consecutive; for example, for
v <- c(1, 3, 2, 2, 1, 2)
My function will put the non-consecutive 1s and 2s in different groups:
myrleid(v)
## [1] 1 2 3 3 4 5
Thus, the best approach seems to be
match(v, unique(v))
## [1] 1 2 3 3 1 3
This will both preserve the appearance order and keep non-consecutive values in the same group.
Thus, I would recommend just doing
library(data.table)
setDT(df)[, Count2 := match(Period, unique(Period)), by = ID]
or (with base R)
with(df, ave(Period, ID, FUN = function(x) match(x, unique(x))))
Old answer
Looks like a good candidate for the rleid function from the data.table devel version on GH
### Devel version installation instructions
# library(devtools)
# install_github("Rdatatable/data.table", build_vignettes = FALSE)
library(data.table) # v 1.9.5+
setDT(df)[, Count2 := rleid(Period), by = ID]
df
# ID Period Count Count2
# 1: a 1.1 1 1
# 2: a 1.1 1 1
# 3: a 1.2 2 2
# 4: a 1.3 3 3
# 5: b 1.2 1 1
# 6: b 1.3 2 2
# 7: b 1.5 3 3
# 8: b 1.5 3 3
Or, if you don't want to load external packages, we could define this function on our own:
myrleid <- function(x) {
  # Lengths of the runs of identical consecutive values
  temp <- rle(x)$lengths
  # Repeat each run's index once per element of that run
  rep.int(seq_along(temp), temp)
}
with(df, ave(Period, ID, FUN = myrleid))
## [1] 1 1 2 3 1 2 3 3
Or if the groups are in increasing order, you could try ranking them too
library(data.table) ## V1.9.5+
setDT(df)[, Count2 := frank(Period, ties.method = "dense"), by = ID]
Or
library(dplyr)
df %>%
  group_by(ID) %>%
  mutate(Count2 = dense_rank(Period))
samp <- structure(list(ID = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L
), .Label = c("a", "b"), class = "factor"), Period = c(1.1, 1.1,
1.2, 1.3, 1.2, 1.3, 1.5, 1.5), Count = c(1L, 1L, 2L, 3L, 1L,
2L, 3L, 3L)), .Names = c("ID", "Period", "Count"), class = "data.frame", row.names = c(NA,
-8L))
select(samp, -Count) %>%
  arrange(ID, Period) %>%
  group_by(ID) %>%
  mutate(dup = !duplicated(Period),
         Count = cumsum(dup))
The key steps are to arrange by ID and Period, and then to flag the first appearance of each Period value as "not duplicated"; the cumulative sum of those flags is the count.
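For comparison, the same cumsum(!duplicated(.)) idea can be written in base R with ave (a sketch, sorting first as above):
# Count a Period only the first time it appears within each ID
with(samp[order(samp$ID, samp$Period), ],
     ave(Period, ID, FUN = function(x) cumsum(!duplicated(x))))
## [1] 1 1 2 3 1 2 3 3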
A solution in base R with transform:
transform(df, Count2 = unlist(
  tapply(df$Period, df$ID, function(x) as.numeric(factor(x)))
))
ID Period Count Count2
a1 a 1.1 1 1
a2 a 1.1 1 1
a3 a 1.2 2 2
a4 a 1.3 3 3
b1 b 1.2 1 1
b2 b 1.3 2 2
b3 b 1.5 3 3
b4 b 1.5 3 3
As David suggested, this solution does not work well if the Period values are not monotonically increasing within each ID.
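A small illustration of the pitfall: factor() numbers its levels in sort order, not in order of first appearance, while match() against unique() follows appearance order (hypothetical Period values for a single ID):
x <- c(1.3, 1.1, 1.3)
as.numeric(factor(x))  # 2 1 2: sort order, so the first row is not counted as 1
match(x, unique(x))    # 1 2 1: appearance order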
