R — Assign value to vector based on first episode - r

So I have a sequence dataset that looks like this
id epnum clockst
1 1 1 0
2 1 2 1
3 1 3 2
4 2 1 4
5 2 2 5
6 2 3 6
7 3 1 4
8 3 2 5
9 3 3 6
What I want is to create a vector of clockst based on epnum == 1.
So, I want basically this
id epnum clockst ep_start
1 1 1 0 0
2 1 2 1 0
3 1 3 2 0
4 2 1 4 4
5 2 2 5 4
6 2 3 6 4
7 3 1 4 4
8 3 2 5 4
9 3 3 6 4
However, I struggle to do so.
I came up with this, but it doesn't fully work.
dt$ep_start = ifelse(dt$epnum == 1 & dt$clockst == 0, 0,
ifelse(dt$epnum == 1 & dt$clockst == 4, 4, -9))
Any idea?
Data
dt = structure(list(id = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L,
3L), .Label = c("1", "2", "3"), class = "factor"), epnum = structure(c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("1", "2", "3"), class = "factor"),
clockst = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 4L, 5L, 6L), .Label = c("0",
"1", "2", "4", "5", "6"), class = "factor")), .Names = c("id",
"epnum", "clockst"), row.names = c(NA, -9L), class = "data.frame")

Here is a solution using tidyverse:
First check the condition epnum == 1 and if TRUE, use clockst value if not NA. Then just fill NA with previous values.
Since clockst is a factor one needs to convert it to numeric while keeping the same values so as.numeric(as.character( needs to be used.
library(tidyverse)
dt %>%
mutate(ep_start = ifelse(epnum == 1, as.numeric(as.character(clockst)), NA)) %>%
fill(ep_start, .direction = "down")
#output:
id epnum clockst ep_start
1 1 1 0 0
2 1 2 1 0
3 1 3 2 0
4 2 1 4 4
5 2 2 5 4
6 2 3 6 4
7 3 1 4 4
8 3 2 5 4
9 3 3 6 4
Here is a quick comparison of the available answers. I chose to use a 90 k row data set:
df <- df[rep(1:nrow(df), times = 10000),] #where df = dt
dt <- data.table(df)
library(microbenchmark)
bench <- microbenchmark(SunBee = dt[, ep_start := .SD[1]$clockst, by = "id"],
missuse = df %>%
mutate(ep_start = ifelse(epnum == 1, as.numeric(as.character(clockst)), NA)) %>%
fill(ep_start, .direction = "down"),
d.b. = df$clockst[rep(which(df$epnum == 1), rle(cumsum(df$epnum == 1))$lengths)],
www = df %>%
arrange(id, epnum) %>%
group_by(id) %>%
mutate(ep_start = first(clockst)) %>%
ungroup())
plot(bench)
with a 900 k row data set:
oh man I really need to learn DT.

Another tidyverse solution. arrange is not required if you are certain that the rows are in the right order.
library(dplyr)
dt2 <- dt %>%
arrange(id, epnum) %>%
group_by(id) %>%
mutate(ep_start = first(clockst)) %>%
ungroup()
dt2
# # A tibble: 9 x 4
# id epnum clockst ep_start
# <fctr> <fctr> <fctr> <fctr>
# 1 1 1 0 0
# 2 1 2 1 0
# 3 1 3 2 0
# 4 2 1 4 4
# 5 2 2 5 4
# 6 2 3 6 4
# 7 3 1 4 4
# 8 3 2 5 4
# 9 3 3 6 4

You can do this with library(data.table) as follows
T <- data.table(T)
T[, ep_start := .SD[1]$clockst, by = "id"]
This gives:
id epnum clockst ep_start
1: 1 1 0 0
2: 1 2 1 0
3: 1 3 2 0
4: 2 1 4 4
5: 2 2 5 4
6: 2 3 6 4
7: 3 1 4 4
8: 3 2 5 4
9: 3 3 6 4

dt$ep_start = dt$clockst[rep(which(dt$epnum == 1), rle(cumsum(dt$epnum == 1))$lengths)]
dt
# id epnum clockst ep_start
#1 1 1 0 0
#2 1 2 1 0
#3 1 3 2 0
#4 2 1 4 4
#5 2 2 5 4
#6 2 3 6 4
#7 3 1 4 4
#8 3 2 5 4
#9 3 3 6 4

Using match
clock = dt[dt$epnum == 1, ]
dt$ep_start = clock$clockst[match(dt$id, clock$id)]

Related

How to add new rows conditionally on R

I have a df with
v1 t1 c1 o1
1 1 9 1
1 1 12 2
1 2 2 1
1 2 7 2
2 1 3 1
2 1 6 2
2 2 3 1
2 2 12 2
And I would like to add 2 rows each time that v1 changes it's value, in order to get this:
v1 t1 c1 o1
1 1 1 1
1 1 1 2
1 2 9 1
1 2 12 2
1 3 2 1
1 3 7 2
2 1 1 1
2 1 1 2
1 2 3 1
1 2 6 2
2 3 3 1
2 3 12 2
So what I'm doing is that every time v1 changes its value I'm adding 2 rows of ones and adding a 1 to the values of t1. This is kind of tricky. I've been able to do it in Excel but I would like to scale to big files in R.
We may do the expansion in group_modify
library(dplyr)
df1 %>%
group_by(v1) %>%
group_modify(~ .x %>%
slice_head(n = 2) %>%
mutate(across(-o1, ~ 1)) %>%
bind_rows(.x) %>%
mutate(t1 = as.integer(gl(n(), 2, n())))) %>%
ungroup
-output
# A tibble: 12 × 4
v1 t1 c1 o1
<int> <int> <dbl> <int>
1 1 1 1 1
2 1 1 1 2
3 1 2 9 1
4 1 2 12 2
5 1 3 2 1
6 1 3 7 2
7 2 1 1 1
8 2 1 1 2
9 2 2 3 1
10 2 2 6 2
11 2 3 3 1
12 2 3 12 2
Or do a group by summarise
df1 %>%
group_by(v1) %>%
summarise(t1 = as.integer(gl(n() + 2, 2, n() + 2)),
c1 = c(1, 1, c1), o1 = rep(1:2, length.out = n() + 2),
.groups = 'drop')
-output
# A tibble: 12 × 4
v1 t1 c1 o1
<int> <int> <dbl> <int>
1 1 1 1 1
2 1 1 1 2
3 1 2 9 1
4 1 2 12 2
5 1 3 2 1
6 1 3 7 2
7 2 1 1 1
8 2 1 1 2
9 2 2 3 1
10 2 2 6 2
11 2 3 3 1
12 2 3 12 2
data
df1 <- structure(list(v1 = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), t1 = c(1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L), c1 = c(9L, 12L, 2L, 7L, 3L, 6L,
3L, 12L), o1 = c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L)),
class = "data.frame", row.names = c(NA,
-8L))

Mutate with str_detect only if condition is true

Im trying to use str_detect to mutate only if the column "RedColor" is "1".
I have a dataset test which looks like this:
# id RedColor Color_Number
#1 1 1 1
#2 2 0 1
#3 3 1 3
#4 4 1 2
#5 6 0 2
#6 8 1 6
I tried the filter function but it returns me only a filtered dataset with all other cases with RedColor = "0" removed.
test <- test %>%
filter(RedColor==TRUE) %>%
mutate(DarkRed = str_detect(Color_Number, "1|2"))
Im expecting an output with the new column DarkRed = "1" in all cases with RedColor = 1 and 1 or 2 in column Color_Number.
# id RedColor Color_Number Dark_Red
#1 1 1 1 1
#2 2 0 1 0
#3 3 1 3 0
#4 4 1 2 1
#5 6 0 2 0
#6 8 1 6 0
Thank you!
Using base R
transform(df, Dark_Red = +(RedColor == 1& Color_Number %in% 1:2))
id RedColor Color_Number Dark_Red
1 1 1 1 1
2 2 0 1 0
3 3 1 3 0
4 4 1 2 1
5 6 0 2 0
6 8 1 6 0
data
df <- structure(list(id = c(1L, 2L, 3L, 4L, 6L, 8L), RedColor = c(1L,
0L, 1L, 1L, 0L, 1L), Color_Number = c(1L, 1L, 3L, 2L, 2L, 6L)), row.names = c(NA,
-6L), class = "data.frame")
Update on OP's request (see comments):
With this dataframe:
id RedColor Color_Number
1 1 1 one
2 2 0 one
3 3 1 three
4 4 1 two
5 6 0 two
6 8 1 six
you could use this code:
library(dplyr)
df %>%
mutate(Dark_Red = ifelse(
RedColor == 1 & Color_Number == "one" | Color_Number == "two", 1, 0))
Output:
id RedColor Color_Number Dark_Red
1 1 1 one 1
2 2 0 one 0
3 3 1 three 0
4 4 1 two 1
5 6 0 two 1
6 8 1 six 0
First answer:
We could use ifelse
str_detect is not appropriate as Ronak already explained:
library(dplyr)
df %>%
mutate(Dark_Red = ifelse(
RedColor == 1 & Color_Number == 1 | Color_Number == 2, 1, 0))
Output:
id RedColor Color_Number Dark_Red
1 1 1 1 1
2 2 0 1 0
3 3 1 3 0
4 4 1 2 1
5 6 0 2 1
6 8 1 6 0
For exact matches don't perform regex match. str_detect is used for pattern matching. Use %in% to match multiple values.
library(dplyr)
df <- df %>% mutate(Dark_Red = as.integer(RedColor == 1 & Color_Number %in% 1:2))
df
# id RedColor Color_Number Dark_Red
#1 1 1 1 1
#2 2 0 1 0
#3 3 1 3 0
#4 4 1 2 1
#5 6 0 2 0
#6 8 1 6 0
If you want to write this in base R use transform -
df <- transform(df, Dark_Red = as.integer(RedColor == 1 & Color_Number %in% 1:2))
data
df <- structure(list(id = c(1L, 2L, 3L, 4L, 6L, 8L), RedColor = c(1L,
0L, 1L, 1L, 0L, 1L), Color_Number = c(1L, 1L, 3L, 2L, 2L, 6L)),
row.names = c(NA, -6L), class = "data.frame")
you can use ifelse inside the mutate call instead of filtering:
test <- test %>%
mutate(Darkred=ifelse((RedColor==TRUE & Color_Number %in% 1:2), 1,0))
> test
# A tibble: 10 × 4
id RedColor Color_Number Darkred
<int> <int> <int> <dbl>
1 1 1 2 1
2 2 1 2 1
3 3 1 3 0
4 4 1 3 0
5 5 0 4 0
6 6 0 2 0
7 7 1 3 0
8 8 1 4 0
9 9 0 5 0
10 10 0 3 0
Data:
test<-data_frame(id=1:10,
RedColor=rbinom(10,1,0.5),
Color_Number=sample(1:5,10,TRUE,rep(.2,5)))

Count number of observations by group

I'm trying to count the number of every observation for each variable in a dataset regarding a specific group.
The data looks like this:
grp v1 vn
1 2 5
2 4
3 3 4
1 3
1 2 12
4 5
5 3 6
5 6
The Result should be a table like this:
grp v1 vn
1 2 3
2 1 0
3 1 1
4 0 1
5 2 1
I tried to use
x %>% group_by(grp) %>% summarise(across(everything(),n = n()))
but it didn`t really worked.
Any help is appreciated. Thanks in advance!
You can also use the following solution:
library(dplyr)
df %>%
group_by(grp) %>%
summarise(across(v1:vn, ~ sum(!is.na(.x))))
# A tibble: 5 x 3
grp v1 vn
<int> <int> <int>
1 1 2 3
2 2 1 0
3 3 1 1
4 4 0 1
5 5 2 1
Get the data in long format, count non-NA values for each column in each group and get the data in wide format.
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = -grp) %>%
group_by(grp, name) %>%
summarise(n = sum(!is.na(value))) %>%
ungroup %>%
pivot_wider(names_from = name, values_from = n)
# grp v1 vn
# <int> <int> <int>
#1 1 2 3
#2 2 1 0
#3 3 1 1
#4 4 0 1
#5 5 2 1
data
df <- structure(list(grp = c(1L, 2L, 3L, 1L, 1L, 4L, 5L, 5L), v1 = c(2L,
4L, 3L, NA, 2L, NA, 3L, 6L), vn = c(5L, NA, 4L, 3L, 2L, 5L, 6L,
NA)), class = "data.frame", row.names = c(NA, -8L))
Using data.table
library(data.table)
setDT(df)[, lapply(.SD, function(x) sum(!is.na(x))), grp]
# grp v1 vn
#1: 1 2 3
#2: 2 1 0
#3: 3 1 1
#4: 4 0 1
#5: 5 2 1
Using aggregate.
aggregate(cbind(v1, vn) ~ grp, replace(dat, is.na(dat), 0), function(x) sum(as.logical(x)))
# grp v1 vn
# 1 1 2 3
# 2 2 1 0
# 3 3 1 1
# 4 4 0 1
# 5 5 2 1
Data:
dat <- read.table(header=T, text='grp v1 vn
1 2 5
2 4 NA
3 3 4
1 NA 3
1 2 12
4 NA 5
5 3 6
5 6 NA
')

By group relative order

I have a data set that looks like this
ID
Week
1
3
1
5
1
5
1
8
1
11
1
16
2
2
2
2
2
3
2
3
2
9
Now, what I would like to do is to add another column to the DataFrame so that, for every ID I will mark the week's relative position. More elaborately, I would like to the mark ID's earliest week (smallest number) as 1, then the next week for the ID as 2 and so forth, where if there are two observations of the same week they get the same number.
So, in the above example I should get:
ID
Week
Order
1
3
1
1
5
2
1
5
2
1
8
3
1
11
4
1
16
5
2
2
1
2
2
1
2
3
2
2
3
2
2
9
3
How could I achieve this?
Thank you very much!
A base R option using ave + match
transform(
df,
Order = ave(Week,
ID,
FUN = function(x) match(x, sort(unique(x)))
)
)
or ave + order (thank #IRTFM for comments)
transform(
df,
Order = ave(Week,
ID,
FUN = order
)
)
gives
ID Week Order
1 1 3 1
2 1 5 2
3 1 5 2
4 1 8 3
5 1 11 4
6 1 16 5
7 2 2 1
8 2 2 1
9 2 3 2
10 2 3 2
11 2 9 3
A data.table option with frank
> setDT(df)[, Order := frank(Week, ties.method = "dense"), ID][]
ID Week Order
1: 1 3 1
2: 1 5 2
3: 1 5 2
4: 1 8 3
5: 1 11 4
6: 1 16 5
7: 2 2 1
8: 2 2 1
9: 2 3 2
10: 2 3 2
11: 2 9 3
Data
> dput(df)
structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L), Week = c(3L, 5L, 5L, 8L, 11L, 16L, 2L, 2L, 3L, 3L, 9L)), class = "data.frame", row.names =
c(NA,
-11L))
You can use dense_rank in dplyr :
library(dplyr)
df %>% group_by(ID) %>% mutate(Order = dense_rank(Week)) %>% ungroup
# ID Week Order
# <int> <int> <int>
# 1 1 3 1
# 2 1 5 2
# 3 1 5 2
# 4 1 8 3
# 5 1 11 4
# 6 1 16 5
# 7 2 2 1
# 8 2 2 1
# 9 2 3 2
#10 2 3 2
#11 2 9 3

applying sorting function in r for every four rows returns dataframe sorted but without extended selection, other columns are not sorted accordingly

I need every four rows to be sorted by the 4th column, separately from the next four rows, made a function :
for (i in seq(1,nrow(data_frame), by=4)) {
data_frame[i:(i+3),4] <- sort(data_frame[i:(i+3),4], decreasing=TRUE) }
problem is only the 4th column gets sorted but the corresponding rows are maintained.
from
x y z userID
-1 1 2 5 1
-2 1 1 2 2
-3 0 0 5 5
-6 1 2 5 3
-4 1 1 2 6
-5 0 0 5 4
-4 1 1 2 1
-5 0 0 5 5
to -
x y z userID
-1 1 2 5 5
-2 1 1 2 3
-3 0 0 5 2
-6 1 2 5 1
-4 1 1 2 6
-5 0 0 5 5
-4 1 1 2 4
-5 0 0 5 1
With tidyverse, we can use %/% to create a grouping column with %/% and use that to sort the 'userID'
library(tidyverse)
df1 %>%
group_by(grp = (row_number()-1) %/% 4 + 1) %>%
#or use
#group_by(grp = cumsum(rep(c(TRUE, FALSE, FALSE, FALSE), length.out = n()))) %>%
mutate(userID = sort(userID, decreasing = TRUE))
# A tibble: 8 x 5
# Groups: grp [2]
# x y z userID grp
# <int> <int> <int> <int> <dbl>
#1 1 2 5 5 1
#2 1 1 2 3 1
#3 0 0 5 2 1
#4 1 2 5 1 1
#5 1 1 2 6 2
#6 0 0 5 5 2
#7 1 1 2 4 2
#8 0 0 5 1 2
Or using base R with ave
with(df1, ave(userID, (seq_along(userID)-1) %/% 4 + 1,
FUN = function(x) sort(x, decreasing = TRUE)))
#[1] 5 3 2 1 6 5 4 1
data
df1 <- structure(list(x = c(1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L), y = c(2L,
1L, 0L, 2L, 1L, 0L, 1L, 0L), z = c(5L, 2L, 5L, 5L, 2L, 5L, 2L,
5L), userID = c(1L, 2L, 5L, 3L, 6L, 4L, 1L, 5L)), row.names = c(NA,
-8L), class = "data.frame")
In base R, we can split every 4 rows, order the fourth column and return the updated dataframe back.
df[] <- do.call(rbind, lapply(split(df, gl(nrow(df)/4, 4)),
function(p) p[order(p[[4]], decreasing = TRUE), ]))
df
# x y z userID
#1 0 0 5 5
#2 1 2 5 3
#3 1 1 2 2
#4 1 2 5 1
#5 1 1 2 6
#6 0 0 5 5
#7 0 0 5 4
#8 1 1 2 1
tidyverse approach using the same logic would be
library(tidyverse)
df %>%
group_split(gl(n()/4, 4), keep = FALSE) %>%
map_dfr(. %>% arrange(desc(userID)))

Resources