complete sequence column names and fill R - r

I have large data similar to the following:
# Sample data: one row per participant, one column per observed week.
Participant <- c("Lion", "Cat", "Dog", "Snake", "Tiger", "Mouse")
week_0 <- c(5, 0, 1, 0, 8, 1)
week_4 <- c(1, 0, 1, 0, 1, 1)
week_8 <- c(1, 0, 6, 0, 0, 0)
week_9 <- c(2, 4, 1, 7, 8, 1)
week_10 <- c(2, 4, 1, 7, 8, 1)
test_data <- data.frame(
  Participant, week_0, week_4, week_8, week_9, week_10
)
> test_data
Participant week_0 week_4 week_8 week_9 week_10
1 Lion 5 1 1 2 2
2 Cat 0 0 0 4 4
3 Dog 1 1 6 1 1
4 Snake 0 0 0 7 7
5 Tiger 8 1 0 8 8
6 Mouse 1 1 0 1 1
I want to fill in the gaps between the week numbers in the column names, carrying each week's values forward into the missing weeks. The end result that I'm looking for is:
test_data
Participant week_0 week_1 week_2 week_3 week_4 week_5 week_6 week_7 week_8 week_9 week_10
1 Lion 5 5 5 5 1 1 1 1 1 2 2
2 Cat 0 0 0 0 0 0 0 0 0 4 4
3 Dog 1 1 1 1 1 1 1 1 6 1 1
4 Snake 0 0 0 0 0 0 0 0 0 7 7
5 Tiger 8 8 8 8 1 1 1 1 0 8 8
6 Mouse 1 1 1 1 1 1 1 1 0 1 1
I have looked at the fill() function in R, but I can't get the result that I want.
Any suggestions on how to do this?

Using base R - extract the numeric suffix part from the 'week' column names, then get a sequence between the min/max values ('i2'), replicate the columns based on matching the indexes and rename the column names with i2
# Week numbers parsed from the "week_<n>" column names (column 1 is the id).
i1 <- as.integer(sub("week_", "", names(test_data)[-1]))
# Put the week columns in ascending week order so the carry-forward below is
# correct even when the columns arrive unsorted.
week_order <- order(i1)
i1 <- i1[week_order]
week_cols <- test_data[-1][week_order]
# Every week from the first to the last observed one.
i2 <- seq(i1[1], i1[length(i1)])
# For each target week, findInterval() gives the position of the latest
# observed week that is <= it, i.e. the column whose values are carried forward.
test_data <- cbind(test_data[1], week_cols[findInterval(i2, i1)])
names(test_data)[-1] <- paste0("week_", i2)
-output
> test_data
Participant week_0 week_1 week_2 week_3 week_4 week_5 week_6 week_7 week_8 week_9 week_10
1 Lion 5 5 5 5 1 1 1 1 1 2 2
2 Cat 0 0 0 0 0 0 0 0 0 4 4
3 Dog 1 1 1 1 1 1 1 1 6 1 1
4 Snake 0 0 0 0 0 0 0 0 0 7 7
5 Tiger 8 8 8 8 1 1 1 1 0 8 8
6 Mouse 1 1 1 1 1 1 1 1 0 1 1
With tidyverse, an option is to reshape to 'long' with pivot_longer, use complete to expand the data, fill the missing values with previous non-NA, and reshape back to 'wide' with pivot_wider
library(dplyr)
library(tidyr)
# Reshape to long form, add the missing weeks for every participant, carry
# values across the gaps, then reshape back to one column per week.
test_data %>%
  pivot_longer(cols = starts_with("week_"),
               names_prefix = "week_", names_transform = as.integer) %>%
  complete(Participant, name = full_seq(name, period = 1)) %>%
  # Fill within each participant so a value is never carried over from a
  # neighbouring participant's rows.
  group_by(Participant) %>%
  fill(value, .direction = "downup") %>%
  ungroup() %>%
  pivot_wider(names_from = name, values_from = value,
              names_prefix = "week_") %>%
  # Put the rows back in the order of the input data.
  arrange(match(Participant, test_data$Participant))
-output
# A tibble: 6 × 12
Participant week_0 week_1 week_2 week_3 week_4 week_5 week_6 week_7 week_8 week_9 week_10
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Lion 5 5 5 5 1 1 1 1 1 2 2
2 Cat 0 0 0 0 0 0 0 0 0 4 4
3 Dog 1 1 1 1 1 1 1 1 6 1 1
4 Snake 0 0 0 0 0 0 0 0 0 7 7
5 Tiger 8 8 8 8 1 1 1 1 0 8 8
6 Mouse 1 1 1 1 1 1 1 1 0 1 1

Please check the below code
# Long form: one row per participant and observed week; `seq` holds the week
# number parsed from the column name.
test_data <- data.frame(Participant, week_0, week_4, week_8, week_9, week_10) %>%
  pivot_longer(starts_with("week"), names_to = "name", values_to = "value") %>%
  # base sub() strips the "week_" prefix without needing stringr
  mutate(seq = as.numeric(sub("^\\w*_", "", name))) %>%
  arrange(Participant)
# Every participant paired with every week between the first and last observed
# week (derived from the data rather than a fixed count of 11 weeks).
week_range <- range(test_data$seq)
seq <- expand.grid(Participant = unique(Participant),
                   seq = seq(week_range[1], week_range[2]),
                   stringsAsFactors = FALSE)
test_data2 <- test_data %>%
  right_join(seq, by = c("Participant", "seq")) %>%
  # weeks that were not observed have no column name yet
  mutate(name = ifelse(is.na(name), paste0("week_", seq), name)) %>%
  arrange(Participant, seq) %>%
  group_by(Participant) %>%
  # carry the last observed value forward within each participant
  fill(value) %>%
  # id_cols is passed by name; passing it positionally is deprecated in tidyr
  pivot_wider(id_cols = Participant, names_from = name, values_from = value)
Created on 2023-01-28 with reprex v2.0.2
# A tibble: 6 × 11
# Groups: Participant [6]
Participant week_0 week_2 week_3 week_4 week_5 week_6 week_7 week_8 week_9 week_10
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Cat 0 0 0 0 0 0 0 0 4 4
2 Dog 1 1 1 1 1 1 1 6 1 1
3 Lion 5 5 5 1 1 1 1 1 2 2
4 Mouse 1 1 1 1 1 1 1 0 1 1
5 Snake 0 0 0 0 0 0 0 0 7 7
6 Tiger 8 8 8 1 1 1 1 0 8 8

Related

extract duplicate row based on condition across column in R

I'm stuck trying to keep rows based on a condition in R. I want to collapse the duplicated rows of each ID using the same condition across a large number of columns. In the example below, for each duplicated ID I want a single row in which a column is '0' if any of that ID's rows has the value '0' in that column.
here is the data frame:
ID A B C
1 001 1 1 1
2 002 0 1 0
3 002 1 0 0
4 003 0 1 1
5 003 1 0 1
6 003 0 0 1
I want to get a result like this:
ID A B C
1 001 1 1 1
2 002 0 0 0
3 003 0 0 1
Any help would be much appreciated, thanks!
Please check this code
# A tibble: 6 × 4
ID A B C
<dbl> <dbl> <dbl> <dbl>
1 1 1 1 1
2 2 0 1 0
3 2 1 0 0
4 3 0 1 1
5 3 1 0 1
6 3 0 0 1
code
# Per ID: mark zeros in A/B/C as 0 (everything else NA) in helper columns
# xA/xB/xC, carry each mark down to the later rows of the same ID, turn the
# remaining NAs into 1, and keep the last row of every ID.
data2 <- data %>%
  group_by(ID) %>%
  mutate(
    across(c("A", "B", "C"), ~ ifelse(.x == 0, 0, NA), .names = "x{col}")
  ) %>%
  fill(xA, xB, xC) %>%
  mutate(
    across(c("xA", "xB", "xC"), ~ ifelse(is.na(.x), 1, .x))
  ) %>%
  slice_tail(n = 1)
output
# A tibble: 3 × 7
# Groups: ID [3]
ID A B C xA xB xC
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 1 1 1 1
2 2 1 0 0 0 0 0
3 3 0 0 1 0 0 1

how to cumulative sum variable by unique values and input back in

I'm looking to do the following -- cumulative sum the indicator values and remove the indicators after those days
original:
transaction
day
indicator
1
1
0
1
2
0
1
3
0
1
4
1
1
5
1
1
6
1
2
1
0
2
2
0
2
3
0
2
4
0
2
5
1
2
6
1
and make the new table like this --
transaction
day
indicator
1
1
0
1
2
0
1
3
0
1
4
3
2
1
0
2
2
0
2
3
0
2
4
0
2
5
2
Change all day with indicator == 1 to the first day with indicator == 1
# Move every indicator == 1 row of a transaction onto the first such day, then
# total the indicator per (transaction, day).
df %>%
  group_by(transaction) %>%
  mutate(day = case_when(
    indicator == 0 ~ day,
    # first() yields NA instead of a zero-length vector when a transaction has
    # no indicator == 1 row, so case_when() does not fail on such groups
    TRUE ~ first(day[indicator == 1])
  )) %>%
  group_by(transaction, day) %>%
  summarise(indicator = sum(indicator)) %>%
  ungroup()
transaction day indicator
<int> <int> <int>
1 1 1 0
2 1 2 0
3 1 3 0
4 1 4 3
5 2 1 0
6 2 2 0
7 2 3 0
8 2 4 0
9 2 5 2
Please try the below code
code
df <- bind_rows(df1, df2) %>% group_by(transaction) %>%
mutate(cumsum=cumsum(indicator), cumsum2=ifelse(cumsum==1, day, NA)) %>%
fill(cumsum2) %>%
mutate(day=ifelse(!is.na(cumsum2), cumsum2, day)) %>%
group_by(transaction, day) %>% slice_tail(n=1) %>% select(-cumsum2)
Created on 2023-01-19 with reprex v2.0.2
output
# A tibble: 8 × 4
# Groups: transaction, day [8]
transaction day indicator cumsum
<dbl> <int> <dbl> <dbl>
1 1 1 0 0
2 1 2 0 0
3 1 3 0 0
4 1 4 1 3
5 2 1 0 0
6 2 2 0 0
7 2 3 0 0
8 2 4 1 2
Another approach to try. After grouping by transaction, change indicator to either 0 (same) or the sum of indicator. Finally, keep or filter previous rows where cumall (cumulative all) values for indicator are 0. Using lag will provide the last row containing the sum.
library(tidyverse)
df %>%
group_by(transaction) %>%
mutate(indicator = ifelse(indicator == 0, 0, sum(indicator))) %>%
filter(cumall(lag(indicator, default = 0) == 0))
Output
transaction day indicator
<int> <int> <dbl>
1 1 1 0
2 1 2 0
3 1 3 0
4 1 4 3
5 2 1 0
6 2 2 0
7 2 3 0
8 2 4 0
9 2 5 2

How can I create columns from the values in another column (R)?

I would like to create a table that assigns a column to each value of one column.
The data looks like this:
Person Task
John 4
Michael 1
Florence 3
Expected result:
Person Task 1 2 3 4 5 6 7 8
John 4 1 1 1 1 0 0 0 0
Michael 1 0 0 0 0 1 0 0 0
Florence 3 0 0 0 0 0 1 1 1
It's important that the column values are filled in an orderly manner: the first row, then the second, and so on.
Thanks!
in base R:
cbind(df, t(unname(model.matrix(~with(df, factor(rep(Person, Task), Person))-1))))
Person Task 1 2 3 4 5 6 7 8
1 John 4 1 1 1 1 0 0 0 0
2 Michael 1 0 0 0 0 1 0 0 0
3 Florence 3 0 0 0 0 0 1 1 1
For easier code:
Create a data frame that looks like the one below:
df1 <- with(df, data.frame(lengths = Task, values = factor(Person, Person)))
df1
lengths values
1 4 John
2 1 Michael
3 3 Florence
Note that values is now a factor column with levels same as the values.
Then you could simply do:
cbind(df, t(unname(model.matrix(~inverse.rle(df1)-1))))
Person Task 1 2 3 4 5 6 7 8
1 John 4 1 1 1 1 0 0 0 0
2 Michael 1 0 0 0 0 1 0 0 0
3 Florence 3 0 0 0 0 0 1 1 1
You could use
library(dplyr)
library(tidyr)
# One row per (person, task slot), numbered consecutively across all persons,
# then spread the slots into indicator columns.
df %>%
  uncount(Task, .remove = FALSE) %>%
  mutate(rn = row_number(),
         value = 1) %>%
  # id_cols is passed by name; passing it positionally is deprecated in tidyr
  pivot_wider(id_cols = c(Person, Task),
              names_from = rn,
              values_from = value,
              values_fill = 0)
This returns
# A tibble: 3 x 10
Person Task `1` `2` `3` `4` `5` `6` `7` `8`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 John 4 1 1 1 1 0 0 0 0
2 Michael 1 0 0 0 0 1 0 0 0
3 Florence 3 0 0 0 0 0 1 1 1
Setting up your dataframe:
> df <- data.frame(Name=factor(c("John", "Michael", "Florence"), levels=c("John", "Michael", "Florence")), Task=c(4,1,3))
> df
Name Task
1 John 4
2 Michael 1
3 Florence 3
First I will make a 'long' dataframe, expanding each name and task by the number of entries needed. The id will make sure that when I reshape the dataframe wide the columns have the correct names:
# Long form: repeat each person Task times and number the rows consecutively.
# seq_len() gives an empty id vector when the task total is 0, where
# 1:sum(...) would produce c(1, 0).
df2 <- data.frame(Name = rep(df$Name, df$Task),
                  Task = rep(df$Task, df$Task),
                  id = seq_len(sum(df$Task)))
> df2
Name Task id
1 John 4 1
2 John 4 2
3 John 4 3
4 John 4 4
5 Michael 1 5
6 Florence 3 6
7 Florence 3 7
8 Florence 3 8
Now I can reshape wide using the dcast function from reshape2
reshape2::dcast(df2, Name+Task ~ id, fun.aggregate = length, value.var="id")
Name Task 1 2 3 4 5 6 7 8
1 John 4 1 1 1 1 0 0 0 0
2 Michael 1 0 0 0 0 1 0 0 0
3 Florence 3 0 0 0 0 0 1 1 1
A solution with diag(), since the added values have the shape of a diagonal matrix, just with repeated columns:
n <- length(dat$Task)
# Column i of the n x n identity matrix is the indicator for person i; repeat
# it Task[i] times so every person gets one column per task.
cbind(dat, diag(n)[, rep(seq_len(n), dat$Task), drop = FALSE])
Person Task 1 2 3 4 5 6 7 8
1 John 4 1 1 1 1 0 0 0 0
2 Michael 1 0 0 0 0 1 0 0 0
3 Florence 3 0 0 0 0 0 1 1 1
Data:
dat <- structure(list(Person = c("John", "Michael", "Florence"), Task = c(4L,
1L, 3L)), class = "data.frame", row.names = c(NA, -3L))

How to get the number of consecutive zeroes from a column in a dataframe

I'm trying to work out how to get the number of consecutive zeroes for a given column for a dataframe.
Here is a dataframe:
data <- data.frame(id = c(1,1,1,1,1,1,2,2,2,2,2,2), value = c(1,0,0,1,0,0,0,0,0,0,4,3))
This would be the desired output:
id value consec
1 1 0
1 0 2
1 0 2
1 1 0
1 0 2
1 0 2
2 0 4
2 0 4
2 0 4
2 0 4
2 4 0
2 3 0
Any ideas on how to achieve this output?
Many thanks
You can do:
data$consec <- with(data, ave(value, value, cumsum(value != 0), id, FUN = length) - (value != 0))
data
id value consec
1 1 1 0
2 1 0 2
3 1 0 2
4 1 1 0
5 1 0 2
6 1 0 2
7 2 0 4
8 2 0 4
9 2 0 4
10 2 0 4
11 2 4 0
12 2 3 0
Here's a base R solution using interaction and rle (run-length encoding):
rlid <- rle(as.numeric(interaction(data$id, data$value)))$lengths
data$consec <- replace(rep(rlid, rlid), data$value != 0, 0)
data
#> id value consec
#> 1 1 1 0
#> 2 1 0 2
#> 3 1 0 2
#> 4 1 1 0
#> 5 1 0 2
#> 6 1 0 2
#> 7 2 0 4
#> 8 2 0 4
#> 9 2 0 4
#> 10 2 0 4
#> 11 2 4 0
#> 12 2 3 0
This dplyr solution will work. Using cumulative sum we keep track of every time a new non-zero entry occurs, and for each of these groups we count the number of zeros:
data %>%
  group_by(id) %>%
  # start a new run at every non-zero value, not only at value == 1
  mutate(flag_0 = cumsum(value != 0)) %>%
  group_by(id, flag_0) %>%
  # within a run, zeros get the number of zeros in that run; non-zeros get 0
  mutate(consec = ifelse(value == 0, sum(value == 0), 0)) %>%
  ungroup()
#> # A tibble: 12 x 4
#>       id value flag_0 consec
#>    <dbl> <dbl>  <int>  <dbl>
#>  1     1     1      1      0
#>  2     1     0      1      2
#>  3     1     0      1      2
#>  4     1     1      2      0
#>  5     1     0      2      2
#>  6     1     0      2      2
#>  7     2     0      0      4
#>  8     2     0      0      4
#>  9     2     0      0      4
#> 10     2     0      0      4
#> 11     2     4      1      0
#> 12     2     3      2      0
This tidyverse approach can also do the job
library(tidyverse)
data %>% group_by(id) %>%
mutate(value2 =cumsum(value)) %>% group_by(id, value, value2) %>%
mutate(consec = ifelse(value == 0, n(), 0)) %>%
ungroup() %>% select(-value2)
# A tibble: 12 x 3
id value consec
<dbl> <dbl> <dbl>
1 1 1 0
2 1 0 2
3 1 0 2
4 1 1 0
5 1 0 2
6 1 0 2
7 2 0 4
8 2 0 4
9 2 0 4
10 2 0 4
11 2 4 0
12 2 3 0

fill values between interval grouped by ID

I have a data set where subjects have a value of 1 or 0 at different times. I need a function or a piece of code that fills with 1 the values of 0 between the first and last 1 of each subject.
I have tried complete() and fill(), but they are not doing what I want.
I have the following data:
dat = tibble(ID = c(1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3),
TIME = c(1,2,3,4,5,6,7,8,9,10,
1,2,3,4,5,6,7,8,9,10,
1,2,3,4,5,6,7,8,9,10),
DV = c(0,0,1,1,0,0,1,0,0,0,
0,1,0,0,0,0,0,0,0,1,
0,1,0,1,0,1,0,1,0,0))
# A tibble: 30 x 3
ID TIME DV
<dbl> <dbl> <dbl>
1 1 1 0
2 1 2 0
3 1 3 1
4 1 4 1
5 1 5 0
6 1 6 0
7 1 7 1
8 1 8 0
9 1 9 0
10 1 10 0
# ... with 20 more rows
I need the following output as shown in DV2:
dat = tibble(ID = c(1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3),
TIME = c(1,2,3,4,5,6,7,8,9,10,
1,2,3,4,5,6,7,8,9,10,
1,2,3,4,5,6,7,8,9,10),
DV = c(0,0,1,1,0,0,1,0,0,0,
0,1,0,0,0,0,0,0,0,1,
0,1,0,1,0,1,0,1,0,0),
DV2 = c(0,0,1,1,1,1,1,0,0,0,
0,1,1,1,1,1,1,1,1,1,
0,1,1,1,1,1,1,1,0,0))
# A tibble: 30 x 4
ID TIME DV DV2
<dbl> <dbl> <dbl> <dbl>
1 1 1 0 0
2 1 2 0 0
3 1 3 1 1
4 1 4 1 1
5 1 5 0 1
6 1 6 0 1
7 1 7 1 1
8 1 8 0 0
9 1 9 0 0
10 1 10 0 0
# ... with 20 more rows
With dplyr, you can do:
dat %>%
  group_by(ID) %>%
  # A row lies inside the span when a 1 has occurred at or before it (forward
  # running count) and a 1 occurs at or after it (reverse running count).
  # An ID with no 1 at all simply gets DV2 = 0 everywhere.
  mutate(DV2 = as.numeric(cumsum(DV == 1) > 0 &
                            rev(cumsum(rev(DV == 1))) > 0)) %>%
  ungroup()
ID TIME DV DV2
<dbl> <dbl> <dbl> <dbl>
1 1 1 0 0
2 1 2 0 0
3 1 3 1 1
4 1 4 1 1
5 1 5 0 1
6 1 6 0 1
7 1 7 1 1
8 1 8 0 0
9 1 9 0 0
10 1 10 0 0
We can create a helper function, and apply it on every group, i.e.
# Set every element between the first and the last 1 of `x` to 1.
# `x` is returned unchanged when it contains no 1 (including when it is empty).
f1 <- function(x) {
  ones <- which(x == 1)
  # Without this guard ones[1] is NA for an input with no 1 and `:` fails.
  if (length(ones) > 0) {
    x[ones[1]:ones[length(ones)]] <- 1
  }
  x
}
with(dat, ave(DV, ID, FUN = f1))
#[1] 0 0 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 0

Resources