Splitting rows into multiple rows and adding NA when needed - r

I have the following dataframe:
A B C D E F G
1/2 3/4 4/5/6 7/8 9/10 11 12/13/14/15
And want to split to:
A B C D E F G
1 3 4 7 9 11 12
2 4 5 8 10 NA 13
NA NA 6 NA NA NA 14
NA NA NA NA NA NA 15
Is there any compact way to do it?
I've thought about separating each column into a list, using something such as
list_of_dfs
and for each df do:
modified_dfs %>% separate_rows(colnames(each_df), sep = "/")
then doing a merge of all dataframes created in the process...
merge(modified_dfs)

It is easier with cSplit
library(splitstackshape)
# Split every column of df1 on "/" and stack the pieces as rows (long format);
# columns that yield fewer pieces are filled with NA, as the output below shows.
cSplit(df1, splitCols = names(df1), sep = "/", direction = "long")
-output
A B C D E F G
<int> <int> <int> <int> <int> <int> <int>
1: 1 3 4 7 9 11 12
2: 2 4 5 8 10 NA 13
3: NA NA 6 NA NA NA 14
4: NA NA NA NA NA NA 15
data
df1 <- structure(list(A = "1/2", B = "3/4", C = "4/5/6", D = "7/8",
E = "9/10", F = 11L, G = "12/13/14/15"), class = "data.frame",
row.names = c(NA,
-1L))

Related

Group_by id and count the consecutive NA's and then restart counting when a new series of NA's is encountered

I have a dataframe like this:
df <- data_frame(id = c(rep('A', 10), rep('B', 10)),
value = c(1:3, rep(NA, 2), 1:2, rep(NA, 3), 1, rep(NA, 4), 1:3, rep(NA, 2)))
I need to count the number of consecutive NA's in the value column. The count needs to be grouped by ID, and it needs to restart at 1 every time a new NA or new series of NA's is encountered. The expected output should look like this:
df$expected_output <- c(rep(NA, 3), 1:2, rep(NA, 2), 1:3, NA, 1:4, rep(NA, 3), 1:2)
If anyone can give me a dplyr solution that would also be great :)
I've tried a few things but nothing is giving any sort of sensible result. Thanks in advance!
A solution using dplyr and data.table.
library(dplyr)
library(data.table)
# Number each run of consecutive NAs within every id.
# rleid() gives every run of identical consecutive values its own id, so
# grouping by (id, run) isolates each run; rows holding a value stay NA.
df2 <- df %>%
  group_by(id) %>%
  mutate(run = rleid(value)) %>%
  group_by(id, run) %>%
  mutate(expected_output = if_else(is.na(value), row_number(), NA_integer_)) %>%
  ungroup() %>%
  select(-run)
df2
# # A tibble: 20 x 3
# id value expected_output
# <chr> <dbl> <int>
# 1 A 1 NA
# 2 A 2 NA
# 3 A 3 NA
# 4 A NA 1
# 5 A NA 2
# 6 A 1 NA
# 7 A 2 NA
# 8 A NA 1
# 9 A NA 2
# 10 A NA 3
# 11 B 1 NA
# 12 B NA 1
# 13 B NA 2
# 14 B NA 3
# 15 B NA 4
# 16 B 1 NA
# 17 B 2 NA
# 18 B 3 NA
# 19 B NA 1
# 20 B NA 2
We can use rle to get length of groups that are or are not na, and use purrr::map2 to apply seq if they are NA and get the growing count or just fill in with NA values using rep.
library(tidyverse)
# Count consecutive NAs in `x`, restarting at 1 for every new run of NAs.
#
# x: a vector that may contain NA.
# Returns an integer vector of the same length as `x`: 1, 2, ... along each
# run of NAs and NA_integer_ wherever `x` holds a value. Always integer, also
# for empty input or input without any NA.
count_na <- function(x) {
  r <- rle(is.na(x))
  consec <- map2(
    r$lengths, r$values,
    # NA runs get a growing count; value runs are padded with integer NAs so
    # the result type does not depend on the data.
    function(len, is_na_run) {
      if (is_na_run) seq_len(len) else rep(NA_integer_, len)
    }
  )
  # unlist() of an empty list is NULL; as.integer() turns that into integer(0).
  as.integer(unlist(consec))
}
# Apply count_na() separately within each id so that a run of NAs reaching
# across two ids is not counted as one run, then drop the grouping again.
df %>%
  group_by(id) %>%
  mutate(expected_output = count_na(value)) %>%
  ungroup()
#> # A tibble: 20 × 3
#> id value expected_output
#> <chr> <dbl> <int>
#> 1 A 1 NA
#> 2 A 2 NA
#> 3 A 3 NA
#> 4 A NA 1
#> 5 A NA 2
#> 6 A 1 NA
#> 7 A 2 NA
#> 8 A NA 1
#> 9 A NA 2
#> 10 A NA 3
#> 11 B 1 NA
#> 12 B NA 1
#> 13 B NA 2
#> 14 B NA 3
#> 15 B NA 4
#> 16 B 1 NA
#> 17 B 2 NA
#> 18 B 3 NA
#> 19 B NA 1
#> 20 B NA 2
Here is a solution using rle:
# Count consecutive NAs, restarting at 1 for every new run and for every new id.
is_na <- is.na(df$value)
# Encode runs over the (id, is-NA) pair so a run of NAs is also cut where id
# changes.
x <- rle(paste(df$id, is_na))
# Every element of a run shares the same NA status, so look at its last one.
na_run <- is_na[cumsum(x$lengths)]
# Initialise the full column first so its length never depends on where the
# NAs sit.
df$new <- rep(NA_integer_, nrow(df))
df$new[is_na] <- sequence(x$lengths[na_run])
# A tibble: 20 x 3
id value new
<chr> <dbl> <int>
1 A 1 NA
2 A 2 NA
3 A 3 NA
4 A NA 1
5 A NA 2
6 A 1 NA
7 A 2 NA
8 A NA 1
9 A NA 2
10 A NA 3
11 B 1 NA
12 B NA 1
13 B NA 2
14 B NA 3
15 B NA 4
16 B 1 NA
17 B 2 NA
18 B 3 NA
19 B NA 1
20 B NA 2
Yet another solution:
library(tidyverse)
# Label runs of identical consecutive values, then number the rows of each
# (id, run) group; only NA rows keep their number.
df %>%
  mutate(run_id = data.table::rleid(value)) %>%
  group_by(id, run_id) %>%
  mutate(eout = ifelse(is.na(value), row_number(), NA_real_)) %>%
  ungroup() %>%
  select(-run_id)
#> # A tibble: 20 × 4
#> id value expected_output eout
#> <chr> <dbl> <int> <dbl>
#> 1 A 1 NA NA
#> 2 A 2 NA NA
#> 3 A 3 NA NA
#> 4 A NA 1 1
#> 5 A NA 2 2
#> 6 A 1 NA NA
#> 7 A 2 NA NA
#> 8 A NA 1 1
#> 9 A NA 2 2
#> 10 A NA 3 3
#> 11 B 1 NA NA
#> 12 B NA 1 1
#> 13 B NA 2 2
#> 14 B NA 3 3
#> 15 B NA 4 4
#> 16 B 1 NA NA
#> 17 B 2 NA NA
#> 18 B 3 NA NA
#> 19 B NA 1 1
#> 20 B NA 2 2

Assign ID to column with NA's

This must be easy but my brain is blocked!
I have this dataframe:
col1
<chr>
1 A
2 B
3 NA
4 C
5 D
6 NA
7 NA
8 E
9 NA
10 F
df <- structure(list(col1 = c("A", "B", NA, "C", "D", NA, NA, "E",
NA, "F")), row.names = c(NA, -10L), class = c("tbl_df", "tbl",
"data.frame"))
I want to add a column with uniqueID only for values that are not NA with tidyverse.
Expected output:
col1 uniqueID
<chr> <dbl>
1 A 1
2 B 2
3 NA NA
4 C 3
5 D 4
6 NA NA
7 NA NA
8 E 5
9 NA NA
10 F 6
I have tried: n(), row_number(), cur_group_id ....
We could do this easily in data.table. Specify the condition in i i.e. non-NA elements in 'col1', create the column 'uniqueID' with the sequence of elements by assignment (:=)
library(data.table)
# setDT() converts df to a data.table by reference. The i expression selects the
# rows where col1 is not NA, and := adds uniqueID in place, numbering only those
# rows 1..N; rows that were not selected are left as NA.
setDT(df)[!is.na(col1), uniqueID := seq_len(.N)]
-output
df
col1 uniqueID
1: A 1
2: B 2
3: <NA> NA
4: C 3
5: D 4
6: <NA> NA
7: <NA> NA
8: E 5
9: <NA> NA
10: F 6
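In dplyr, we can use replace. Note that because the replacement is made inside the character column col1, the resulting uniqueID is a character column; wrap the call in as.integer() if a numeric ID is needed.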
library(dplyr)
df %>%
mutate(uniqueID = replace(col1, !is.na(col1),
seq_len(sum(!is.na(col1)))))
-output
# A tibble: 10 x 2
col1 uniqueID
<chr> <chr>
1 A 1
2 B 2
3 <NA> <NA>
4 C 3
5 D 4
6 <NA> <NA>
7 <NA> <NA>
8 E 5
9 <NA> <NA>
10 F 6
Another approach:
library(dplyr)
df %>%
mutate(UniqueID = cumsum(!is.na(col1)),
UniqueID = if_else(is.na(col1), NA_integer_, UniqueID))
# A tibble: 10 x 2
col1 UniqueID
<chr> <int>
1 A 1
2 B 2
3 NA NA
4 C 3
5 D 4
6 NA NA
7 NA NA
8 E 5
9 NA NA
10 F 6
A base R option using match + na.omit + unique
transform(
df,
uniqueID = match(col1, na.omit(unique(col1)))
)
gives
col1 uniqueID
1 A 1
2 B 2
3 <NA> NA
4 C 3
5 D 4
6 <NA> NA
7 <NA> NA
8 E 5
9 <NA> NA
10 F 6
A weird tidyverse solution:
library(dplyr)
df %>%
mutate(id = ifelse(is.na(col1), 0, 1),
id = cumsum(id == 1),
id = ifelse(is.na(col1), NA, id))
# A tibble: 10 x 2
col1 id
<chr> <int>
1 A 1
2 B 2
3 NA NA
4 C 3
5 D 4
6 NA NA
7 NA NA
8 E 5
9 NA NA
10 F 6

Update a variable if dplyr filter conditions are met

With the command df %>% filter(is.na(df)[,2:4]), filter returns a new df containing only the rows that have NA's in columns 2, 3 and 4. What I want is not a new subsetted df, but rather to assign, for example, "1" to a new variable called "Exclude" in the actual df for those rows.
This example with mutate was not exactly what I was looking for, but close:
Use dplyr's filter and mutate to generate a new variable
Also I would need the same to happen with other filter conditions.
For example, I have the following:
df <- data.frame(A = 1:6, B = 11:16, C = 21:26, D = 31:36)
df[3,2:4] <- NA
df[5,2:4] <- NA
df
> df
A B C D
1 1 11 21 31
2 2 12 22 32
3 3 NA NA NA
4 4 14 24 34
5 5 NA NA NA
6 6 16 26 36
and would like
> df
A B C D Exclude
1 1 11 21 31 NA
2 2 12 22 32 NA
3 3 NA NA NA 1
4 4 14 24 34 NA
5 5 NA NA NA 1
6 6 16 26 36 NA
Any good ideas on how the filter subset could be used to update the original df easily? The hard workaround would be to generate this subset, create the new variable for all of its rows and then join it back, but that is not tidy code.
We can do this with base R using vectorized rowSums
# rowSums(is.na(df[-1])) counts the NAs per row in every column except the first.
# `!` turns that count into TRUE (no NA in the row) or FALSE (at least one NA).
# NA^TRUE (NA^1) is NA, whereas NA^FALSE (NA^0) is 1, so rows containing an NA
# get 1 and all other rows get NA.
df$Exclude <- NA^!rowSums(is.na(df[-1]))
-output
df
# A B C D Exclude
#1 1 11 21 31 NA
#2 2 12 22 32 NA
#3 3 NA NA NA 1
#4 4 14 24 34 NA
#5 5 NA NA NA 1
#6 6 16 26 36 NA
Does this work:
library(dplyr)
df %>%
  rowwise() %>%
  mutate(
    # Only inspect B:D (columns 2-4, as asked); everything() would also flag
    # rows whose column A is NA.
    Exclude = +any(is.na(c_across(B:D))),
    # Turn the 0 of complete rows into NA so only excluded rows carry a 1.
    Exclude = na_if(Exclude, 0)
  )
# A tibble: 6 x 5
# Rowwise:
A B C D Exclude
<int> <int> <int> <int> <int>
1 1 11 21 31 NA
2 2 12 22 32 NA
3 3 NA NA NA 1
4 4 14 24 34 NA
5 5 NA NA NA 1
6 6 16 26 36 NA
Using anyNA.
df %>% mutate(Exclude=ifelse(apply(df[2:4], 1, anyNA), 1, NA))
# A B C D Exclude
# 1 1 11 21 31 NA
# 2 2 12 22 32 NA
# 3 3 NA NA NA 1
# 4 4 14 24 34 NA
# 5 5 NA NA NA 1
# 6 6 16 26 36 NA
Or just
df$Exclude <- ifelse(apply(df[2:4], 1, anyNA), 1, NA)
Another one-line solution:
# Rows with at least one NA in columns 2:4 get 1; complete rows get NA.
# (Converting the logical flag with as.numeric() would give 0, not NA, for
# complete rows, which does not match the requested output.)
df$Exclude <- ifelse(complete.cases(df[2:4]), NA, 1)
Use rowwise, sum over all numeric columns, assign 1 or NA in ifelse.
df <- data.frame(A = 1:6, B = 11:16, C = 21:26, D = 31:36)
df[3, 2:4] <- NA
df[5, 2:4] <- NA
library(tidyverse)
df %>%
rowwise() %>%
mutate(Exclude = ifelse(
is.na(sum(c_across(where(is.numeric)))), 1, NA
))
#> # A tibble: 6 x 5
#> # Rowwise:
#> A B C D Exclude
#> <int> <int> <int> <int> <dbl>
#> 1 1 11 21 31 NA
#> 2 2 12 22 32 NA
#> 3 3 NA NA NA 1
#> 4 4 14 24 34 NA
#> 5 5 NA NA NA 1
#> 6 6 16 26 36 NA

how to use R to transfer hourly passenger OD data to od matrix

I'm trying to convert hourly passenger origin-destination (OD) data into an OD matrix.
My current dataframe looks something like this:
Hour Ostation Dstation Passengers
8 A B 2
8 A C 3
8 A D 4
8 B C 5
8 B D 6
8 C D 1
10 A B 4
10 A C 5
10 A D 6
10 B C 1
10 B D 2
10 C D 3
And I'd like for HOUR = 8:
A B C D
A
B 2
C 3 5
D 4 6 1
And HOUR = 10:
A B C D
A
B 4
C 5 1
D 6 2 3
I use split and table:
ODdata$Ostation <- factor(ODdata$Ostation)
ODdata$Dstation <- factor(ODdata$Dstation)
ODtable <-lapply(split(ODdata, ODdata$Hour),
function(x) table(x$Ostation, x$Dstation))
I can get the OD matrix, but the values are row counts, not the number of Passengers.
You should use dcast from reshape2 in place of table. Set drop = FALSE to keep all factor levels in the output matrices.
library(reshape2)
ODtable <-lapply(split(ODdata, ODdata$Hour),
function(x) dcast(Dstation ~ Ostation , data = x,
value.var = "Passengers", drop = FALSE))
ODtable
#$`8`
# Dstation A B C D
#1 A NA NA NA NA
#2 B 2 NA NA NA
#3 C 3 5 NA NA
#4 D 4 6 1 NA
#
#$`10`
# Dstation A B C D
#1 A NA NA NA NA
#2 B 4 NA NA NA
#3 C 5 1 NA NA
#4 D 6 2 3 NA
sample data
ODdata <- data.frame(Hour = c(rep(8,6), rep(10,6)),
Ostation = factor(c("A","A","A","B","B","C","A","A","A","B","B","C"),
levels = c('A', 'B', 'C', 'D')),
Dstation = factor(c("B","C","D","C","D","D","B","C","D","C","D","D"),
levels = c('A', 'B', 'C', 'D')),
Passengers = c(2,3,4,5,6,1,4,5,6,1,2,3))

Insert a row of NAs after each group of data using data.table

I am trying to add a row of NAs after each group of data in R.
A similar question has been asked earlier. Insert a blank row after each group of data.
The accepted answer works fine in this case too as follows.
group <- c("a","b","b","c","c","c","d","d","d","d")
xvalue <- c(16:25)
yvalue <- c(1:10)
df <- data.frame(cbind(group,xvalue,yvalue))
df_new <- as.data.frame(lapply(df, as.character), stringsAsFactors = FALSE)
head(do.call(rbind, by(df_new, df$group, rbind, NA)), -1 )
group xvalue yvalue
a.1 a 16 1
a.2 <NA> <NA> <NA>
b.2 b 17 2
b.3 b 18 3
b.31 <NA> <NA> <NA>
c.4 c 19 4
c.5 c 20 5
c.6 c 21 6
c.41 <NA> <NA> <NA>
d.7 d 22 7
d.8 d 23 8
d.9 d 24 9
d.10 d 25 10
How can I speed this up using data.table for a large data.frame?
You could try
# Make sure group is a character column (presumably it may be a factor,
# depending on how df was built -- confirm).
df$group <- as.character(df$group)
# Step by step:
#   .SD[1:(.N+1)] by group  - index one row past the end of each group, which
#                             appends an all-NA row after the group's data
#   [is.na(xvalue), group:=NA] - the by column is still filled in on those extra
#                             rows, so blank it wherever xvalue is NA (this would
#                             also blank group on real rows whose xvalue is NA)
#   [!.N]                   - drop the last row, i.e. the NA row after the final
#                             group
setDT(df)[, .SD[1:(.N+1)], by=group][is.na(xvalue), group:=NA][!.N]
# group xvalue yvalue
#1: a 16 1
#2: NA NA NA
#3: b 17 2
#4: b 18 3
#5: NA NA NA
#6: c 19 4
#7: c 20 5
#8: c 21 6
#9: NA NA NA
#10: d 22 7
#11: d 23 8
#12: d 24 9
#13: d 25 10
Or as suggested by @David Arenburg
setDT(df)[, indx := group][, .SD[1:(.N+1)], indx][,indx := NULL][!.N]
Or
setDT(df)[df[,.I[1:(.N+1)], group]$V1][!.N]
Or it could be further simplified based on @eddi's comments
setDT(df)[df[, c(.I, NA), group]$V1][!.N]
One way I could think of is to construct a vector first as follows:
# Interleave the elements of `x` with NA: x[1], NA, x[2], NA, ...
# Returns a vector twice as long as `x` (empty input gives an empty result).
foo <- function(x) {
  n <- length(x)
  padded <- c(x, rep.int(NA, n))
  # Pair position i of x with position n + i of the NA padding, then read the
  # pairs off column by column.
  pick <- as.vector(rbind(seq_len(n), n + seq_len(n)))
  padded[pick]
}
join_values = head(foo(unique(df_new$group)), -1L)
# [1] "a" NA "b" NA "c" NA "d"
And then setkey() and join.
setkey(setDT(df_new), group)
df_new[.(join_values), allow.cartesian=TRUE]
# group xvalue yvalue
# 1: a 16 1
# 2: NA NA NA
# 3: b 17 2
# 4: b 18 3
# 5: NA NA NA
# 6: c 19 4
# 7: c 20 5
# 8: c 21 6
# 9: NA NA NA
# 10: d 22 7
# 11: d 23 8
# 12: d 24 9
# 13: d 25 10

Resources