How to split a dataframe by specific rows in R

I have data that looks like this:
data <- structure(list(A = c("1", "1", "1", "A", "10", "10", "B", "200"), B = c("2", "2", "2", "B", "20", "20", "C", "300"), C = c("3","3", "3", "C", "30", "30", "D", "400"), D = c("4", "4", "4", "D", "40", "40", NA, NA)), row.names = c(NA, -8L), class = c("tbl_df","tbl", "data.frame"))
data
> data
# A tibble: 8 x 4
A B C D
<chr> <chr> <chr> <chr>
1 1 2 3 4
2 1 2 3 4
3 1 2 3 4
4 A B C D
5 10 20 30 40
6 10 20 30 40
7 B C D NA
8 200 300 400 NA
The rows were bound together incorrectly, and I want to split the data into 3 sub data frames (d1, d2 and d3) like this:
NOTE: In my real situation, d1, d2 and d3 have different nrow(). I set nrow(d1) = 3, nrow(d2) = 2 and nrow(d3) = 1 just to simplify the question in this example.
d1 <- data.frame(A = rep(1,3), B = rep(2,3), C = rep(3,3), D = rep(4,3))
d2 <- data.frame(A = rep(10,2), B = rep(20,2), C = rep(30,2), D = rep(40,2))
d3 <- data.frame( B = 200, C = 300, D = 400)
> d1
A B C D
1 1 2 3 4
2 1 2 3 4
3 1 2 3 4
> d2
A B C D
1 10 20 30 40
2 10 20 30 40
> d3
B C D
1 200 300 400
And then I could bind them correctly using bind_rows from dplyr
bind_rows(d1, d2, d3) %>% as_tibble()
# A tibble: 6 x 4
A B C D
<dbl> <dbl> <dbl> <dbl>
1 1 2 3 4
2 1 2 3 4
3 1 2 3 4
4 10 20 30 40
5 10 20 30 40
6 NA 200 300 400
The problem is that I don't know how to get d1, d2 and d3 from data.
Any help will be highly appreciated!

Here is a tidyverse solution.
process_df takes a data frame, uses its first row as the column names, drops any columns whose name is NA, and then removes that first row.
process_df <- function(df, ...) {
  df %>%
    set_names(slice(., 1)) %>%
    select(which(!is.na(names(.)))) %>%
    slice(-1)
}
Add a header row that just contains the column names.
Use rowwise() and c_across() to get the values of all columns by row. Use this to identify which rows are header rows.
group_map will apply a function over each group and bind_rows will combine the results.
data %>%
  add_row(!!!set_names(names(.)), .before = 1) %>%
  rowwise() %>%
  mutate(
    group = all(is.na(c_across()) | c_across() %in% names(.))
  ) %>%
  ungroup() %>%
  mutate(group = cumsum(group)) %>%
  group_by(group) %>%
  group_map(process_df) %>%
  bind_rows()
#> # A tibble: 6 x 4
#> A B C D
#> <chr> <chr> <chr> <chr>
#> 1 1 2 3 4
#> 2 1 2 3 4
#> 3 1 2 3 4
#> 4 10 20 30 40
#> 5 10 20 30 40
#> 6 NA 200 300 400
Explanation of the usage of !!! in add_row
set_names(names(.)) creates a named vector that represents the row we want to add. However, add_row doesn't accept a named vector - it wants the values to be specified as arguments.
Here is a simplified example.
new_row <- c(speed = 1, dist = 2)
add_row doesn't accept a named vector, so this doesn't work.
cars %>% add_row(new_row, .before = TRUE)
# (Error)
!!! will unpack the vector as arguments to the function.
cars %>% add_row(!!!new_row, .before = TRUE)
# (Works)
!!! above essentially results in this:
cars %>% add_row(speed = 1, dist = 2, .before = TRUE)

Does this work (this uses the original 5-row version of the data, from before the question was edited):
data
# A tibble: 5 x 4
A B C D
<chr> <chr> <chr> <chr>
1 1 2 3 4
2 A B C D
3 10 20 30 40
4 B C D NA
5 200 300 400 NA
data <- rbind(LETTERS[1:4],data)
data
# A tibble: 6 x 4
A B C D
<chr> <chr> <chr> <chr>
1 A B C D
2 1 2 3 4
3 A B C D
4 10 20 30 40
5 B C D NA
6 200 300 400 NA
split(data, rep(1:ceiling(nrow(data)/2), each = 2))
$`1`
# A tibble: 2 x 4
A B C D
<chr> <chr> <chr> <chr>
1 A B C D
2 1 2 3 4
$`2`
# A tibble: 2 x 4
A B C D
<chr> <chr> <chr> <chr>
1 A B C D
2 10 20 30 40
$`3`
# A tibble: 2 x 4
A B C D
<chr> <chr> <chr> <chr>
1 B C D NA
2 200 300 400 NA
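To turn this list of two-row chunks into named data frames that bind_rows can combine, here is one possible follow-up sketch (my own addition, not part of the answer above; it assumes the first row of each chunk holds the column names and that columns with an NA name should be dropped):
library(dplyr)
library(purrr)
chunks <- split(data, rep(1:ceiling(nrow(data)/2), each = 2))
cleaned <- map(chunks, function(x) {
  nms  <- as.character(unlist(x[1, ]))   # first row holds the column names
  keep <- !is.na(nms)                    # drop columns whose name is NA
  set_names(x[2, keep, drop = FALSE], nms[keep])
})
bind_rows(cleaned)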

Base R solution:
Map(function(x) {
  setNames(data.frame(t(x[, 2, drop = FALSE])), x[, 1])[, !is.na(x[, 1])]
}, split.default(cbind(X0 = names(df), data.frame(t(df))),
                 c(0, seq_len(nrow(df)) %/% 2)))
Including pushing separate data.frames to Global Environment:
list2env(setNames(Map(function(x) {
  setNames(data.frame(t(x[, 2, drop = FALSE])), x[, 1])[, !is.na(x[, 1])]
}, split.default(cbind(X0 = names(df), data.frame(t(df))),
                 c(0, seq_len(nrow(df)) %/% 2))),
paste0('d', seq_len(ceiling(nrow(df) / 2)))), .GlobalEnv)
Tidyverse Solution:
library(tidyverse)
df %>%
  rbind(names(df), .) %>%
  split(cumsum(seq_len(nrow(.)) %% 2)) %>%
  Map(function(x) {setNames(x[2, ], x[1, ])[, complete.cases(t(x))]}, .) %>%
  set_names(str_c('d', names(.))) %>%
  list2env(., .GlobalEnv)
Note: solution adjusted to reflect the edit to the question:
rdf <- type.convert(data.frame(t(rbind(names(df), df))))
Map(function(x) {
  y <- setNames(t(x[, -1, drop = FALSE]), x[, 1])
  y[, !is.na(colSums(y))]
}, split.default(rdf, cumsum(!sapply(rdf, is.integer))))
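To see why split.default() forms the right blocks here, it can help to inspect the intermediate rdf: after transposing, each original row becomes a column, type.convert() turns the purely numeric ones into integer columns, and the header-like rows stay non-integer, so cumsum(!sapply(rdf, is.integer)) numbers the blocks. A small illustrative sketch (using the updated data):
rdf <- type.convert(data.frame(t(rbind(names(df), df))))
sapply(rdf, is.integer)          # value rows -> TRUE, header rows -> FALSE
cumsum(!sapply(rdf, is.integer)) # block id used by split.default()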
New solution including push to Global Env:
rdf <- type.convert(data.frame(t(rbind(names(df), df))))
dflist <- Map(function(x) {
  y <- setNames(t(x[, -1, drop = FALSE]), x[, 1])
  y[, !is.na(colSums(y))]
}, split.default(rdf, cumsum(!sapply(rdf, is.integer))))
list2env(setNames(dflist, paste0('d', names(dflist))), .GlobalEnv)
Adjusted Tidyverse solution:
df %>%
  rbind(names(.), .) %>%
  t() %>%
  data.frame() %>%
  type.convert() %>%
  split.default(cumsum(!sapply(., is.integer))) %>%
  Map(function(x) {
    y <- setNames(t(x[, -1, drop = FALSE]), x[, 1])
    data.frame(y[, !is.na(colSums(y)), drop = FALSE])
  }, .) %>%
  set_names(str_c('d', names(.))) %>%
  list2env(., .GlobalEnv)
Data:
df <- structure(list(A = c("1", "A", "10", "B", "200"), B = c("2", "B", "20", "C", "300"), C = c("3", "C", "30", "D", "400"), D = c("4","D", "40", NA, NA)), row.names = c(NA, -5L), class = c("tbl_df", "tbl", "data.frame"))
Updated Data:
df <- structure(list(A = c("1", "1", "1", "A", "10", "10", "B", "200"), B = c("2", "2", "2", "B", "20", "20", "C", "300"), C = c("3","3", "3", "C", "30", "30", "D", "400"), D = c("4", "4", "4", "D", "40", "40", NA, NA)), row.names = c(NA, -8L), class = c("tbl_df","tbl", "data.frame"))

Related

Replacing NA values with the next value in a column in R

I'm trying to mutate a column in a data frame using the lag() function, without producing NA values. Let me create an example:
df <- data.frame("Score" = as.numeric(c("20", "10", "15", "30", "15", "10")),
                 "Time" = c("1", "2", "1", "2", "1", "2"),
                 "Team" = c("A", "A", "B", "B", "C", "C"))
After that, I created a new column named Diff that holds the difference in Score within each Team:
df <- df %>%
  group_by(Team) %>%
  mutate(Diff = Score - lag(Score))
My problem is that this method creates NA values, obviously:
Score Time Team Diff
20 1 A NA
10 2 A -10
15 1 B NA
30 2 B 15
15 1 C NA
10 2 C -5
My goal is to have this at the end:
Score Time Team Diff
20 1 A -10
10 2 A -10
15 1 B 15
30 2 B 15
15 1 C -5
10 2 C -5
I've tried mutating again, using the case_when() function to replace the NA with the next value, but it also didn't work:
df %>%
  group_by(Team) %>%
  mutate(Diff = Score - lag(Score)) %>%
  mutate(Diff = case_when(
    NA ~ lead(Diff)
  ))
Anyway, how do I replace the NA values with the next Diff value?
Thanks a lot!
Just use fill() after the fact:
library(tidyverse)
df <- data.frame("Score" = as.numeric(c("20", "10", "15", "30", "15", "10")),
                 "Time" = c("1", "2", "1", "2", "1", "2"),
                 "Team" = c("A", "A", "B", "B", "C", "C"))
df <- df %>%
  group_by(Team) %>%
  mutate(Diff = Score - lag(Score)) %>%
  fill(Diff, .direction = 'up')
df
# output
# Score Time Team Diff
# <dbl> <chr> <chr> <dbl>
#1 20 1 A -10
#2 10 2 A -10
#3 15 1 B 15
#4 30 2 B 15
#5 15 1 C -5
#6 10 2 C -5
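An alternative sketch (my suggestion, not part of the answer above), assuming each NA is immediately followed by a non-NA value within its group, is to coalesce the NA with the next value via lead():
library(dplyr)
df %>%
  group_by(Team) %>%
  mutate(Diff = Score - lag(Score)) %>%
  # NA rows (the first in each group) take the next row's Diff
  mutate(Diff = coalesce(Diff, lead(Diff))) %>%
  ungroup()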

How can I calculate the correlation between groups in R using dplyr?

Let's say I have a data frame in R that looks like this:
var = c(rep("A",3),rep("B",3),rep("C",3),rep("D",3),rep("E",3))
y = rnorm(15)
data = tibble(var,y);data
With output:
# A tibble: 15 x 2
var y
<chr> <dbl>
1 A -1.23
2 A -0.983
3 A 1.28
4 B -0.268
5 B -0.460
6 B -1.23
7 C 1.87
8 C 0.416
9 C -1.99
10 D 0.289
11 D 1.70
12 D -0.455
13 E -0.648
14 E 0.376
15 E -0.887
I want to calculate the correlation of each distinct pair of variables in R using dplyr.
Ideally I want it to look like this (the third column containing the correlation value of each pair):
var1 var2 value
A    B    cor(A,B)
A    C    cor(A,C)
A    D    cor(A,D)
A    E    cor(A,E)
B    C    cor(B,C)
B    D    cor(B,D)
B    E    cor(B,E)
C    D    cor(C,D)
C    E    cor(C,E)
D    E    cor(D,E)
How can I do that in R? Any help?
Additional
If I have another grouping variable, say group2:
var2 = c(rep("A",3),rep("B",3),rep("C",3),rep("D",3),rep("E",3),rep("F",3),
rep("H",3),rep("I",3))
y2 = rnorm(24)
group2 = c(rep(1,6),rep(2,6),rep(3,6),rep(1,6))
data2 = tibble(var2,group2,y2);data2
which ideally should look like this:
group var1 var2 value
1     A    B    cor(A,B)
1     A    H    cor(A,H)
1     A    I    cor(A,I)
1     B    H    cor(B,H)
1     B    I    cor(B,I)
1     H    I    cor(H,I)
2     C    D    cor(C,D)
3     E    F    cor(E,F)
How can I calculate the correlations between the variables in column var2 within each group of group2?
Here is a one-liner via base R:
data.frame(t(combn(unique(data$var), 2, function(i)
  list(v1 = i[[1]],
       v2 = i[[2]],
       value = cor(data$y[data$var %in% i[[1]]],
                   data$y[data$var %in% i[[2]]])))))
X1 X2 X3
1 A B 0.997249
2 A C 0.7544987
3 A D -0.7924587
4 A E 0.03567887
5 B C 0.8010711
6 B D -0.7450683
7 B E 0.1096579
8 C D -0.1976141
9 C E 0.6828033
10 D E 0.5812632
Another possible solution:
library(tidyverse)
df %>%
  group_by(var) %>%
  group_map(~ data.frame(.x) %>% set_names(.y)) %>%
  bind_cols %>% cor %>%
  {data.frame(row = rownames(.)[row(.)[upper.tri(.)]],
              col = colnames(.)[col(.)[upper.tri(.)]],
              corr = .[upper.tri(.)])}
#> row col corr
#> 1 A B -0.9949738
#> 2 A C -0.9574502
#> 3 B C 0.9815368
#> 4 A D -0.7039708
#> 5 B D 0.6293137
#> 6 C D 0.4690460
#> 7 A E -0.5755463
#> 8 B E 0.4907660
#> 9 C E 0.3150499
#> 10 D E 0.9859711
1) Add an index column 1, 2, 3, 1, 2, 3, ... and then use read.zoo to convert from long to wide. Take the correlation, reshape back to long form using as.data.frame.table, and filter out the desired rows.
library(dplyr)
library(zoo)
DF %>%
  mutate(index = sequence(rle(var)$lengths)) %>%
  read.zoo(index = "index", split = "var") %>%
  cor %>%
  as.data.frame.table(responseName = "cor") %>%
  filter(format(Var1) < format(Var2))
2) At the expense of one more line of code we can substitute pivot_wider for read.zoo.
library(dplyr)
library(tidyr)
DF %>%
  mutate(index = sequence(rle(var)$lengths)) %>%
  pivot_wider(id_cols = index, names_from = "var", values_from = "y") %>%
  select(-index) %>%
  cor %>%
  as.data.frame.table(responseName = "cor") %>%
  filter(format(Var1) < format(Var2))
3) A base solution consists of using combn to get the pairs of var with the indicated function f.
co <- combn(unique(DF$var), 2)
f <- function(v) with(DF, data.frame(t(v), cor = cor(y[var==v[1]], y[var==v[2]])))
do.call("rbind", apply(co, 2, f))
4) We can use list comprehensions via several alternative packages.
library(listcompr)
with(DF,
gen.data.frame(list(x_1, x_2, cor = cor(y[var == x_1], y[var == x_2])),
x_ = unique(var), x_1 < x_2)
)
or
library(comprehenr)
with(DF, {
uv <- unique(var)
to_df(for(u in uv) for(v in uv) if (u < v)
data.frame(u, v, cor = cor(y[var == u], y[var == v])))
})
Note
The input in reproducible form.
DF <-
structure(list(var = c("A", "A", "A", "B", "B", "B", "C", "C",
"C", "D", "D", "D", "E", "E", "E"), y = c(-1.23, -0.983, 1.28,
-0.268, -0.46, -1.23, 1.87, 0.416, -1.99, 0.289, 1.7, -0.455,
-0.648, 0.376, -0.887)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15"))

arrange a complicated data set in R

I have a large data set:
> ncol(d)
[1] 1680
> nrow(d)
[1] 12
that looks like this:
a b c e f g
3 2 5 1 3 6
a b c d e g
1 7 8 4 5 8
a c d e f h #in this row b does not exist
5 10 4 7 5 10
And I need it to look like this:
a b c d e f g h
3 2 5 0 1 3 6 0
1 7 8 4 5 0 8 0
5 0 10 4 7 5 0 10 #and all the other columns ...
Since my data is really long and I have many corrections like this one to make across the whole data set, it is hard to do by hand. I would like to know if there is any way to do this automatically, for example with a logical function or a loop.
Any idea is welcome
Regards
Here's a possible approach using data.table:
library(data.table)
melt(
  setDT(
    setnames(
      data.table::transpose(df1),
      paste(rep(1:(nrow(df1)/2), each = 2), c("name", "value"), sep = "_"))),
  measure = patterns("name", "value"))[
  , dcast(.SD, variable ~ value1, value.var = "value2", fill = 0)]
# variable a b c d e f g h
# 1: 1 3 2 5 0 1 3 6 0
# 2: 2 1 7 8 4 5 0 8 0
# 3: 3 5 0 10 4 7 5 0 10
We could get the alternate rows with a recycling logical vector, construct a data.frame, and pivot it to wide format with pivot_wider:
library(dplyr)
library(tidyr)
library(data.table)
sub1 <- df1[c(TRUE, FALSE),]
sub2 <- df1[c(FALSE, TRUE),]
tibble(ind = c(row(sub1)),
       col1 = factor(unlist(sub1), levels = letters[1:8]),
       col2 = as.integer(unlist(sub2))) %>%
  pivot_wider(names_from = col1, values_from = col2,
              values_fill = list(col2 = 0)) %>%
  select(-ind)
#A tibble: 3 x 8
# a b c d e f g h
# <int> <int> <int> <int> <int> <int> <int> <int>
#1 3 2 5 0 1 3 6 0
#2 1 7 8 4 5 0 8 0
#3 5 0 10 4 7 5 0 10
Or using base R with reshape
out <- reshape(
  data.frame(ind = c(row(sub1)),
             col1 = factor(unlist(sub1), levels = letters[1:8]),
             col2 = as.integer(unlist(sub2))),
  idvar = 'ind', direction = 'wide', timevar = 'col1')[-1]
names(out) <- sub("col2\\.", "", names(out))
out[is.na(out)] <- 0
row.names(out) <- NULL
out
# a b c d e f g h
#1 3 2 5 0 1 3 6 0
#2 1 7 8 4 5 0 8 0
#3 5 0 10 4 7 5 0 10
data
df1 <- structure(list(v1 = c("a", "3", "a", "1", "a", "5"), v2 = c("b",
"2", "b", "7", "c", "10"), v3 = c("c", "5", "c", "8", "d", "4"
), v4 = c("e", "1", "d", "4", "e", "7"), v5 = c("f", "3", "e",
"5", "f", "5"), v6 = c("g", "6", "g", "8", "h", "10")), class = "data.frame",
row.names = c(NA,
-6L))

how can I regroup multiple categorical variables into a new variable

I have a data.frame (df) with 2 columns (A, B):
A B
1 a TCRB
2 a TCRG
3 a TCRB
4 b TCRB
5 b TCRG
6 c TCRB
7 c TCRB
8 c TCRB
9 c TCRB
10 d TCRG
11 d TCRG
12 d TCRG
I want to create a new column "C" that tells me whether each unique value in "A" has both TCRB and TCRG or only one of them (0 = TCRB only, 1 = TCRG only, 2 = both), as follows:
A: a b c d
C: 2 2 0 1
Greatly appreciate any help!
Here's an approach with dplyr:
library(dplyr)
df %>%
  group_by(A) %>%
  dplyr::summarise(C = case_when("TCRB" %in% B & "TCRG" %in% B ~ 2,
                                 "TCRB" %in% B ~ 0,
                                 "TCRG" %in% B ~ 1,
                                 TRUE ~ NA_real_))
# A tibble: 4 x 2
A C
<fct> <dbl>
1 a 2
2 b 2
3 c 0
4 d 1
An option with n_distinct
library(dplyr)
df %>%
  group_by(A) %>%
  summarise(C = n_distinct(B) * !all(B == 'TCRB'))
# A tibble: 4 x 2
# A C
# <chr> <int>
#1 a 2
#2 b 2
#3 c 0
#4 d 1
data
df <- structure(list(A = c("a", "a", "a", "b", "b", "c", "c", "c",
"c", "d", "d", "d"), B = c("TCRB", "TCRG", "TCRB", "TCRB", "TCRG",
"TCRB", "TCRB", "TCRB", "TCRB", "TCRG", "TCRG", "TCRG")),
class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"))
In base R, we can use aggregate:
aggregate(B ~ A, df, function(x) {
  if(all(c('TCRB', 'TCRG') %in% x)) 2
  else if(any(x == 'TCRG')) 1
  else if(any(x == 'TCRB')) 0
  else NA
})
# A B
#1 a 2
#2 b 2
#3 c 0
#4 d 1

Making explicit implicit missing values in nested levels [duplicate]

This question already has an answer here:
How to complete missing factor levels in data frame?
(1 answer)
Closed 3 years ago.
I am trying to complete my dataframe with missing levels.
Current output
id foo bar val
1 a x 7
2 a y 9
3 a z 6
4 b x 10
5 b y 4
6 b z 5
7 c y 2
Data
structure(list(id = c("1", "2", "3", "4", "5", "6", "7"), foo = c("a",
"a", "a", "b", "b", "b", "c"), bar = c("x", "y", "z", "x", "y",
"z", "y"), val = c("7", "9", "6", "10", "4", "5", "2")), .Names = c("id",
"foo", "bar", "val"), row.names = c(NA, -7L), class = "data.frame")
I would like to make the missing nested levels of c explicit, with 0s for x and z. I could find a workaround with expand.grid but could not manage to obtain the desired output with tidyr.
Desired output :
id foo bar val
1 a x 7
2 a y 9
3 a z 6
4 b x 10
5 b y 4
6 b z 5
7 c x 0
8 c y 2
9 c z 0
Thanks in advance!
Given that you are looking for a tidyr solution, you should check out tidyr::complete (which does exactly what you are after):
library(tidyverse)
complete(df, foo, bar, fill = list(val = 0)) %>% select(-id)
#> # A tibble: 9 x 3
#> foo bar val
#> <chr> <chr> <chr>
#> 1 a x 7
#> 2 a y 9
#> 3 a z 6
#> 4 b x 10
#> 5 b y 4
#> 6 b z 5
#> 7 c x 0
#> 8 c y 2
#> 9 c z 0
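Note that val is stored as character in the example data; depending on the tidyr version, filling a character column with a numeric 0 may not coerce cleanly. A variant sketch (my adjustment, not part of the answer) that converts val to numeric first:
library(tidyverse)
df %>%
  mutate(val = as.numeric(val)) %>%
  complete(foo, bar, fill = list(val = 0)) %>%
  select(-id)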
