Selected columns to new row - r

I'm trying to split columns into new rows keeping the data of the first two columns.
d1 <- data.frame(a=c(100,0,78),b=c(0,137,117),c.1=c(111,17,91), d.1=c(99,66,22), c.2=c(11,33,44), d.2=c(000,001,002))
d1
a b c.1 d.1 c.2 d.2
1 100 0 111 99 11 0
2 0 137 17 66 33 1
3 78 117 91 22 44 2
Expected results would be:
a b c d
1 100 0 111 99
2 100 0 11 0
3 0 137 17 66
4 0 137 33 1
5 78 117 91 22
6 78 117 44 2
I made multiple attempts with dplyr, but it seems that is not the right approach.

If you want to stay in dplyr/tidyverse, you want tidyr::pivot_longer with a special reference to .value -- see the pivot vignette for more:
library(tidyverse)
d1 <- data.frame(
a = c(100, 0, 78),
b = c(0, 137, 117),
c.1 = c(111, 17, 91),
d.1 = c(99, 66, 22),
c.2 = c(11, 33, 44),
d.2 = c(000, 001, 002)
)
d1 %>%
pivot_longer(
cols = contains("."),
names_to = c(".value", "group"),
names_sep = "\\."
)
#> # A tibble: 6 x 5
#> a b group c d
#> <dbl> <dbl> <chr> <dbl> <dbl>
#> 1 100 0 1 111 99
#> 2 100 0 2 11 0
#> 3 0 137 1 17 66
#> 4 0 137 2 33 1
#> 5 78 117 1 91 22
#> 6 78 117 2 44 2
Created on 2020-05-11 by the reprex package (v0.3.0)

This could solve your issue:
# Split the wide frame into its two column sets:
# (a, b, c.1, d.1) and (a, b, c.2, d.2)
a1 <- d1[, 1:4]
a2 <- d1[, c(1, 2, 5, 6)]
# Give both halves identical names so rbind() can stack them
names(a1) <- names(a2) <- c("a", "b", "c", "d")
DF <- rbind(a1, a2)
# rbind() places every row of a2 after all rows of a1. Reorder so the two
# rows that came from the same source row are adjacent, which is the order
# shown in the expected result, then reset the row names to 1..n.
DF <- DF[order(rep(seq_len(nrow(d1)), times = 2)), ]
rownames(DF) <- NULL

The posted answers are good, here's my attempt:
df <- data.frame(a=c(100,0,78),b=c(0,137,117),
c.1=c(111,17,91), d.1=c(99,66,22),
c.2=c(11,33,44), d.2=c(000,001,002))
# Pivot the c.* pair and the d.* pair to long format separately,
# discarding the helper "name" column produced by each pivot
df_c <- df %>%
  select(-c(d.1, d.2)) %>%
  pivot_longer(cols = c("c.1", "c.2"), values_to = "c") %>%
  select(-name)

df_d <- df %>%
  select(-c(c.1, c.2)) %>%
  pivot_longer(cols = c("d.1", "d.2"), values_to = "d") %>%
  select(-name)

# Attach d column-wise; only the "d" column is taken from df_d so the
# a/b key columns are not duplicated. This relies on both pivots
# producing their rows in the same order.
bind_cols(df_c, df_d["d"])
Which produces
# A tibble: 6 x 4
a b c d
<dbl> <dbl> <dbl> <dbl>
1 100 0 111 99
2 100 0 11 0
3 0 137 17 66
4 0 137 33 1
5 78 117 91 22
6 78 117 44 2

Related

Is there a better R way to expand a dataframe by a function on rows?

Question:
Below works, but is there a better "R way" of achieving similar result? I am essentially trying to create / distribute groups into individual line items according to a user defined function (currently just using a loop).
Example:
df1 <- data.frame(group = c("A", "B", "C"),
volume = c(200L, 45L, 104L)
)
print(df1)
#> group volume
#> 1 A 200
#> 2 B 45
#> 3 C 104
I want the volume to be broken across multiple rows according to group so that the final result is a dataframe where the new volume (vol2 in the below) would add up to original volume above. In this example, I'm applying integer math with a divisor of 52, so my final result should be:
print(df3)
#> group vol2
#> 1 A 52
#> 2 A 52
#> 3 A 52
#> 4 A 44
#> 21 B 45
#> 31 C 52
#> 32 C 52
This works
The code below DOES get me to the desired result shown above:
div <- 52L
df1$intgr <- df1$volume %/% div
df1$remainder <- df1$volume %% div
print(df1)
#> group volume intgr remainder
#> 1 A 200 3 44
#> 2 B 45 0 45
#> 3 C 104 2 0
# Expand every row of df1 into one output row per full `div`-sized chunk
# (vol2 = div), plus one extra row carrying a non-zero remainder.
df2 <- data.frame()
for (r in seq_len(nrow(df1))) {
  # seq_len() is empty when intgr is 0, so no explicit `> 0` guard is
  # needed (1:0 would have iterated over c(1, 0)).
  for (k in seq_len(as.integer(df1[r, "intgr"]))) {
    df1[r, "vol2"] <- div
    df2 <- rbind(df2, df1[r, ])
  }
  if (df1[r, "remainder"] > 0) {
    df1[r, "vol2"] <- as.integer(df1[r, "remainder"])
    df2 <- rbind(df2, df1[r, ])
  }
}
print(df2)
#> group volume intgr remainder vol2
#> 1 A 200 3 44 52
#> 2 A 200 3 44 52
#> 3 A 200 3 44 52
#> 4 A 200 3 44 44
#> 21 B 45 0 45 45
#> 31 C 104 2 0 52
#> 32 C 104 2 0 52
df3 <- subset(df2, select = c("group", "vol2"))
print(df3)
#> group vol2
#> 1 A 52
#> 2 A 52
#> 3 A 52
#> 4 A 44
#> 21 B 45
#> 31 C 52
#> 32 C 52
Being still relatively new to R, I'm just curious if someone knows a better way / function / method that gets to the same place. Seems like there might be. I could potentially have a more complex way of breaking up the rows and I was thinking maybe there's a method that applies a UDF to the dataframe to do something like this. I was searching for "expand group/groups" but was finding mostly "expand.grid" which isn't what I'm doing here.
Thank you for any suggestions!
A quick function to help split each number by the modulus,
# Split a single number `num` into pieces of size `mod`: as many full
# `mod`-sized pieces as fit, followed by the leftover when there is one.
# The pieces always sum to `num`; an exact multiple of `mod` gets no
# trailing piece, and `num = 0` yields a zero-length vector.
fun <- function(num, mod) {
  leftover <- num %% mod
  # `if` without `else` yields NULL, which c() silently drops
  c(rep(mod, num %/% mod), if (leftover > 0) leftover)
}
fun(200, 52)
# [1] 52 52 52 44
fun(45, 52)
# [1] 45
fun(104, 52)
# [1] 52 52
And we can apply this a number of ways:
dplyr
library(dplyr)
df1 %>%
group_by(group) %>%
summarize(vol2 = fun(volume, 52), .groups = "drop")
# # A tibble: 7 x 2
# group vol2
# <chr> <dbl>
# 1 A 52
# 2 A 52
# 3 A 52
# 4 A 44
# 5 B 45
# 6 C 52
# 7 C 52
base R
do.call(rbind, by(df1, seq(nrow(df1)),
FUN = function(z) data.frame(group = z$group, vol2 = fun(z$volume, 52))))
data.table
library(data.table)
setDT(df1)
df1[, .(vol2 = fun(volume, 52)), by = group]
A tidyverse approach using purrr::pmap and tidyr::unnest_longer may look like so:
# Spell out warn.conflicts instead of relying on partial argument matching
library(dplyr, warn.conflicts = FALSE)
library(tidyr)
library(purrr)

div <- 52

df1 |>
  mutate(
    intgr = volume %/% div,
    remainder = volume %% div,
    # 1 when a remainder row is needed, 0 otherwise
    intgr1 = +(remainder > 0)
  ) |>
  mutate(
    # One list element per row: `intgr` copies of div, followed by the
    # remainder repeated `intgr1` (0 or 1) times
    vol2 = purrr::pmap(
      list(intgr, intgr1, remainder),
      \(n_full, n_rest, rest) c(rep(div, n_full), rep(rest, n_rest))
    )
  ) |>
  tidyr::unnest_longer(vol2) |>
  select(-intgr1)
#> # A tibble: 7 × 5
#> group volume intgr remainder vol2
#> <chr> <int> <dbl> <dbl> <dbl>
#> 1 A 200 3 44 52
#> 2 A 200 3 44 52
#> 3 A 200 3 44 52
#> 4 A 200 3 44 44
#> 5 B 45 0 45 45
#> 6 C 104 2 0 52
#> 7 C 104 2 0 52
With data.table and rep:
library(data.table)
setDT(df1)[, .(vol2 = c(rep(52, volume%/%52), (volume%%52)[sign(volume%%52)])), group]
#> group vol2
#> 1: A 52
#> 2: A 52
#> 3: A 52
#> 4: A 44
#> 5: B 45
#> 6: C 52
#> 7: C 52
Or
setDT(df1)[, .(vol2 = c(rep(52, volume%/%52), volume%%52)), group][vol2 != 0]
#> group vol2
#> 1: A 52
#> 2: A 52
#> 3: A 52
#> 4: A 44
#> 5: B 45
#> 6: C 52
#> 7: C 52
Vectorised and without grouping:
df1 <- data.frame(group = c("A", "B", "C"),
volume = c(200L, 45L, 104L))
n <- 52
idx <- df1$volume %/% n + ((sel <- df1$volume %% n) != 0)
out <- df1[rep(seq_len(nrow(df1)), idx),]
out$volume <- n
out$volume[cumsum(idx)[sel != 0]] <- sel[sel != 0]
## group volume
##1 A 52
##1.1 A 52
##1.2 A 52
##1.3 A 44
##2 B 45
##3 C 52
##3.1 C 52
Another base R solution using aggregate :
aggregate(
  . ~ group, df1,
  # Full 52-sized chunks, then the leftover only when it is non-zero, so an
  # exact multiple of 52 (e.g. 104) does not get an extra chunk
  \(x) c(rep(52, x %/% 52), if (x %% 52 > 0) x %% 52)
)
#>   group         volume
#> 1     A 52, 52, 52, 44
#> 2     B             45
#> 3     C         52, 52
This results in a list column for volume (could be useful)
To transform it to a long dataframe we can either use stack:
with(
  # Full 52-sized chunks plus the leftover only when it is non-zero
  aggregate(. ~ group, df1, \(x) c(rep(52, x %/% 52), if (x %% 52 > 0) x %% 52)),
  # stack() returns (values, ind); swap the columns and restore df1's names
  setNames(stack(setNames(volume, group))[2:1], names(df1))
)
#>   group volume
#> 1     A     52
#> 2     A     52
#> 3     A     52
#> 4     A     44
#> 5     B     45
#> 6     C     52
#> 7     C     52
Or alternatively use unnest from tidyr
library(tidyr)
# Full 52-sized chunks plus the leftover only when it is non-zero, then
# expand the resulting list column into one row per chunk
aggregate(. ~ group, df1, \(x) c(rep(52, x %/% 52), if (x %% 52 > 0) x %% 52)) %>%
  unnest(volume)
#> # A tibble: 7 × 2
#>   group volume
#>   <chr>  <dbl>
#> 1 A         52
#> 2 A         52
#> 3 A         52
#> 4 A         44
#> 5 B         45
#> 6 C         52
#> 7 C         52

How to transform column variables at various time to long format?

I have the following dataset df.
For each id, a1-a3 are the values of variable a recorded at time points 1-3. b1-b3 are the values of variable b recorded at time 1-3. c is a time-invariant variable.
Here is the code to create the dataset:
id <- c(1, 2, 3)
a1 <- c(52, 339, 83)
a2 <- c(86, 746, 35)
a3 <- c(46, 546, 45)
b1 <- c(84, 45, 83)
b2 <- c(55, 46, 35)
b3 <- c(46, 60, 45)
c <- c(30, 20, 50)
df <- cbind(id, a1, a2, a3, b1, b2, b3, c)
Here is original dataset df
id a1 a2 a3 b1 b2 b3 c
[1,] 1 52 86 46 84 55 46 30
[2,] 2 339 746 546 45 46 60 20
[3,] 3 83 35 45 83 35 45 50
I want to change it to the long format, i.e., into the following df2
time id a b c
[1,] 1 1 52 84 30
[2,] 2 1 86 55 30
[3,] 3 1 46 46 30
[4,] 1 2 339 45 20
[5,] 2 2 746 46 20
[6,] 3 2 546 60 20
[7,] 1 3 83 83 50
[8,] 2 3 35 35 50
[9,] 3 3 45 45 50
What is the best way to do that?
I tried pivot_longer Function (tidyr Package), but it does not return what I need.
Thank you very much for the help!
Here's a way to do it with a more advanced use of pivot_longer. A little harder to learn, but much less code:
df %>%
as.data.frame %>%
pivot_longer(-c(id, c), names_to = c('.value', 'time'), names_pattern = '(.)(.)') %>%
relocate(c, .after = b)
id time a b c
<dbl> <chr> <dbl> <dbl> <dbl>
1 1 1 52 84 30
2 1 2 86 55 30
3 1 3 46 46 30
4 2 1 339 45 20
5 2 2 746 46 20
6 2 3 546 60 20
7 3 1 83 83 50
8 3 2 35 35 50
9 3 3 45 45 50
Or, if you wanted to be a little more explicit about how the "time" and "c" columns are treated:
df %>%
as.data.frame %>%
pivot_longer(-id, names_to = c('.value', 'time'), names_pattern = '(.)(.*)') %>%
group_by(id) %>%
mutate(
time = as.numeric(time),
c = c[!is.na(c)]
) %>%
filter(!is.na(time))
id time a b c
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 52 84 30
2 1 2 86 55 30
3 1 3 46 46 30
4 2 1 339 45 20
5 2 2 746 46 20
6 2 3 546 60 20
7 3 1 83 83 50
8 3 2 35 35 50
9 3 3 45 45 50
Here is a way. After reshaping to long format, remove the digits in the name column, create a complementary id column, n, and reshape back to wide format.
id <- c(1, 2, 3)
a1 <- c(52, 339, 83)
a2 <- c(86, 746, 35)
a3 <- c(46, 546, 45)
b1 <- c(84, 45, 83)
b2 <- c(55, 46, 35)
b3 <- c(46, 60, 45)
c <- c(30, 20, 50)
df <- cbind(id, a1, a2, a3, b1, b2, b3, c)
df
#> id a1 a2 a3 b1 b2 b3 c
#> [1,] 1 52 86 46 84 55 46 30
#> [2,] 2 339 746 546 45 46 60 20
#> [3,] 3 83 35 45 83 35 45 50
suppressPackageStartupMessages({
library(dplyr)
library(tidyr)
})
df %>%
as.data.frame() %>%
pivot_longer(-id) %>%
mutate(name = gsub("\\d+", "", name)) %>%
group_by(id, name) %>%
mutate(n = row_number()) %>%
ungroup() %>%
pivot_wider(id_cols = c(id, n)) %>%
select(-n) %>%
mutate(c = zoo::na.locf(c))
#> # A tibble: 9 × 4
#> id a b c
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 52 84 30
#> 2 1 86 55 30
#> 3 1 46 46 30
#> 4 2 339 45 20
#> 5 2 746 46 20
#> 6 2 546 60 20
#> 7 3 83 83 50
#> 8 3 35 35 50
#> 9 3 45 45 50
Created on 2022-10-23 with reprex v2.0.2

Transpose specific columns in a dataframe

How can I transpose specific columns in a data.frame as:
id<- c(1,2,3)
t0<- c(0,0,0)
bp0<- c(88,95,79)
t1<- c(15,12,12)
bp1<- c(92,110,82)
t2<- c(25,30,20)
bp2<- c(75,99,88)
df1<- data.frame(id, t0, bp0, t1, bp1, t2, bp2)
df1
> df1
id t0 bp0 t1 bp1 t2 bp2
1 1 0 88 15 92 25 75
2 2 0 95 12 110 30 99
3 3 0 79 12 82 20 88
In order to obtain:
> df2
id t bp
1 1 0 88
2 2 0 95
3 3 0 79
4 1 15 92
5 2 12 110
6 3 12 82
7 1 25 75
8 2 30 99
9 3 20 88
That is, df2 should stack t (t0, t1, t2) and bp (bp0, bp1, bp2) into single columns, each value kept with its corresponding "id".
Using Base R, you can do:
Reprex
Code
df2 <- cbind(df1[1], stack(df1, startsWith(names(df1), "t"))[1], stack(df1,startsWith(names(df1), "bp"))[1])
names(df2)[2:3] <- c("t", "bp")
Output
df2
#> id t bp
#> 1 1 0 88
#> 2 2 0 95
#> 3 3 0 79
#> 4 1 15 92
#> 5 2 12 110
#> 6 3 12 82
#> 7 1 25 75
#> 8 2 30 99
#> 9 3 20 88
Created on 2022-02-14 by the reprex package (v2.0.1)
Here is a solution with pivot_longer using names_pattern:
\\w+ = one or more word characters (letters, digits or underscore)
\\d+ = one or more digits
library(dplyr)
library(tidyr)
df1 %>%
pivot_longer (
-id,
names_to = c(".value", "name"),
names_pattern = "(\\w+)(\\d+)"
) %>%
select(-name)
id t bp
<dbl> <dbl> <dbl>
1 1 0 88
2 1 15 92
3 1 25 75
4 2 0 95
5 2 12 110
6 2 30 99
7 3 0 79
8 3 12 82
9 3 20 88
A base R option using reshape
reshape(
setNames(df1, sub("(\\d+)", ".\\1", names(df1))),
direction = "long",
idvar = "id",
varying = -1
)
gives
id time t bp
1.0 1 0 0 88
2.0 2 0 0 95
3.0 3 0 0 79
1.1 1 1 15 92
2.1 2 1 12 110
3.1 3 1 12 82
1.2 1 2 25 75
2.2 2 2 30 99
3.2 3 2 20 88

Choosing the right column based on a vector of column names

I'm trying to pull values from columns based on the values in a vector. I'm not sure I have the right words to describe the problem, but the code should help.
This feels related to coalesce maybe not?
library(tidyverse)
# Starting table
dat <-
tibble(
A = 1:10,
B = 31:40,
C = 101:110,
value = c("A", "C", "B", "A", "B", "C", "C", "B", "A", "A")
)
I want:
dat %>%
mutate(
output = c(1, 102, 33, 4, 35, 106, 107, 38, 9, 10)
)
I could do
dat %>%
mutate(
output =
case_when(value == "A" ~ A,
value == "B" ~ B,
value == "C" ~ C)
)
but my real application has many values and I want to take advantage of value having the matching info
Is there a function that does:
dat %>%
mutate(output = grab_the_right_column(value))
Thanks!
The rowwise approach would be less efficient, but it is compact within the tidyverse approaches to get the column value based on the column name for each row.
library(dplyr)
dat %>%
rowwise %>%
mutate(output = get(value)) %>%
ungroup
-output
# A tibble: 10 x 5
# A B C value output
# <int> <int> <int> <chr> <int>
# 1 1 31 101 A 1
# 2 2 32 102 C 102
# 3 3 33 103 B 33
# 4 4 34 104 A 4
# 5 5 35 105 B 35
# 6 6 36 106 C 106
# 7 7 37 107 C 107
# 8 8 38 108 B 38
# 9 9 39 109 A 9
#10 10 40 110 A 10
These types of problems are handled more efficiently with a row/column indexing approach from base R. Build a two-column matrix holding the row sequence and, for each row, the index of the column whose name matches the 'value' column; indexing with that matrix extracts one element per row.
dat$output <- as.data.frame(dat)[,1:3][cbind(seq_len(nrow(dat)), match(dat$value, names(dat)[1:3]))]
You can also use purrr and pmap():
library(dplyr)
library(purrr)
dat%>%mutate(output=
pmap(., ~{
v1<-c(...)
v1[names(v1)==v1[['value']]]
}
)%>%
as.numeric()%>%
unlist)
# A tibble: 10 x 5
A B C value output
<int> <int> <int> <chr> <dbl>
1 1 31 101 A 1
2 2 32 102 C 102
3 3 33 103 B 33
4 4 34 104 A 4
5 5 35 105 B 35
6 6 36 106 C 106
7 7 37 107 C 107
8 8 38 108 B 38
9 9 39 109 A 9
10 10 40 110 A 10

Make column with "sample" for each row with purrr

I'm trying to make a column with a sampled value for each row of data,
but I'm new to purrr and can't get this to work.
My code
df<-data.frame(x=rep(1:3,each=4),y=99)
df%>%
group_by(x)%>%
mutate_(val=~purrr::map_dbl(function(x) sample(50,1)))
This didn't work.
But the same function works with purrr on its own:
1:5%>%purrr::map_dbl(function(x) sample(50,1))
[1] 39 30 7 18 45
Thanks for any help!
You don't need purrr:
df <- data.frame(x = rep(1:3, each = 4), y = 99)
df %>%
group_by(x) %>%
mutate(val = sample(50, n()))
Output
# A tibble: 12 x 3
# Groups: x [3]
x y val
<int> <dbl> <int>
1 1 99.0 10
2 1 99.0 25
3 1 99.0 2
4 1 99.0 24
5 2 99.0 48
6 2 99.0 19
7 2 99.0 34
8 2 99.0 33
9 3 99.0 24
10 3 99.0 14
11 3 99.0 37
12 3 99.0 12
If you need to use purrr, I guess you could do:
# map_int() returns a plain integer vector, one draw per element of x;
# map() would have made `val` a list column instead
dplyr::mutate(df, val = purrr::map_int(x, \(i) sample(50, 1)))
x y val
1 1 99 35
2 1 99 4
3 1 99 43
4 1 99 28
5 2 99 49
6 2 99 31
7 2 99 31
8 2 99 31
9 3 99 19
10 3 99 4
11 3 99 43
12 3 99 20
Or with the pipe:
library(dplyr)
library(purrr)

# One independent draw per row; map_int() keeps `val` an atomic integer
# column rather than the list column that map() would produce
df %>%
  mutate(val = map_int(x, \(i) sample(50, 1)))
Data:
df <- data.frame(x = rep(1:3, each = 4), y = 99)

Resources