I'm trying to split columns into new rows keeping the data of the first two columns.
d1 <- data.frame(a=c(100,0,78),b=c(0,137,117),c.1=c(111,17,91), d.1=c(99,66,22), c.2=c(11,33,44), d.2=c(000,001,002))
d1
a b c.1 d.1 c.2 d.2
1 100 0 111 99 11 0
2 0 137 17 66 33 1
3 78 117 91 22 44 2
Expected results would be:
a b c d
1 100 0 111 99
2 100 0 11 0
3 0 137 17 66
4 0 137 33 1
5 78 117 91 22
6 78 117 44 2
Multiple tries with dplyr, but in sees is not the right approach.
If you want to stay in dplyr/tidyverse, you want tidyr::pivot_longer with a special reference to .value -- see the pivot vignette for more:
library(tidyverse)
d1 <- data.frame(
a = c(100, 0, 78),
b = c(0, 137, 117),
c.1 = c(111, 17, 91),
d.1 = c(99, 66, 22),
c.2 = c(11, 33, 44),
d.2 = c(000, 001, 002)
)
d1 %>%
pivot_longer(
cols = contains("."),
names_to = c(".value", "group"),
names_sep = "\\."
)
#> # A tibble: 6 x 5
#> a b group c d
#> <dbl> <dbl> <chr> <dbl> <dbl>
#> 1 100 0 1 111 99
#> 2 100 0 2 11 0
#> 3 0 137 1 17 66
#> 4 0 137 2 33 1
#> 5 78 117 1 91 22
#> 6 78 117 2 44 2
Created on 2020-05-11 by the reprex package (v0.3.0)
This could solve your issue:
#Try this
a1 <- d1[,c(1:4)]
a2 <- d1[,c(1,2,5,6)]
names(a1) <- names(a2) <- c('a','b','c','d')
DF <- rbind(a1,a2)
The posted answers are good, here's my attempt:
df <- data.frame(a=c(100,0,78),b=c(0,137,117),
c.1=c(111,17,91), d.1=c(99,66,22),
c.2=c(11,33,44), d.2=c(000,001,002))
# Make 2 pivot long operations
df_c <- df %>% select(-d.1, -d.2) %>%
pivot_longer(cols = c("c.1", "c.2"), values_to = "c") %>% select(-name)
df_d <- df %>% select(-c.1, -c.2) %>%
pivot_longer(cols=c("d.1","d.2"), values_to = "d") %>% select(-name)
# bind them without the "key" colums
bind_cols(df_c, select(df_d, -a, -b))
Which produces
# A tibble: 6 x 4
a b c d
<dbl> <dbl> <dbl> <dbl>
1 100 0 111 99
2 100 0 11 0
3 0 137 17 66
4 0 137 33 1
5 78 117 91 22
6 78 117 44 2
Related
Question:
Below works, but is there a better "R way" of achieving similar result? I am essentially trying to create / distribute groups into individual line items according to a user defined function (currently just using a loop).
Example:
df1 <- data.frame(group = c("A", "B", "C"),
volume = c(200L, 45L, 104L)
)
print(df1)
#> group volume
#> 1 A 200
#> 2 B 45
#> 3 C 104
I want the volume to be broken across multiple rows according to group so that the final result is a dataframe where the new volume (vol2 in the below) would add up to original volume above. In this example, I'm applying integer math with a divisor of 52, so my final result should be:
print(df3)
#> group vol2
#> 1 A 52
#> 2 A 52
#> 3 A 52
#> 4 A 44
#> 21 B 45
#> 31 C 52
#> 32 C 52
This works
The code below DOES get me to the desired result shown above:
div <- 52L
df1$intgr <- df1$volume %/% div
df1$remainder <- df1$volume %% div
print(df1)
#> group volume intgr remainder
#> 1 A 200 3 44
#> 2 B 45 0 45
#> 3 C 104 2 0
df2 <- data.frame()
for (r in 1:nrow(df1)){
if(df1[r,"intgr"] > 0){
for (k in 1:as.integer(df1[r,"intgr"])){
df1[r,"vol2"] <- div
df2 <- rbind(df2, df1[r,])
}
}
if(df1[r,"remainder"]>0){
df1[r, "vol2"] <- as.integer(df1[r, "remainder"])
df2 <- rbind(df2, df1[r,])
}
}
print(df2)
#> group volume intgr remainder vol2
#> 1 A 200 3 44 52
#> 2 A 200 3 44 52
#> 3 A 200 3 44 52
#> 4 A 200 3 44 44
#> 21 B 45 0 45 45
#> 31 C 104 2 0 52
#> 32 C 104 2 0 52
df3 <- subset(df2, select = c("group", "vol2"))
print(df3)
#> group vol2
#> 1 A 52
#> 2 A 52
#> 3 A 52
#> 4 A 44
#> 21 B 45
#> 31 C 52
#> 32 C 52
Being still relatively new to R, I'm just curious if someone knows a better way / function / method that gets to the same place. Seems like there might be. I could potentially have a more complex way of breaking up the rows and I was thinking maybe there's a method that applies a UDF to the dataframe to do something like this. I was searching for "expand group/groups" but was finding mostly "expand.grid" which isn't what I'm doing here.
Thank you for any suggestions!
A quick function to help split each number by the modulus,
fun <- function(num, mod) c(rep(mod, floor(num / mod)), (num-1) %% mod + 1)
fun(200, 52)
# [1] 52 52 52 44
fun(45, 52)
# [1] 45
fun(104, 52)
# [1] 52 52
And we can apply this a number of ways:
dplyr
library(dplyr)
df1 %>%
group_by(group) %>%
summarize(vol2 = fun(volume, 52), .groups = "drop")
# # A tibble: 7 x 2
# group vol2
# <chr> <dbl>
# 1 A 52
# 2 A 52
# 3 A 52
# 4 A 44
# 5 B 45
# 6 C 52
# 7 C 52
base R
do.call(rbind, by(df1, seq(nrow(df1)),
FUN = function(z) data.frame(group = z$group, vol2 = fun(z$volume, 52))))
data.table
library(data.table)
setDT(df1)
df1[, .(vol2 = fun(volume, 52)), by = group]
A tidyverse approach using purrr::pmap and tidyr::unnest_longer may look like so:
library(dplyr, w = FALSE)
library(tidyr)
library(purrr)
div <- 52
df1 |>
mutate(intgr = volume %/% div, remainder = volume %% div, intgr1 = +(remainder > 0)) |>
mutate(vol2 = purrr::pmap(list(intgr, intgr1, remainder), ~ c(rep(div, ..1), rep(..3, ..2)))) |>
tidyr::unnest_longer(vol2) |>
select(-intgr1)
#> # A tibble: 7 × 5
#> group volume intgr remainder vol2
#> <chr> <int> <dbl> <dbl> <dbl>
#> 1 A 200 3 44 52
#> 2 A 200 3 44 52
#> 3 A 200 3 44 52
#> 4 A 200 3 44 44
#> 5 B 45 0 45 45
#> 6 C 104 2 0 52
#> 7 C 104 2 0 52
With data.table and rep:
library(data.table)
setDT(df1)[, .(vol2 = c(rep(52, volume%/%52), (volume%%52)[sign(volume%%52)])), group]
#> group vol2
#> 1: A 52
#> 2: A 52
#> 3: A 52
#> 4: A 44
#> 5: B 45
#> 6: C 52
#> 7: C 52
Or
setDT(df1)[, .(vol2 = c(rep(52, volume%/%52), volume%%52)), group][vol2 != 0]
#> group vol2
#> 1: A 52
#> 2: A 52
#> 3: A 52
#> 4: A 44
#> 5: B 45
#> 6: C 52
#> 7: C 52
Vectorised and without grouping:
df1 <- data.frame(group = c("A", "B", "C"),
volume = c(200L, 45L, 104L))
n <- 52
idx <- df1$volume %/% n + ((sel <- df1$volume %% n) != 0)
out <- df1[rep(seq_len(nrow(df1)), idx),]
out$volume <- n
out$volume[cumsum(idx)[sel != 0]] <- sel[sel != 0]
## group volume
##1 A 52
##1.1 A 52
##1.2 A 52
##1.3 A 44
##2 B 45
##3 C 52
##3.1 C 52
Another base R solution using aggregate :
aggregate(.~group,df1,\(x) c(rep(52, x / 52), (x-1) %% 52 + 1))
group volume
1 A 52, 52, 52, 44
2 B 45
3 C 52, 52, 52
This results in a list column for volume (could be useful)
To transform it to a long dataframe we can either use stack:
with(
aggregate(.~group,df1,\(x) c(rep(52, x / 52), (x-1) %% 52 + 1)),
setNames(stack(setNames(volume,group))[2:1],names(df1))
)
group volume
1 A 52
2 A 52
3 A 52
4 A 44
5 B 45
6 C 52
7 C 52
8 C 52
Or alternatively use unnest from tidyr
library(tidyr)
aggregate(.~group,df1,\(x) c(rep(52, x / 52), (x-1) %% 52 + 1)) %>% unnest(volume)
# A tibble: 8 × 2
group volume
<chr> <dbl>
1 A 52
2 A 52
3 A 52
4 A 44
5 B 45
6 C 52
7 C 52
8 C 52
I have the following dataset df.
For each id, a1-a3 are the values of variable a recorded at time points 1-3. b1-b3 are the values of variable b recorded at time 1-3. c is a time-invariant variable.
Here is the codes to create the dataset:
id <- c(1, 2, 3)
a1 <- c(52, 339, 83)
a2 <- c(86, 746, 35)
a3 <- c(46, 546, 45)
b1 <- c(84, 45, 83)
b2 <- c(55, 46, 35)
b3 <- c(46, 60, 45)
c <- c(30, 20, 50)
df <- cbind(id, a1, a2, a3, b1, b2, b3, c)
Here is original dataset df
id a1 a2 a3 b1 b2 b3 c
[1,] 1 52 86 46 84 55 46 30
[2,] 2 339 746 546 45 46 60 20
[3,] 3 83 35 45 83 35 45 50
I want to change it to the long format, i.e., into the following df2
time id a b c
[1,] 1 1 52 84 30
[2,] 2 1 86 55 30
[3,] 3 1 46 46 30
[4,] 1 2 339 45 20
[5,] 2 2 746 46 20
[6,] 3 2 546 60 20
[7,] 1 3 83 83 50
[8,] 2 3 35 35 50
[9,] 3 3 45 45 50
What is the best way to do that?
I tried pivot_longer Function (tidyr Package), but it does not return what I need.
Thank you very much for the help!
Here's a way to do it with a more advanced use of pivot_longer. A little harder to learn, but much less code:
df %>%
as.data.frame %>%
pivot_longer(-c(id, c), names_to = c('.value', 'time'), names_pattern = '(.)(.)') %>%
relocate(c, .after = b)
id time a b c
<dbl> <chr> <dbl> <dbl> <dbl>
1 1 1 52 84 30
2 1 2 86 55 30
3 1 3 46 46 30
4 2 1 339 45 20
5 2 2 746 46 20
6 2 3 546 60 20
7 3 1 83 83 50
8 3 2 35 35 50
9 3 3 45 45 50
Or, if you wanted to be a little more explicit about how the "time" and "c" columns are treated:
df %>%
as.data.frame %>%
pivot_longer(-id, names_to = c('.value', 'time'), names_pattern = '(.)(.*)') %>%
group_by(id) %>%
mutate(
time = as.numeric(time),
c = c[!is.na(c)]
) %>%
filter(!is.na(time))
id time a b c
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 52 84 30
2 1 2 86 55 30
3 1 3 46 46 30
4 2 1 339 45 20
5 2 2 746 46 20
6 2 3 546 60 20
7 3 1 83 83 50
8 3 2 35 35 50
9 3 3 45 45 50
Here is a way. After reshaping to long format, remove the digits in the name column, create a complementary id column, n, and reshape back to wide format.
id <- c(1, 2, 3)
a1 <- c(52, 339, 83)
a2 <- c(86, 746, 35)
a3 <- c(46, 546, 45)
b1 <- c(84, 45, 83)
b2 <- c(55, 46, 35)
b3 <- c(46, 60, 45)
c <- c(30, 20, 50)
df <- cbind(id, a1, a2, a3, b1, b2, b3, c)
df
#> id a1 a2 a3 b1 b2 b3 c
#> [1,] 1 52 86 46 84 55 46 30
#> [2,] 2 339 746 546 45 46 60 20
#> [3,] 3 83 35 45 83 35 45 50
suppressPackageStartupMessages({
library(dplyr)
library(tidyr)
})
df %>%
as.data.frame() %>%
pivot_longer(-id) %>%
mutate(name = gsub("\\d+", "", name)) %>%
group_by(id, name) %>%
mutate(n = row_number()) %>%
ungroup() %>%
pivot_wider(id_cols = c(id, n)) %>%
select(-n) %>%
mutate(c = zoo::na.locf(c))
#> # A tibble: 9 × 4
#> id a b c
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 52 84 30
#> 2 1 86 55 30
#> 3 1 46 46 30
#> 4 2 339 45 20
#> 5 2 746 46 20
#> 6 2 546 60 20
#> 7 3 83 83 50
#> 8 3 35 35 50
#> 9 3 45 45 50
Created on 2022-10-23 with reprex v2.0.2
How can I transpose specific columns in a data.frame as:
id<- c(1,2,3)
t0<- c(0,0,0)
bp0<- c(88,95,79)
t1<- c(15,12,12)
bp1<- c(92,110,82)
t2<- c(25,30,20)
bp2<- c(75,99,88)
df1<- data.frame(id, t0, bp0, t1, bp1, t2, bp2)
df1
> df1
id t0 bp0 t1 bp1 t2 bp2
1 1 0 88 15 92 25 75
2 2 0 95 12 110 30 99
3 3 0 79 12 82 20 88
In order to obtain:
> df2
id t bp
1 1 0 88
2 2 0 95
3 3 0 79
4 1 15 92
5 2 12 110
6 3 12 82
7 1 25 75
8 2 30 99
9 3 20 88
In order to obtain df2, with represent t(t0,t1,t2) and bp(bp0,bp1,bp2) for the corresponding "id"
Using Base R, you can do:
Reprex
Code
df2 <- cbind(df1[1], stack(df1, startsWith(names(df1), "t"))[1], stack(df1,startsWith(names(df1), "bp"))[1])
names(df2)[2:3] <- c("t", "bp")
Output
df2
#> id t bp
#> 1 1 0 88
#> 2 2 0 95
#> 3 3 0 79
#> 4 1 15 92
#> 5 2 12 110
#> 6 3 12 82
#> 7 1 25 75
#> 8 2 30 99
#> 9 3 20 88
Created on 2022-02-14 by the reprex package (v2.0.1)
Here is solution with pivot_longer using name_pattern:
\\w+ = one or more alphabetic charachters
\\d+ = one or more digits
library(dplyr)
library(tidyr)
df1 %>%
pivot_longer (
-id,
names_to = c(".value", "name"),
names_pattern = "(\\w+)(\\d+)"
) %>%
select(-name)
id t bp
<dbl> <dbl> <dbl>
1 1 0 88
2 1 15 92
3 1 25 75
4 2 0 95
5 2 12 110
6 2 30 99
7 3 0 79
8 3 12 82
9 3 20 88
A base R option using reshape
reshape(
setNames(df1, sub("(\\d+)", ".\\1", names(df1))),
direction = "long",
idvar = "id",
varying = -1
)
gives
id time t bp
1.0 1 0 0 88
2.0 2 0 0 95
3.0 3 0 0 79
1.1 1 1 15 92
2.1 2 1 12 110
3.1 3 1 12 82
1.2 1 2 25 75
2.2 2 2 30 99
3.2 3 2 20 88
I'm trying to pull values from columns based on the values in a vector. I'm not sure I have the right words to describe the problem, but the code should help.
This feels related to coalesce maybe not?
library(tidyverse)
# Starting table
dat <-
tibble(
A = 1:10,
B = 31:40,
C = 101:110,
value = c("A", "C", "B", "A", "B", "C", "C", "B", "A", "A")
)
I want:
dat %>%
mutate(
output = c(1, 102, 33, 4, 35, 106, 107, 38, 9, 10)
)
I could do
dat %>%
mutate(
output =
case_when(value == "A" ~ A,
value == "B" ~ B,
value == "C" ~ C)
)
but my real application has many values and I want to take advantage of value having the matching info
Is there a function that does:
dat %>%
mutate(output = grab_the_right_column(value))
Thanks!
The rowwise approach would be less efficient, but it is compact within the tidyverse approaches to get the column value based on the column name for each row.
library(dplyr)
dat %>%
rowwise %>%
mutate(output = get(value)) %>%
ungroup
-output
# A tibble: 10 x 5
# A B C value output
# <int> <int> <int> <chr> <int>
# 1 1 31 101 A 1
# 2 2 32 102 C 102
# 3 3 33 103 B 33
# 4 4 34 104 A 4
# 5 5 35 105 B 35
# 6 6 36 106 C 106
# 7 7 37 107 C 107
# 8 8 38 108 B 38
# 9 9 39 109 A 9
#10 10 40 110 A 10
These type of issues are more efficient with a row/column indexing approach from base R. Create a matrix of row sequence and the matching index of columns with the 'value' column and the column names to extract the element
dat$output <- as.data.frame(dat)[,1:3][cbind(seq_len(nrow(dat)), match(dat$value, names(dat)[1:3]))]
You can also use purrr and pmap():
library(dplyr)
library(purrr)
dat%>%mutate(output=
pmap(., ~{
v1<-c(...)
v1[names(v1)==v1[['value']]]
}
)%>%
as.numeric()%>%
unlist)
# A tibble: 10 x 5
A B C value output
<int> <int> <int> <chr> <dbl>
1 1 31 101 A 1
2 2 32 102 C 102
3 3 33 103 B 33
4 4 34 104 A 4
5 5 35 105 B 35
6 6 36 106 C 106
7 7 37 107 C 107
8 8 38 108 B 38
9 9 39 109 A 9
10 10 40 110 A 10
I'm trying to make column with sample value for each row of data
But I'm new with purrr and can't make this.
My code
df<-data.frame(x=rep(1:3,each=4),y=99)
df%>%
group_by(x)%>%
mutate_(val=~purrr::map_dbl(function(x) sample(50,1)))
This didn't work.
But function with purrr only working:
1:5%>%purrr::map_dbl(function(x) sample(50,1))
[1] 39 30 7 18 45
Thanks for any help!
You don't need purrr:
df <- data.frame(x = rep(1:3, each = 4), y = 99)
df %>%
group_by(x) %>%
mutate(val = sample(50, n()))
Output
# A tibble: 12 x 3
# Groups: x [3]
x y val
<int> <dbl> <int>
1 1 99.0 10
2 1 99.0 25
3 1 99.0 2
4 1 99.0 24
5 2 99.0 48
6 2 99.0 19
7 2 99.0 34
8 2 99.0 33
9 3 99.0 24
10 3 99.0 14
11 3 99.0 37
12 3 99.0 12
If you need to use purrr, I guess you could do:
dplyr::mutate(df, val = purrr::map(x, ~ sample(50, 1)))
x y val
1 1 99 35
2 1 99 4
3 1 99 43
4 1 99 28
5 2 99 49
6 2 99 31
7 2 99 31
8 2 99 31
9 3 99 19
10 3 99 4
11 3 99 43
12 3 99 20
Or with the pipe:
library(dplyr)
library(purrr)
df %>%
mutate(val = map(x, ~ sample(50, 1)))
Data:
df <- data.frame(x = rep(1:3, each = 4), y = 99)