How to use regex for ifelse tidyverse - r

My df is as follows:
monday_A monday_B tuesday_A tuesday_B
1 2 4 100
6 7 8 5
I want to reorder this so it becomes
date Group quantitive
Monday A 1
Monday A 6
Monday B 2
Monday B 7
Tuesday A 4
Tuesday A 8
Tuesday B 100
Tuesday B 5
What i've done
# Stack the four day/group columns into name-value pairs
pivot_longer(
  df,
  cols = monday_A:tuesday_B,
  names_to = "tempGroup",
  values_to = "quantitive"
)
This made it
tempGroup quantitive
monday_A 1
monday_A 6
monday_B 2
monday_B 7
tuesday_A 4
tuesday_A 8
tuesday_B 100
tuesday_B 5
Now how do I separate tempGroup? I think a regex with ifelse could do it by splitting at the underscore.

Use names_sep :
# Split every column name at "_" into a date part and a group part while
# pivoting to long format
tidyr::pivot_longer(
  df,
  cols = everything(),
  names_to = c("date", "tempGroup"),
  names_sep = "_",
  values_to = "quantitative"
)
# A tibble: 8 x 3
# date tempGroup quantitative
# <chr> <chr> <int>
#1 monday A 1
#2 monday B 2
#3 tuesday A 4
#4 tuesday B 100
#5 monday A 6
#6 monday B 7
#7 tuesday A 8
#8 tuesday B 5
data
df <- structure(list(monday_A = c(1L, 6L), monday_B = c(2L, 7L),
tuesday_A = c(4L, 8L), tuesday_B = c(100L, 5L)),
class = "data.frame", row.names = c(NA, -2L))

Base R Solution:
# Transpose so that every original column becomes a row: tpd => data.frame
tpd <- as.data.frame(t(df))
# Rebuild in long form: df_td => data.frame
df_td <- local({
  # Original column names, repeated once per column of tpd so that they line
  # up with the values flattened by unlist()
  wide_names <- rep(rownames(tpd), times = ncol(tpd))
  data.frame(
    day = sub("_.*", "", wide_names),   # text before the underscore
    group = sub(".*_", "", wide_names), # text after the underscore
    quantitative = unlist(tpd),
    row.names = NULL
  )
})
Data
# Create re-usable data: df => data.frame
df <-
structure(
list(
monday_A = c(1L, 6L),
monday_B = c(2L, 7L),
tuesday_A = c(4L,
8L),
tuesday_B = c(100L, 5L)
),
row.names = c(NA,-2L),
class = "data.frame"
)

Related

R function for collapsing multiple ranges of different columns from wide to long format?

I've a dataset with multiple different ranges of columns in each row (each row corresponds to one individual), as below. Each instance of the different column types have 3 levels (0,1 and 2).
id col1_0 col1_1 col1_2 col2_0 col2_1 col2_2 col3_0 col3_1 col3_2
1 0 1 3 2 2 3 3 4 5
2 1 1 2 2 4 7 4 5 5
.
.
etc.
What I would need is to collapse all col1 into one column, all col2 into another and all col3's into another, for each id. As below.
id x col1 col2 col3
1 0 0 2 3
1 1 1 2 4
1 2 3 3 5
2 0 1 2 4
2 1 1 4 5
2 2 2 7 5
.
.
etc.
In addition, I would also need to create an x-column with values 0,1 and 2, for each id. However, I only manage to collapse the first range of columns (col1) with the code below.
library(tidyverse)
# Collapse only the col1_* columns; x1 receives the original column names
longer_data <- pivot_longer(
  group_by(dataframe, id),
  cols = col1_0:col1_2,
  names_to = "x1",
  values_to = "col1"
)
x1 here creates a column with the original column names. So I would need to create an additional x-column that keeps only the trailing number of each original column name.
Is there a way to achieve this? Many thanks in advance!
We don't need any group_by. It can be done directly with pivot_longer by specifying names_sep and using .value in names_to. Note the order of .value and x: the part of each column name before the _ (col1, col2, col3) becomes an output column holding the values, and the suffix after the _ goes into the new column 'x'.
library(dplyr)
library(tidyr)
# ".value" turns the part of each name before "_" into an output column;
# the part after "_" is stored in x
pivot_longer(
  df1,
  cols = -id,
  names_to = c(".value", "x"),
  names_sep = "_"
)
-output
# A tibble: 6 x 5
# id x col1 col2 col3
# <int> <chr> <int> <int> <int>
#1 1 0 0 2 3
#2 1 1 1 2 4
#3 1 2 3 3 5
#4 2 0 1 2 4
#5 2 1 1 4 5
#6 2 2 2 7 5
data
df1 <- structure(list(id = 1:2, col1_0 = 0:1, col1_1 = c(1L, 1L), col1_2 = 3:2,
col2_0 = c(2L, 2L), col2_1 = c(2L, 4L), col2_2 = c(3L, 7L
), col3_0 = 3:4, col3_1 = 4:5, col3_2 = c(5L, 5L)),
class = "data.frame", row.names = c(NA,
-2L))
Here is a base R option using reshape, where timevar="x" creates a column named x, and sep="_" helps to fetch the last numbers of the original column names.
# Wide -> long with base reshape(): every column except the first is split at
# "_" and the suffix is stored in the new column x
res <- reshape(
  data = df,
  varying = -1,
  sep = "_",
  timevar = "x",
  idvar = "id",
  direction = "long"
)
# Sort so that all rows belonging to one id sit together
res <- res[order(res[["id"]]), ]
Output
> res
id x col1 col2 col3
1.0 1 0 0 2 3
1.1 1 1 1 2 4
1.2 1 2 3 3 5
2.0 2 0 1 2 4
2.1 2 1 1 4 5
2.2 2 2 2 7 5
Data
> dput(df)
structure(list(id = 1:2, col1_0 = 0:1, col1_1 = c(1L, 1L), col1_2 = 3:2,
col2_0 = c(2L, 2L), col2_1 = c(2L, 4L), col2_2 = c(3L, 7L
), col3_0 = 3:4, col3_1 = 4:5, col3_2 = c(5L, 5L)), class = "data.frame", row.names = c(NA,
-2L))

Map a function to two data frames of unequal lengths

For each row in df1 I would like to execute mult 10 times, once for each year in df2.
One option I can think of is to repeat df1 multiple times and join it to df2. But my actual data are much larger (~20k sections, 15 areas and 100 years), so I am looking for a more efficient way to do this.
# df1
section area a b c
1 1 1 0.1208916 0.7235306 0.7652636
2 2 1 0.8265642 0.2939602 0.6491496
3 1 2 0.9101611 0.7363248 0.1509295
4 2 2 0.8807047 0.5473221 0.6748055
5 1 3 0.2343558 0.2044689 0.9647333
6 2 3 0.4112479 0.9523639 0.1533197
----------
# df2
year d
1 1 0.7357432
2 2 0.4591575
3 3 0.3654561
4 4 0.1996439
5 5 0.2086226
6 6 0.5628826
7 7 0.4772953
8 8 0.8474007
9 9 0.8861693
10 10 0.6694851
# Element-wise product of the four inputs, multiplied left to right
mult <- function(a, b, c, d) {
  abc <- a * b * c
  abc * d
}
The desired output would look something like this
section area year e
1 1 1 1 results of mult()
2 2 1 1 results of mult()
3 1 2 1 results of mult()
4 2 2 1 results of mult()
5 1 3 1 results of mult()
6 2 3 1 results of mult()
7 1 1 2 results of mult()
8 2 1 2 results of mult()
...
dput(df1)
structure(list(section = c(1L, 2L, 1L, 2L, 1L, 2L), area = c(1L,
1L, 2L, 2L, 3L, 3L), a = c(0.12089157756418, 0.826564211165532,
0.91016107192263, 0.880704707000405, 0.234355789143592, 0.411247851792723
), b = c(0.72353063733317, 0.293960151728243, 0.736324765253812,
0.547322086291388, 0.204468948533759, 0.952363904565573), c = c(0.765263637062162,
0.649149592733011, 0.150929539464414, 0.674805536167696, 0.964733332861215,
0.15331974090077)), out.attrs = list(dim = structure(2:3, .Names = c("section",
"area")), dimnames = list(section = c("section=1", "section=2"
), area = c("area=1", "area=2", "area=3"))), class = "data.frame", row.names = c(NA,
-6L))
dput(df2)
structure(list(year = 1:10, d = c(0.735743158031255, 0.459157506935298,
0.365456136409193, 0.199643932981417, 0.208622586680576, 0.562882597092539,
0.477295308141038, 0.847400720929727, 0.886169332079589, 0.669485098216683
)), class = "data.frame", row.names = c(NA, -10L))
Edit: full sized toy dataset
library(dplyr)
# Full-sized toy data: one row per section/area pair with random a, b and c.
# n() ties the number of draws to the grid size, so changing the ranges in
# expand.grid() cannot leave a stale hard-coded row count behind.
df1 <- expand.grid(section = 1:20000, area = 1:15) %>%
  mutate(
    a = runif(n()),
    b = runif(n()),
    c = runif(n())
  )
# One row per year with a random d; the draw count follows the number of years
df2 <- data.frame(year = 1:100)
df2$d <- runif(nrow(df2))
You can use crossing to create combinations of df1 and df2 and apply mult to them.
# Build the combinations of df1 and df2 rows, then apply mult() to the columns
dplyr::mutate(
  tidyr::crossing(df1, df2),
  e = mult(a, b, c, d)
)

How to use column indices to collect values from columns in R

x y z column_indices
6 7 1 1,2
5 4 2 3
1 3 2 1,3
I have the column indices of the values I would like to collect in a separate column like so, what I want to create is something like this:
x y z column_indices values
6 7 1 1,2 6,7
5 4 2 3 2
1 3 2 1,3 1,2
What is the simplest way to do this in R?
Thanks!
In base R, we can use apply, split the column_indices on ',', convert them to integer and get the corresponding value from the row.
# For every row, read the comma-separated column positions stored in
# column_indices and collect the values found at those positions.
# Rows are indexed directly instead of going through apply(): apply() first
# converts the mixed-type data frame to a character matrix, which pads numeric
# columns to a common width (e.g. "  5" next to "100") and would leak that
# padding into the result.
df$values <- vapply(seq_len(nrow(df)), function(i) {
  # as.character() also covers column_indices being stored as a factor
  inds <- as.integer(
    strsplit(as.character(df$column_indices[i]), ",", fixed = TRUE)[[1]]
  )
  toString(unlist(df[i, inds], use.names = FALSE))
}, character(1))
df
# x y z column_indices values
#1 6 7 1 1,2 6, 7
#2 5 4 2 3 2
#3 1 3 2 1,3 1, 2
data
df <- structure(list(x = c(6L, 5L, 1L), y = c(7L, 4L, 3L), z = c(1L,
2L, 2L), column_indices = structure(c(1L, 3L, 2L), .Label = c("1,2",
"1,3", "3"), class = "factor")), class = "data.frame", row.names = c(NA, -3L))
One solution involving dplyr and tidyr could be:
# Tag every row with its own id so that rows sharing the same column_indices
# string are not merged into one group, and convert column_indices to
# character because strsplit() does not accept factors.
df %>%
  mutate(
    row_id = row_number(),
    column_indices = as.character(column_indices)
  ) %>%
  pivot_longer(-c(row_id, column_indices)) %>%
  group_by(row_id) %>%
  mutate(
    # keep the values whose position within the row is listed in column_indices
    values = toString(
      value[seq_len(n()) %in% strsplit(column_indices[1], ",")[[1]]]
    )
  ) %>%
  ungroup() %>%
  pivot_wider(names_from = "name", values_from = "value") %>%
  select(-row_id)
column_indices values x y z
<chr> <chr> <int> <int> <int>
1 1,2 6, 7 6 7 1
2 3 2 5 4 2
3 1,3 1, 2 1 3 2

Select rows based in rows of another data.frame

I have these following data.frames:
dt1
Id Mother Weight
1 elly 10
2 bina 20
3 sirce 30
4 tina 30
5 lina 40
and
dt2
Id Mother Weight sex
1 elly 10 M
2 bina 20 F
3 sirce 30 F
I would like to select the rows of dt1 whose Id does not appear in dt2, like this:
new.dt
Id Mother Weight sex
4 tina 30 NA
5 lina 40 NA
Here is one option with anti_join
library(dplyr)
# Add an all-NA sex column to dt1, then keep only the rows whose Id has no
# match in dt2
dt1 %>%
  mutate(sex = NA) %>%
  anti_join(dt2, by = "Id")
# Id Mother Weight sex
#1 4 tina 30 NA
#2 5 lina 40 NA
data
dt1 <- structure(list(Id = 1:5, Mother = c("elly", "bina", "sirce",
"tina", "lina"), Weight = c(10L, 20L, 30L, 30L, 40L)),
class = "data.frame", row.names = c(NA,
-5L))
dt2 <- structure(list(Id = 1:3, Mother = c("elly", "bina", "sirce"),
Weight = c(10L, 20L, 30L), sex = c("M", "F", "F")),
class = "data.frame", row.names = c(NA,
-3L))
# Keep the dt1 rows whose Id finds no match in dt2 and add sex = NA
transform(
  dt1[is.na(match(dt1$Id, dt2$Id)), ],
  sex = NA
)
# Id Mother Weight sex
#4 4 tina 30 NA
#5 5 lina 40 NA
# Full outer merge on the columns the two tables share, then keep the rows
# where sex is missing
d <- merge(x = dt1, y = dt2, all = TRUE)
d[which(is.na(d$sex)), ]
# Id Mother Weight sex
#4 4 tina 30 <NA>
#5 5 lina 40 <NA>

How to append group row into dataframe

I have this df1:
A B C
1 2 3
5 7 9
where A B C are columns names.
I have another df2 with one column:
A
1
2
3
4
I would like to append df2 for each column of df1, creating this final dataframe:
A B C
1 2 3
5 7 9
1 1 1
2 2 2
3 3 3
4 4 4
is it possible to do it?
# Append the values of df2 to every column of df1 and reset the row names
data.frame(
  sapply(df1, function(col) c(col, unlist(df2))),
  row.names = NULL
)
# A B C
#1 1 2 3
#2 5 7 9
#3 1 1 1
#4 2 2 2
#5 3 3 3
#6 4 4 4
DATA
df1 = structure(list(A = c(1L, 5L), B = c(2L, 7L), C = c(3L, 9L)), .Names = c("A",
"B", "C"), class = "data.frame", row.names = c(NA, -2L))
df2 = structure(list(A = 1:4), .Names = "A", class = "data.frame", row.names = c(NA,
-4L))
We can replicate df2 for the number of columns of df1, unname it, then rbind it.
# Repeat df2 once per column of df1 and drop the names before binding
rbind(
  df1,
  unname(rep(df2, times = ncol(df1)))
)
# A B C
# 1 1 2 3
# 2 5 7 9
# 3 1 1 1
# 4 2 2 2
# 5 3 3 3
# 6 4 4 4
Data:
df1 <- structure(list(A = c(1L, 5L), B = c(2L, 7L), C = c(3L, 9L)), .Names = c("A",
"B", "C"), class = "data.frame", row.names = c(NA, -2L))
df2 <- structure(list(A = 1:4), .Names = "A", row.names = c(NA, -4L), class = "data.frame")
We can use base R methods
# Repeat df2$A once per column of df1 (ncol(df1) instead of a hard-coded 3, so
# the snippet keeps working when df1 has a different number of columns), give
# the copies df1's column names and append them below df1
rbind(
  df1,
  setNames(
    as.data.frame(do.call(cbind, rep(list(df2$A), ncol(df1)))),
    names(df1)
  )
)
# A B C
#1 1 2 3
#2 5 7 9
#3 1 1 1
#4 2 2 2
#5 3 3 3
#6 4 4 4
data
df1 <- structure(list(A = c(1L, 5L), B = c(2L, 7L), C = c(3L, 9L)), .Names = c("A",
"B", "C"), class = "data.frame", row.names = c(NA, -2L))
df2 <- structure(list(A = 1:4), .Names = "A", class = "data.frame",
row.names = c(NA, -4L))
Here is a base R method with rbind, rep, and setNames:
# Repeat dat1 once per column of dat, rename the copies after dat's columns
# and append them below dat
rbind(
  dat,
  setNames(
    data.frame(rep(dat1, times = ncol(dat))),
    names(dat)
  )
)
A B C
1 1 2 3
2 5 7 9
3 1 1 1
4 2 2 2
5 3 3 3
6 4 4 4
Edit: it turns out data.frame isn't necessary:
rbind(
  dat,
  setNames(rep(dat1, times = ncol(dat)), names(dat))
)
will work.
data
dat <-
structure(list(A = c(1L, 5L), B = c(2L, 7L), C = c(3L, 9L)), .Names = c("A",
"B", "C"), class = "data.frame", row.names = c(NA, -2L))
dat1 <-
structure(list(A = 1:4), .Names = "A", row.names = c(NA, -4L),
class = "data.frame")
I just love R, here is yet another Base R solution but with mapply:
# mapply() pairs each column of df1 with the (recycled) single column of df2
data.frame(
  mapply(function(col, extra) c(col, extra), df1, df2)
)
Result:
A B C
1 1 2 3
2 5 7 9
3 1 1 1
4 2 2 2
5 3 3 3
6 4 4 4
Note:
No need to deal with column names, unlike almost all the other solutions. The key to why this works is that "mapply calls FUN for the values of ... [each element]
(re-cycled to the length of the longest ... [element])" (see ?mapply). In other words, df2$A is recycled to however many columns df1 has.
Data:
df1 = structure(list(A = c(1L, 5L), B = c(2L, 7L), C = c(3L, 9L)), .Names = c("A",
"B", "C"), class = "data.frame", row.names = c(NA, -2L))
df2 = structure(list(A = 1:4), .Names = "A", row.names = c(NA, -4L), class = "data.frame")
Data:
df1 <- data.frame(A=c(1,5),
B=c(2,7),
C=c(3,9))
df2 <- data.frame(A=c(1,2,3,4))
Solution:
# Replace df2 with a matrix holding one copy of df2$A per column of df1,
# labelled with df1's column names, then append it below df1
df2 <- matrix(
  rep(df2$A, times = ncol(df1)),
  ncol = ncol(df1),
  dimnames = list(NULL, colnames(df1))
)
rbind(df1, df2)
Result:
A B C
1 1 2 3
2 5 7 9
3 1 1 1
4 2 2 2
5 3 3 3
6 4 4 4
A solution from purrr, which uses map_dfc to loop through all columns in df1 to combine all the elements with df2$A.
library(purrr)
# Extend every column of df1 with the values of df2$A
map_dfc(df1, function(col) c(col, df2$A))
# A tibble: 6 x 3
A B C
<int> <int> <int>
1 1 2 3
2 5 7 9
3 1 1 1
4 2 2 2
5 3 3 3
6 4 4 4
Data
df1 <- structure(list(A = c(1L, 5L), B = c(2L, 7L), C = c(3L, 9L)), .Names = c("A",
"B", "C"), class = "data.frame", row.names = c(NA, -2L))
df2 <- structure(list(A = 1:4), .Names = "A", class = "data.frame",
row.names = c(NA, -4L))
By analogy with @useR's excellent Base R answer, here's a tidyverse solution:
library(purrr)
# Combine each column of df1 with the single column of df2
map2_df(.x = df1, .y = df2, .f = c)
A B C
1 1 2 3
2 5 7 9
3 1 1 1
4 2 2 2
5 3 3 3
6 4 4 4
Here are a few other (less desirable) options from when I first answered this question.
library(dplyr)
# Copy A into B and C so that df2 has the same columns as df1, then stack
bind_rows(
  df1,
  mutate(df2, B = A, C = A)
)
Or, if we want to dynamically get the number of columns and their names from df1:
# Select df2's first column once per column of df1 and rename the copies after
# df1's columns. drop = FALSE keeps the selection a data frame even when df1
# has a single column (otherwise `[` would return a bare vector).
bind_rows(
  df1,
  df2[, rep(1, ncol(df1)), drop = FALSE] %>% setNames(names(df1))
)
And one more Base R method:
# Same idea in base R. drop = FALSE keeps the repeated selection a data frame
# even when df1 has a single column (otherwise `[` would return a bare vector).
rbind(
  df1,
  setNames(df2[, rep(1, ncol(df1)), drop = FALSE], names(df1))
)
For the sake of completeness, here is data.table approach which doesn't require to handle column names:
library(data.table)
# Convert df1 to a data.table and extend each of its columns with df2$A
setDT(df1)[, lapply(.SD, function(col) c(col, df2$A))]
A B C
1: 1 2 3
2: 5 7 9
3: 1 1 1
4: 2 2 2
5: 3 3 3
6: 4 4 4
Note that the OP has described df2 to consist only of one column.
There is also a base R version of this approach:
# Extend each column of df1 with df2$A and rebuild the data frame
data.frame(
  lapply(df1, function(col) c(col, df2$A))
)
A B C
1 1 2 3
2 5 7 9
3 1 1 1
4 2 2 2
5 3 3 3
6 4 4 4
This is similar to d.b's approach but doesn't require dealing with column names.

Resources